Python source code examples: sklearn.preprocessing.binarize()
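Before the project examples, here is a minimal sketch of the function itself: sklearn.preprocessing.binarize sets every value strictly greater than threshold to 1 and everything else to 0. The input array below is made up for illustration.

import numpy as np
from sklearn.preprocessing import binarize

X = np.array([[0.2, 1.5, -0.3],
              [3.0, 0.0, 0.7]])
# values > 0.5 become 1, the rest become 0
print(binarize(X, threshold=0.5))
# [[0. 1. 0.]
#  [1. 0. 1.]]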

Example 1
def transform(self, X):
        """Binarize each element of X

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data to binarize, element by element.
        """
        df = True
        try:
            index = X.index
            columns = X.columns
        except AttributeError:
            df = False

        X_ = binarize(X, threshold=self.threshold, copy=self.copy)

        if df:
            return pd.DataFrame(data=X_, index=index, columns=columns)
        else:
            return X_ 
Example 2
def predict(self, data):
        """
        1) Predicts an outcome given facts
        2) Predicts the probability that the prediction is correct
            2.1) Probabilities range from 0 to 1, where x < 0.5 means False
            2.2) The model only returns the probability that a fact is 1
            2.3) Therefore the probability that a fact is 0 is 1 - x when x < 0.5

        :param data: numpy([1, 0, 0, ...])
        :return: np.array([...])
        """
        if self.model is None:
            self.model = Load.load_binary("multi_class_svm_model.bin")
        data = binarize([data], threshold=0)
        probabilities = self.model.predict_proba(data)[0]
        predictions = self.model.predict(data)
        for i in range(len(probabilities)):
            prediction = predictions[0][i]
            if prediction == 0:
                probabilities[i] = 1 - probabilities[i]
            probabilities[i] = format(probabilities[i], '.2f')
        return predictions, probabilities
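The loop above implements the rule spelled out in the docstring: the classifier reports P(label == 1), so when the predicted label is 0 the reported confidence is 1 - p. A standalone sketch of that arithmetic, with made-up numbers:

import numpy as np

probabilities = np.array([0.90, 0.30, 0.75])  # P(label == 1) for each output
predictions = np.array([1, 0, 1])             # predicted labels
# flip the probability wherever the predicted label is 0
confidence = np.where(predictions == 0, 1 - probabilities, probabilities)
print(confidence)  # confidence in each prediction: 0.9, 0.7, 0.75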
Example 3
def reshape_dataset(self):
        """
        Restructure the data to accommodate the sklearn library
        1) Reshape the x data
            1.1) 2D numpy array: [
                    [precedent #1 facts],
                    [precedent #2 facts],
                    ...
                ]
        2) Reshape the y data
        :return: x_total <#1.1>, y_total <#2.4>
        """

        # 1
        x_total = np.array(
            [np.reshape(precedent['facts_vector'], (len(precedent['facts_vector']),))
             for precedent in self.data_set])
        x_total = binarize(x_total, threshold=0)

        # 2
        y_list = []
        for precedent in self.data_set:
            y_list.append(self.__classify_precedent(precedent))
        y_total = np.array(y_list)
        return x_total, y_total 
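For context, the binarize call in this example simply turns fact counts into presence flags: anything greater than 0 becomes 1. A tiny illustration with made-up data:

import numpy as np
from sklearn.preprocessing import binarize

x_total = np.array([[0, 2, 0],
                    [1, 0, 3]])
print(binarize(x_total, threshold=0))
# [[0 1 0]
#  [1 0 1]]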
Example 4
def test_preprocessing_assignment(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        original_columns = df.data.columns
        df['sepal length (cm)'] = df['sepal length (cm)'].preprocessing.binarize(threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)
        binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6)
        expected = np.hstack([binarized.T, iris.data[:, 1:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns)

        # recreate data
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        target_columns = ['sepal length (cm)', 'sepal width (cm)']
        df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)
        binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
        expected = np.hstack([binarized, iris.data[:, 2:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns) 
Example 5
def _fit_data(self, X):
        """Binarize the data for each column separately.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X_transformed : array-like
            Returns the data where, in each column, the labels are
            binarized.

        """

        if self.binarize is not None:
            X = binarize(X, threshold=self.binarize)

        for i in range(X.shape[1]):

            # initialise binarizer and save
            binarizer = LabelBinarizer()

            if self.binarize is not None:
                binarizer.classes_ = np.array([0, 1])

            # fit the data to the binarizer
            binarizer.fit(X[:, i])

            self._binarizers.append(binarizer)

        return self._transform_data(X) 
Example 6
def _transform_data(self, X):
        """Binarize the data for each column separately."""

        if self._binarizers == []:
            raise NotFittedError()

        if self.binarize is not None:
            X = binarize(X, threshold=self.binarize)

        if len(self._binarizers) != X.shape[1]:
            raise ValueError(
                "Expected input with %d features, got %d instead" %
                (len(self._binarizers), X.shape[1]))

        X_parts = []

        for i in range(X.shape[1]):

            X_i = self._binarizers[i].transform(X[:, i])

            # sklearn returns ndarray with shape (samples, 1) on binary input.
            if self._binarizers[i].classes_.shape[0] == 1:
                X_parts.append(1 - X_i)
            elif self._binarizers[i].classes_.shape[0] == 2:
                X_parts.append(1 - X_i)
                X_parts.append(X_i)
            else:
                X_parts.append(X_i)

        return np.concatenate(X_parts, axis=1) 
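The 1 - X_i / X_i stacking above compensates for a sklearn convention: LabelBinarizer returns a single (n_samples, 1) column for binary input instead of one column per class. A short demonstration with made-up values:

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
print(lb.fit_transform(np.array([0, 1, 0, 1])))
# [[0]
#  [1]
#  [0]
#  [1]]
print(lb.classes_)  # [0 1]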
Example 7
def __init__(self, alpha=1.0, binarize=0.0, fit_prior=True,
                 class_prior=None):

        self.alpha = alpha
        self.binarize = binarize
        self.fit_prior = fit_prior
        self.class_log_prior_ = class_prior
        self.class_prior = class_prior

        self._binarizers = [] 
Example 8
def __init__(self,
                 init='jaro',
                 max_iter=100,
                 binarize=binarize,
                 atol=10e-5):
        self.init = init
        self.max_iter = max_iter
        self.binarize = binarize
        self.atol = atol

        self._binarizers = [] 
Example 9
def g(a):
    from sklearn.preprocessing import binarize
    return f(a) 
Example 10
def test_run_isolated_from_function_from_source():
    args = [1,3,7]
    f_source = b'def f(a):\n    return a+1\n'
    f1 = featurehub.util.get_function(f_source)
    g_source = b'def f(a):\n    return a+1\n\ndef g(a):\n    from sklearn.preprocessing import binarize\n    return f(a)\n'
    g1 = featurehub.util.get_function(g_source)
    for arg in args:
        assert f1(arg) == featurehub.util.run_isolated(f1, arg)
        assert g1(arg) == featurehub.util.run_isolated(g1, arg) 
Example 11
def test_run_isolated_from_function2_from_source():
    args = [1,3,7]
    f_source = b'def f(a):\n    return a+1\n'
    f1 = featurehub.util.get_function2(f_source)
    g_source = b'def f(a):\n    return a+1\n\ndef g(a):\n    from sklearn.preprocessing import binarize\n    return f(a)\n'
    g1 = featurehub.util.get_function2(g_source)
    for arg in args:
        assert f1(arg) == featurehub.util.run_isolated(f1, arg)
        assert g1(arg) == featurehub.util.run_isolated(g1, arg)

Example 12
def hi_lo_age(dataset):
    from sklearn.preprocessing import binarize
    cutoff = 30
    return binarize(dataset["users"]["age"].values.reshape(-1, 1), threshold=cutoff)
Example 13
def test_binarize(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.preprocessing.binarize()
        expected = pp.binarize(iris.data)

        self.assertIsInstance(result, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(result.data.values, expected)
        tm.assert_index_equal(result.columns, df.data.columns)

        result = df.preprocessing.binarize(threshold=5)
        expected = pp.binarize(iris.data, threshold=5)

        self.assertIsInstance(result, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(result.data.values, expected)
        tm.assert_index_equal(result.columns, df.data.columns)

        s = df['sepal length (cm)']
        self.assertIsInstance(s, pdml.ModelSeries)
        result = s.preprocessing.binarize()
        expected = pp.binarize(iris.data[:, 0].reshape(-1, 1))

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected.flatten())
        self.assertEqual(result.name, 'sepal length (cm)')

        result = s.preprocessing.binarize(threshold=6)
        expected = pp.binarize(iris.data[:, 0].reshape(-1, 1), threshold=6)

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected.flatten())
        self.assertEqual(result.name, 'sepal length (cm)') 
Example 14
def score(self, tpr_threshold=None, cutoff_threshold=None):
        """
        Calculates the scoring metrics using a cutoff threshold that attains a true
        positive rate equal to or greater than the desired tpr_threshold

        Args
        ----
        tpr_threshold : float
            Minimum true positive rate to achieve
        cutoff_threshold : float
            As an alternative to using a minimum true positive rate, a probability cutoff threshold
            can be specified to calculate the scoring
        """

        if tpr_threshold is None and cutoff_threshold is None:
            raise ValueError('Either tpr_threshold or cutoff_threshold must be specified')

        scores = OrderedDict((k, []) for (k, v) in self.scoring.items())
        self.thresholds_ = []
        self.tpr_ = []
        self.fpr_ = []
        self.roc_thresholds_ = []

        for idx in self.test_idx_:
            # split fold
            y_true = self.y_true[idx]
            y_pred_ = self.y_pred_[idx, :]

            # get roc curve data
            fpr, tpr, thresholds = roc_curve(
                y_true, y_pred_[:, self.positive])

            self.fpr_.append(fpr)
            self.tpr_.append(tpr)
            self.roc_thresholds_.append(thresholds)

            # calculate cutoff that produces tpr >= threshold
            if cutoff_threshold is None:
                opt_threshold = thresholds[np.where(tpr >= tpr_threshold)[0].min()]
                self.thresholds_ = np.append(self.thresholds_, opt_threshold)
            else:
                opt_threshold = cutoff_threshold

            # calculate performance metrics
            y_pred_opt = binarize(y_pred_, threshold=opt_threshold)

            # calculate scores
            for name, score_func in self.scoring.items():
                scores[name] = np.append(scores[name], score_func(y_true, y_pred_opt[:, self.positive]))

        return scores
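The binarize call inside score converts the probability matrix into hard 0/1 predictions at the chosen cutoff. A hedged sketch with made-up probabilities:

import numpy as np
from sklearn.preprocessing import binarize

y_pred_ = np.array([[0.8, 0.2],
                    [0.4, 0.6],
                    [0.7, 0.3]])
opt_threshold = 0.5
# probabilities above the cutoff become 1, the rest become 0
print(binarize(y_pred_, threshold=opt_threshold))
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]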