Python source code examples: sklearn.preprocessing.binarize()
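All of the examples below build on sklearn.preprocessing.binarize(X, threshold=0.0, copy=True), which maps every element strictly greater than threshold to 1 and everything else to 0. Here is a minimal, self-contained sketch of that behavior (the input values are invented purely for illustration). Note that recent scikit-learn releases (1.0 and later) treat threshold as keyword-only, so older snippets below that pass it positionally (Examples 12 and 14) would need threshold= today.

import numpy as np
from sklearn.preprocessing import binarize

X = np.array([[0.2, 1.5, -0.3],
              [3.0, 0.0, 0.7]])
# Elements strictly greater than 0.5 map to 1, everything else to 0.
print(binarize(X, threshold=0.5))
# [[0. 1. 0.]
#  [1. 0. 1.]]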
Example 1
def transform(self, X):
    """Binarize each element of X
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        The data to binarize, element by element.
    """
    df = True
    try:
        index = X.index
        columns = X.columns
    except AttributeError:
        df = False
    X_ = binarize(X, threshold=self.threshold, copy=self.copy)
    if df:
        return pd.DataFrame(data=X_, index=index, columns=columns)
    else:
        return X_
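Example 1 is a transform method that keeps a DataFrame's index and column labels while binarizing its values, falling back to a plain array otherwise. The following is a hedged, self-contained sketch of that pattern as a standalone class; the class name DataFrameBinarizer is hypothetical and not part of the original source.

import pandas as pd
from sklearn.preprocessing import binarize

class DataFrameBinarizer:
    """Hypothetical minimal transformer illustrating the pattern in Example 1."""
    def __init__(self, threshold=0.0, copy=True):
        self.threshold = threshold
        self.copy = copy

    def transform(self, X):
        df = True
        try:
            index = X.index        # present only on DataFrames
            columns = X.columns
        except AttributeError:
            df = False
        X_ = binarize(X, threshold=self.threshold, copy=self.copy)
        if df:
            return pd.DataFrame(data=X_, index=index, columns=columns)
        return X_

# Usage: a DataFrame keeps its labels, a plain array would come back as an array.
frame = pd.DataFrame({'a': [0.1, 2.0], 'b': [5.0, -1.0]}, index=['r1', 'r2'])
print(DataFrameBinarizer(threshold=1.0).transform(frame))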
Example 2
def predict(self, data):
    """
    1) Predicts an outcome given facts
    2) Predicts probability that prediction is correct
        2.1) Range goes from [0-1] where x < 0.5 is False
        2.2) The model only returns the probability that a fact is 1
        2.3) therefore, to get the probability that a fact is 0, we use
             1 - x when x < 0.5
    :param data: numpy([1, 0, 0, ...])
    :return: np.array([...])
    """
    if self.model is None:
        self.model = Load.load_binary("multi_class_svm_model.bin")
    data = binarize([data], threshold=0)
    probabilities = self.model.predict_proba(data)[0]
    predictions = self.model.predict(data)
    for i in range(len(probabilities)):
        prediction = predictions[0][i]
        if prediction == 0:
            probabilities[i] = 1 - probabilities[i]
        probabilities[i] = format(probabilities[i], '.2f')
    return self.model.predict(data), probabilities
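The docstring in Example 2 describes a small but useful convention: predict_proba reports P(label == 1), so when the hard prediction is 0 the confidence in that prediction is 1 - p. A tiny numeric sketch of that adjustment, with invented values:

import numpy as np

# Invented per-label values: P(label == 1) from predict_proba and the hard predictions
probabilities = np.array([0.91, 0.22, 0.65])
predictions = np.array([1, 0, 1])

# Confidence in the label actually predicted:
# when the prediction is 0, the confidence is 1 - P(label == 1).
confidence = np.where(predictions == 1, probabilities, 1 - probabilities)
print(confidence)   # [0.91 0.78 0.65]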
Example 3
def reshape_dataset(self):
    """
    Restructure the data to accommodate the sklearn library
    1) Reshape the x data
        1.1) 2D numpy array: [
            [precedent #1 facts],
            [precedent #2 facts],
            ...
        ]
    2) Reshape the y data
    :return: x_total <#1.1>, y_total <#2.4>
    """
    # 1
    x_total = np.array(
        [np.reshape(precedent['facts_vector'], (len(precedent['facts_vector']),))
         for precedent in self.data_set])
    x_total = binarize(x_total, threshold=0)
    # 2
    y_list = []
    for precedent in self.data_set:
        y_list.append(self.__classify_precedent(precedent))
    y_total = np.array(y_list)
    return x_total, y_total
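Example 3 stacks each precedent's facts vector into one 2D matrix and then binarizes it at threshold 0, so any non-zero fact count becomes a simple presence flag. A short sketch of just that step, with made-up fact vectors standing in for precedent['facts_vector']:

import numpy as np
from sklearn.preprocessing import binarize

facts = [np.array([0, 2, 0, 1]), np.array([3, 0, 0, 0])]   # invented data

x_total = np.array([np.reshape(v, (len(v),)) for v in facts])
x_total = binarize(x_total, threshold=0)   # counts become 0/1 presence flags
print(x_total)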
Example 4
def test_preprocessing_assignment(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    original_columns = df.data.columns
    df['sepal length (cm)'] = df['sepal length (cm)'].preprocessing.binarize(threshold=6)
    self.assertIsInstance(df, pdml.ModelFrame)
    binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6)
    expected = np.hstack([binarized.T, iris.data[:, 1:]])
    self.assert_numpy_array_almost_equal(df.data.values, expected)
    tm.assert_index_equal(df.data.columns, original_columns)

    # recreate data
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    target_columns = ['sepal length (cm)', 'sepal width (cm)']
    df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6)
    self.assertIsInstance(df, pdml.ModelFrame)
    binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
    expected = np.hstack([binarized, iris.data[:, 2:]])
    self.assert_numpy_array_almost_equal(df.data.values, expected)
    tm.assert_index_equal(df.data.columns, original_columns)
Example 5
def _fit_data(self, X):
    """Binarize the data for each column separately.
    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    Returns
    -------
    X_transformed : array-like
        Returns the data where, in each column, the labels are
        binarized.
    """
    if self.binarize is not None:
        X = binarize(X, threshold=self.binarize)
    for i in range(X.shape[1]):
        # initialise binarizer and save
        binarizer = LabelBinarizer()
        if self.binarize:
            binarizer.classes_ = np.array([0, 1])
        # fit the data to the binarizer
        binarizer.fit(X[:, i])
        self._binarizers.append(binarizer)
    return self._transform_data(X)
Example 6
def _transform_data(self, X):
    """Binarize the data for each column separately."""
    if self._binarizers == []:
        raise NotFittedError()
    if self.binarize is not None:
        X = binarize(X, threshold=self.binarize)
    if len(self._binarizers) != X.shape[1]:
        raise ValueError(
            "Expected input with %d features, got %d instead" %
            (len(self._binarizers), X.shape[1]))
    X_parts = []
    for i in range(X.shape[1]):
        X_i = self._binarizers[i].transform(X[:, i])
        # sklearn returns ndarray with shape (samples, 1) on binary input.
        if self._binarizers[i].classes_.shape[0] == 1:
            X_parts.append(1 - X_i)
        elif self._binarizers[i].classes_.shape[0] == 2:
            X_parts.append(1 - X_i)
            X_parts.append(X_i)
        else:
            X_parts.append(X_i)
    return np.concatenate(X_parts, axis=1)
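Examples 5 and 6 fit one LabelBinarizer per column. For a two-class column, LabelBinarizer.transform returns a single 0/1 column rather than two one-hot columns, which is why Example 6 appends both 1 - X_i and X_i. A small sketch of that two-class behavior (data invented):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

column = np.array([0, 1, 1, 0])
lb = LabelBinarizer().fit(column)

X_i = lb.transform(column)            # shape (4, 1): a single column for a binary label
one_hot = np.hstack([1 - X_i, X_i])   # the stacking trick used in Example 6 for two classes
print(one_hot)
# [[1 0]
#  [0 1]
#  [0 1]
#  [1 0]]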
Example 7
def __init__(self, alpha=1.0, binarize=0.0, fit_prior=True,
             class_prior=None):
    self.alpha = alpha
    self.binarize = binarize
    self.fit_prior = fit_prior
    self.class_log_prior_ = class_prior
    self.class_prior = class_prior
    self._binarizers = []
Example 8
def __init__(self,
             init='jaro',
             max_iter=100,
             binarize=binarize,
             atol=10e-5):
    self.init = init
    self.max_iter = max_iter
    self.binarize = binarize
    self.atol = atol
    self._binarizers = []
Example 9
def g(a):
    from sklearn.preprocessing import binarize
    return f(a)
Example 10
def test_run_isolated_from_function_from_source():
    args = [1, 3, 7]
    f_source = b'def f(a):\n return a+1\n'
    f1 = featurehub.util.get_function(f_source)
    g_source = b'def f(a):\n return a+1\n\ndef g(a):\n from sklearn.preprocessing import binarize\n return f(a)\n'
    g1 = featurehub.util.get_function(g_source)
    for arg in args:
        assert f1(arg) == featurehub.util.run_isolated(f1, arg)
        assert g1(arg) == featurehub.util.run_isolated(g1, arg)
Example 11
def test_run_isolated_from_function2_from_source():
    args = [1, 3, 7]
    f_source = b'def f(a):\n return a+1\n'
    f1 = featurehub.util.get_function2(f_source)
    g_source = b'def f(a):\n return a+1\n\ndef g(a):\n from sklearn.preprocessing import binarize\n return f(a)\n'
    g1 = featurehub.util.get_function2(g_source)
    for arg in args:
        assert f1(arg) == featurehub.util.run_isolated(f1, arg)
        assert g1(arg) == featurehub.util.run_isolated(g1, arg)
# ------------------------------------------------------------------------------
# Test compute_dataset_hash
Example 12
def hi_lo_age(dataset):
    from sklearn.preprocessing import binarize
    cutoff = 30
    return binarize(dataset["users"]["age"].values.reshape(-1, 1), cutoff)
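Example 12 is a feature function that turns an age column into a high/low flag at a cutoff of 30. Below is a hedged usage sketch with an invented users table; the real dataset structure is project-specific, and threshold is passed by keyword here for compatibility with current scikit-learn.

import pandas as pd
from sklearn.preprocessing import binarize

# Invented stand-in for the project's dataset dict
dataset = {"users": pd.DataFrame({"age": [17, 30, 45, 62]})}

cutoff = 30
flags = binarize(dataset["users"]["age"].values.reshape(-1, 1), threshold=cutoff)
print(flags.ravel())   # ages strictly above the cutoff map to 1, the rest to 0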
Example 13
def test_binarize(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.preprocessing.binarize()
    expected = pp.binarize(iris.data)
    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(result.data.values, expected)
    tm.assert_index_equal(result.columns, df.data.columns)

    result = df.preprocessing.binarize(threshold=5)
    expected = pp.binarize(iris.data, threshold=5)
    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(result.data.values, expected)
    tm.assert_index_equal(result.columns, df.data.columns)

    s = df['sepal length (cm)']
    self.assertIsInstance(s, pdml.ModelSeries)
    result = s.preprocessing.binarize()
    expected = pp.binarize(iris.data[:, 0].reshape(-1, 1))
    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected.flatten())
    self.assertEqual(result.name, 'sepal length (cm)')

    result = s.preprocessing.binarize(threshold=6)
    expected = pp.binarize(iris.data[:, 0].reshape(-1, 1), threshold=6)
    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected.flatten())
    self.assertEqual(result.name, 'sepal length (cm)')
Example 14
def score(self, tpr_threshold=None, cutoff_threshold=None):
    """
    Calculates the scoring metrics using a cutoff threshold that attains a true positive rate
    equal to or greater than the desired tpr_threshold
    Args
    ----
    tpr_threshold : float
        Minimum true positive rate to achieve
    cutoff_threshold : float
        As an alternative to using a minimum true positive rate, a probability cutoff threshold
        can be specified to calculate the scoring
    """
    if tpr_threshold is None and cutoff_threshold is None:
        raise ValueError('Either tpr_threshold or cutoff_threshold must be specified')
    scores = OrderedDict((k, []) for (k, v) in self.scoring.items())
    self.thresholds_ = []
    self.tpr_ = []
    self.fpr_ = []
    self.roc_thresholds_ = []
    for idx in self.test_idx_:
        # split fold
        y_true = self.y_true[idx]
        y_pred_ = self.y_pred_[idx, :]
        # get roc curve data
        fpr, tpr, thresholds = roc_curve(
            y_true, y_pred_[:, self.positive])
        self.fpr_.append(fpr)
        self.tpr_.append(tpr)
        self.roc_thresholds_.append(thresholds)
        # calculate cutoff that produces tpr >= threshold
        if cutoff_threshold is None:
            opt_threshold = thresholds[np.where(tpr >= tpr_threshold)[0].min()]
            self.thresholds_ = np.append(self.thresholds_, opt_threshold)
        else:
            opt_threshold = cutoff_threshold
        # calculate performance metrics
        y_pred_opt = binarize(y_pred_, opt_threshold)
        # calculate scores
        for name, score_func in self.scoring.items():
            score_func = self.scoring[name]
            scores[name] = np.append(scores[name], score_func(y_true, y_pred_opt[:, self.positive]))
    return scores
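Example 14 picks the smallest ROC threshold whose true positive rate meets the target and then binarizes the probability matrix at that operating point. Below is a compact sketch of just that thresholding step on one invented fold; the labels, scores, and the tpr_target value are all made up for illustration, and threshold is passed by keyword for compatibility with current scikit-learn.

import numpy as np
from sklearn.metrics import roc_curve
from sklearn.preprocessing import binarize

# Invented fold: true labels and per-class probabilities from predict_proba
y_true = np.array([0, 0, 1, 1, 1, 0])
y_prob = np.array([[0.8, 0.2], [0.4, 0.6], [0.3, 0.7],
                   [0.1, 0.9], [0.45, 0.55], [0.7, 0.3]])

fpr, tpr, thresholds = roc_curve(y_true, y_prob[:, 1])

# Smallest-index threshold whose TPR reaches the target, as in Example 14
tpr_target = 0.9
opt_threshold = thresholds[np.where(tpr >= tpr_target)[0].min()]

# Hard predictions at that operating point
y_pred_opt = binarize(y_prob, threshold=opt_threshold)
print(opt_threshold, y_pred_opt[:, 1])

One design detail worth noting: binarize uses a strict '>' comparison, so a sample whose score is exactly equal to the chosen threshold is mapped to 0.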