Python source code examples: sklearn.preprocessing.LabelBinarizer()
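The examples below show how different open-source projects use sklearn.preprocessing.LabelBinarizer, which turns a vector of class labels into a one-vs-all indicator matrix (one column per class; with exactly two classes it emits a single 0/1 column). As a quick orientation, here is a minimal self-contained sketch of the basic fit/transform round trip; the labels are purely illustrative:

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
onehot = lb.fit_transform(['cat', 'dog', 'bird', 'dog'])  # one column per class, shape (4, 3)
print(lb.classes_)                                        # ['bird' 'cat' 'dog']
print(lb.inverse_transform(onehot))                       # recovers the original labels
binary = LabelBinarizer().fit_transform([0, 1, 1, 0])     # two classes -> single column, shape (4, 1)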
Example 1
def mmb_evaluate_model(self):
    """
    Returns scores from cross validation evaluation on the malicious / benign classifier
    """
    predictive_features = self.features['predictive_features']
    self.clf_X = self.modeldata[predictive_features].values
    self.clf_y = np.array(self.modeldata['label'])

    X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y, test_size=0.2, random_state=0)
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
    eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
    eval_cls.fit(X_train, y_train)

    recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
    precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
    accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
    f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

    return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall}
Example 2
def test_sklearn_labelbin(self):
    m = np.array([1.0, .81, .85, .81, .85, .81])
    u = np.array([1.0, .23, .50, .23, .30, 0.13])

    # Create the train dataset.
    X_train, true_links = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)

    binarizer = LabelBinarizer()
    binarizer.fit(X_train.iloc[:, 0])
    assert len(binarizer.classes_) == 1

    binarizer.classes_ = np.array([0, 1])
    assert len(binarizer.classes_) == 2

    binarizer.transform(X_train.iloc[:, 1])
    assert len(binarizer.classes_) == 2
Example 3
def formatClass(rootFile, Cl):
    import sklearn.preprocessing as pp
    print('==========================================================================\n')
    print(' Running basic TensorFlow. Creating class data in binary form...')
    Cl2 = pp.LabelBinarizer().fit_transform(Cl)

    import matplotlib.pyplot as plt
    plt.hist([float(x) for x in Cl], bins=np.unique([float(x) for x in Cl]), edgecolor="black")
    plt.xlabel('Class')
    plt.ylabel('Occurrences')
    plt.title('Class distribution')
    plt.savefig(rootFile + '_ClassDistrib.png', dpi=160, format='png')  # Save plot
    if tfDef.plotClassDistribTF == True:
        print(' Plotting class distribution \n')
        plt.show()
    return Cl2

#********************************************************************************
Example 4
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
Example 5
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIOSE-encoded sequences.
    It computes token-level metrics and discards 'O' labels.

    :param y_true:
    :param y_pred:
    :return:
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(y_true)
    y_pred_combined = lb.transform(y_pred)

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {
        cls: idx for idx, cls in enumerate(lb.classes_)
    }

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset
    )
Example 6
def eval(self, test_x, test_y, crf_model):
    tagger = pycrfsuite.Tagger()
    tagger.open(crf_model)

    y_pred = []
    for feat_list in test_x:
        preds = tagger.tag(feat_list)
        y_pred.append(preds)

    lb = LabelBinarizer()
    y_true_all = lb.fit_transform(list(chain.from_iterable(test_y)))
    y_pred_all = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    print(classification_report(
        y_true_all,
        y_pred_all,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
        digits=5
    ))
Example 7
def train(self, x, y):
    """
    Train multiple estimators, one per class, each distinguishing that class
    from the rest (one-vs-rest).

    Args:
        x (numpy.ndarray): input points
        y (numpy.ndarray): input labels

    Raises:
        Exception: given all data points are assigned to the same class,
            the prediction would be boring
    """
    self.label_binarizer_ = LabelBinarizer(neg_label=0)
    Y = self.label_binarizer_.fit_transform(y)
    self.classes = self.label_binarizer_.classes_
    columns = (np.ravel(col) for col in Y.T)
    self.estimators = []
    for _, column in enumerate(columns):
        unique_y = np.unique(column)
        if len(unique_y) == 1:
            raise Exception("given all data points are assigned to the same class, "
                            "the prediction would be boring.")
        estimator = self.estimator_cls(*self.params)
        estimator.fit(x, column)
        self.estimators.append(estimator)
Example 8
def __init__(self, feature_vector_size, label_words):
    self.ann = cv2.ml.ANN_MLP_create()

    # Number of centroids used to build the feature vectors
    input_size = feature_vector_size
    # Number of models to recognize
    output_size = len(label_words)
    # Applying Heaton rules
    hidden_size = (input_size * (2 / 3)) + output_size
    nn_config = np.array([input_size, hidden_size, output_size], dtype=np.uint8)

    self.label_words = label_words
    self.ann.setLayerSizes(np.array(nn_config))
    # Symmetrical sigmoid as activation function
    self.ann.setActivationFunction(cv2.ml.ANN_MLP_SIGMOID_SYM)
    # Map models as tuples of probabilities
    self.le = preprocessing.LabelBinarizer()
    self.le.fit(label_words)  # Label words are ['dress', 'footwear', 'backpack']
Example 9
def test_conversion_with_sparse_y(self):
    """Tests conversion of a model that's fitted with y values in a sparse format."""
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        self.iris_X, self.iris_y, test_size=0.2, train_size=0.8
    )

    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer(sparse_output=True)
    binarized_y = lb.fit_transform(y_train)

    sklearn_model = KNeighborsClassifier(algorithm="brute")
    sklearn_model.fit(X_train, binarized_y)

    self.assertRaises(ValueError, sklearn.convert, sklearn_model)
Example 10
def fit(self, X, y):
    """
    :param X: shape = [n_samples, n_features]
    :param y: shape = [n_samples]
    :return: self
    """
    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes = labelbin.classes_

    self.class_count = np.zeros(Y.shape[1], dtype=np.float64)
    self.feature_count = np.zeros((Y.shape[1], X.shape[1]),
                                  dtype=np.float64)
    self.feature_count += Y.T @ X
    self.class_count += Y.sum(axis=0)

    smoothed_fc = self.feature_count + self.alpha
    smoothed_cc = smoothed_fc.sum(axis=1)
    self.feature_log_prob = (np.log(smoothed_fc) -
                             np.log(smoothed_cc.reshape(-1, 1)))
    return self
Example 11
def encode_labels(self, label_dict, srcids):
    flat_labels = ['O']
    if self.use_brick_flag:
        with open('brick/tags.json', 'r') as fp:
            brick_tags = json.load(fp)
        flat_labels += ['B_' + tag for tag in brick_tags] + \
                       ['I_' + tag for tag in brick_tags]
    flat_labels += reduce(adder, [reduce(adder, label_dict[srcid].values()) for srcid in srcids])
    self.le = LabelBinarizer().fit(flat_labels)

    stack = []
    for srcid in srcids:
        labels = label_dict[srcid]
        sentences = self.sentence_dict[srcid]
        for metadata_type in self.sentence_dict[srcid].keys():
            labels = label_dict[srcid][metadata_type]
            if len(labels) == 0:
                # No labels for this field: emit an all-zero block with one
                # column per known class.
                encoded = np.zeros((self.max_len, len(self.le.classes_)))
            else:
                encoded = self.le.transform(labels)
                encoded = np.vstack([encoded, np.zeros(
                    (self.max_len - encoded.shape[0],
                     encoded.shape[1]))])
            stack.append(encoded)
    return np.stack(stack)
Example 12
def get_mnist_data():
    """Loads the scikit-learn digits data set (8x8 digit images) into memory.

    Returns
    -------
    X : array-like, shape=[n_samples, n_features]
        Training data for the digits data set.

    y : array-like, shape=[n_samples,]
        Labels for the digits data set.
    """
    digits = load_digits()
    X, y = digits.data, digits.target
    y = LabelBinarizer().fit_transform(y)
    return X, y
Example 13
def __init__(self, n_hidden=20, alpha=0.5, rbf_width=1.0,
             activation_func='tanh', activation_args=None,
             user_components=None, regressor=None,
             binarizer=LabelBinarizer(neg_label=-1, pos_label=1),
             random_state=None):
    super(ELMClassifier, self).__init__(n_hidden=n_hidden,
                                        alpha=alpha,
                                        random_state=random_state,
                                        activation_func=activation_func,
                                        activation_args=activation_args,
                                        user_components=user_components,
                                        rbf_width=rbf_width,
                                        regressor=regressor)
    self.classes_ = None
    self.binarizer = binarizer
Example 14
def make_xgboost_dataframe_mapper(dtypes, missing_value_aware=True):
    """Construct a DataFrameMapper for feeding complex data into an XGBModel.

    Parameters
    ----------
    dtypes : mapping of column name to dtype (for example, ``df.dtypes``)

    missing_value_aware : boolean
        If true, use missing value aware transformers.

    Returns
    -------
    DataFrameMapper
    """
    features = list()
    for column, dtype in dtypes.items():
        if _is_categorical(dtype):
            features.append(([column], PMMLLabelBinarizer(sparse_output=True) if missing_value_aware else LabelBinarizer(sparse_output=True)))
        else:
            features.append(([column], None))
    return DataFrameMapper(features)
Example 15
def _check_X_y(self, X, y):
    # helpful error message for sklearn < 1.17
    is_2d = hasattr(y, 'shape') and len(y.shape) > 1 and y.shape[1] >= 2

    if is_2d or type_of_target(y) != 'binary':
        raise TypeError("Only binary targets supported. For training "
                        "multiclass or multilabel models, you may use the "
                        "OneVsRest or OneVsAll metaestimators in "
                        "scikit-learn.")

    X, Y = check_X_y(X, y, dtype=np.double, accept_sparse='csc',
                     multi_output=False)

    self.label_binarizer_ = LabelBinarizer(pos_label=1, neg_label=-1)
    y = self.label_binarizer_.fit_transform(Y).ravel().astype(np.double)
    return X, y
Example 16
def test_model_label_binariser_default(self):
    X = np.array([1, 2, 6, 4, 2])
    model = LabelBinarizer().fit(X)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn label binariser",
        [("input", Int64TensorType([None]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnLabelBinariserDefault",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 17
def test_model_label_binariser_neg_label(self):
    X = np.array([1, 2, 6, 4, 2])
    model = LabelBinarizer(neg_label=-101).fit(X)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn label binariser",
        [("input", Int64TensorType([None]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnLabelBinariserNegLabel",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 18
def test_model_label_binariser_pos_label(self):
    X = np.array([1, 2, 6, 4, 2])
    model = LabelBinarizer(pos_label=123).fit(X)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn label binariser",
        [("input", Int64TensorType([None]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnLabelBinariserPosLabel",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 19
def test_model_label_binariser_neg_pos_label(self):
    X = np.array([1, 2, 6, 4, 2])
    model = LabelBinarizer(neg_label=10, pos_label=20).fit(X)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn label binariser",
        [("input", Int64TensorType([None]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnLabelBinariserNegPosLabel",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 20
def test_model_label_binariser_binary_labels(self):
    X = np.array([1, 0, 0, 0, 1])
    model = LabelBinarizer().fit(X)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn label binariser",
        [("input", Int64TensorType([None]))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnLabelBinariserBinaryLabels",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 21
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer,
                  pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example 22
def test_LabelBinarizer(self):
    arr = np.array([1, 2, 3, 2])
    s = pdml.ModelSeries(arr, index=['a', 'b', 'c', 'd'])

    mod1 = s.pp.LabelBinarizer()
    s.fit(mod1)
    result = s.transform(mod1)

    expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]])
    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(result.values, expected)
    tm.assert_index_equal(result.index, s.index)

    mod1 = s.pp.LabelBinarizer()
    result = s.fit_transform(mod1)
    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(result.values, expected)

    inversed = result.inverse_transform(mod1)
    self.assertIsInstance(inversed, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(inversed.values.flatten(), arr)
    tm.assert_index_equal(result.index, s.index)
Example 23
def setup_model(self, config):
    self.set_params(**config.params)

    self.label_encoder = LabelBinarizer()
    self.gaz_encoder = LabelBinarizer()

    self.graph = tf.Graph()
    self.saver = None

    self.example_type = config.example_type
    self.features = config.features

    self.query_encoder = WordSequenceEmbedding(
        self.padding_length,
        self.token_embedding_dimension,
        self.token_pretrained_embedding_filepath,
    )

    if self.use_char_embeddings:
        self.char_encoder = CharacterSequenceEmbedding(
            self.padding_length,
            self.character_embedding_dimension,
            self.max_char_per_word,
        )
Example 24
def _gaz_transform(self, list_of_tokens_to_transform):
    """This function is used to handle special logic around SKLearn's LabelBinarizer
    class which behaves in a non-standard way for 2 classes. In a 2 class system,
    it encodes the classes as [0] and [1]. However, in a 3 class system, it encodes
    the classes as [0,0,1], [0,1,0], [1,0,0] and sustains this behavior for num_class > 2.
    We want to encode 2 class systems as [0,1] and [1,0]. This function does that.

    Args:
        list_of_tokens_to_transform (list): A sequence of class labels

    Returns:
        (array): corrected encoding from the binarizer
    """
    output = self.gaz_encoder.transform(list_of_tokens_to_transform)
    if len(self.gaz_encoder.classes_) == 2:
        output = np.hstack((1 - output, output))
    return output
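The two-class behaviour described in the docstring above is easy to verify with a short standalone sketch (the label strings here are illustrative, not taken from the original project):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer().fit(['in_gaz', 'not_in_gaz'])  # two classes
out = lb.transform(['not_in_gaz', 'in_gaz'])         # single 0/1 column, shape (2, 1)
two_col = np.hstack((1 - out, out))                  # the correction applied above -> [[0, 1], [1, 0]]

lb3 = LabelBinarizer().fit(['a', 'b', 'c'])          # three or more classes
onehot = lb3.transform(['b'])                        # one-hot row, shape (1, 3)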
Example 25
def _get_child_predict(self, clf, X, index=None):
    if self.stack_by_proba and hasattr(clf, 'predict_proba'):
        if self.save_stage0 and index is not None:
            proba = util.saving_predict_proba(clf, X, index)
        else:
            proba = clf.predict_proba(X)
        return proba[:, 1:]
    elif hasattr(clf, 'predict'):
        predict_result = clf.predict(X)
        if isinstance(clf, ClassifierMixin):
            lb = LabelBinarizer()
            lb.fit(predict_result)
            return lb.fit_transform(predict_result)
        else:
            return predict_result.reshape((predict_result.size, 1))
    else:
        return clf.fit_transform(X)
Example 26
def _fit_data(self, X):
    """Binarize the data for each column separately.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    X_transformed : array-like
        Returns the data where in each columns the labels are
        binarized.
    """
    if self.binarize is not None:
        X = binarize(X, threshold=self.binarize)

    for i in range(X.shape[1]):
        # initialise binarizer and save
        binarizer = LabelBinarizer()
        if self.binarize:
            binarizer.classes_ = np.array([0, 1])
        # fit the data to the binarizer
        binarizer.fit(X[:, i])
        self._binarizers.append(binarizer)

    return self._transform_data(X)
Example 27
def test_sklearn_preinit(self):
    m = np.array([1.0, .81, .85, .81, .85, .81])
    u = np.array([1.0, .23, .50, .23, .30, 0.13])

    # Create the train dataset.
    X_train, true_links = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)

    binarizer = LabelBinarizer()
    binarizer.classes_ = np.array([0, 1])
    binarizer.transform(X_train.iloc[:, 1])
    assert len(binarizer.classes_) == 2
Example 28
def knn_class_fit(train, label):
    """Train the classification model."""
    binary = LabelBinarizer()  # binarize the labels
    y_ = binary.fit_transform(label)
    clf = KNeighborsClassifier()
    clf.fit(train, np.ravel(y_))
    return clf, binary
Example 29
def binarize_col(train, test, col):
    encoder = LabelBinarizer()
    cat_train_1hot = encoder.fit_transform(train[col])
    cat_test_1hot = encoder.transform(test[col])
    return cat_train_1hot, cat_test_1hot