Python源码示例:sklearn.preprocessing.MultiLabelBinarizer()
示例1
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """Prepare evaluation state from the IRMAS testing metadata file.

    The metadata CSV is read as two columns:

    filename  : string
    class_ids : string of ints with space as a delimiter

    Args:
        model_module: Object whose ``BASE_NAME`` selects the feature
            directory and the dataset-mean file.
        weights_path: Stored on the instance unchanged.
        evaluation_strategy: Stored on the instance unchanged.
    """
    metadata = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
    self.X = list(metadata.filename)

    # Every row carries its class ids as one space-delimited string.
    label_sets = []
    for raw_ids in metadata.class_ids:
        label_sets.append([int(token) for token in raw_ids.split()])

    self.ml_binarizer = MultiLabelBinarizer().fit(label_sets)
    self.y_true = self.ml_binarizer.transform(label_sets)

    # The three prediction buffers share the shape of the binarized truth.
    truth_shape = self.y_true.shape
    self.y_pred = np.zeros(shape=truth_shape)
    self.y_pred_raw = np.zeros(shape=truth_shape)
    self.y_pred_raw_average = np.zeros(shape=truth_shape)

    self.model_module = model_module
    self.weights_path = weights_path

    base_name = model_module.BASE_NAME
    feature_dir = os.path.join(IRMAS_TEST_FEATURE_BASEPATH, base_name)
    self.feature_filenames = os.listdir(feature_dir)
    mean_path = os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(base_name))
    self.dataset_mean = np.load(mean_path)

    self.evaluation_strategy = evaluation_strategy
    # Candidate thresholds kept separately for the "s1" and "s2" lists.
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
示例2
def cat_onehot_encoder_m(df, y, col, selection=True):
    """Multi-hot encode the rows of ``df`` into a sparse float CSR matrix.

    Args:
        df: Object exposing ``.values``; each row of ``df.values`` is passed
            to ``MultiLabelBinarizer`` as the label collection of one sample.
        y: Target forwarded to ``train_lightgbm_for_feature_selection``.
        col: Name used only in the printed progress message.
        selection: When exactly ``True``, the feature-selection model is
            trained on the encoded matrix and its AUC is printed.

    Returns:
        Tuple ``(new_feature, mlbs, models, auc_score)``. ``models`` and
        ``auc_score`` are ``None`` when selection is skipped.
    """
    from scipy.sparse import csr_matrix

    rows = df.values
    mlbs = MultiLabelBinarizer(sparse_output=True).fit(rows)
    # csr_matrix(...) already produces CSR, so no extra conversion is needed.
    new_feature = csr_matrix(mlbs.transform(rows), dtype=float)

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(new_feature, y)
        print(col, "auc", auc_score)
    return new_feature, mlbs, models, auc_score
示例3
def setUp(self):
    """Load the overfeat data and replace its label column with indicators.

    The 'Labels' column holds Python-literal collections of exactly five
    labels per row; it is dropped and one 0/1 column per distinct label is
    appended in its place.
    """
    FILENAME = "../data/images/overfeat_raw.txt"
    TARGET = 'Labels'
    data = prepare.data_from_csv(FILENAME, sep='\\t')

    self.SENS = ['Race']
    self.EXPL = []

    # Parse every cell, then check each row carries exactly five labels.
    parsed_rows = [ast.literal_eval(cell) for cell in data[TARGET]]
    for row_labels in parsed_rows:
        assert len(row_labels) == 5

    binarizer = preprocessing.MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(parsed_rows)
    class_names = binarizer.classes_
    indicator_frame = pd.DataFrame(indicator_matrix, columns=class_names)

    features_only = data.drop(TARGET, axis=1)
    combined = pd.concat([features_only, indicator_frame], axis=1)
    self.data = DataSource(combined)
    self.TARGET = class_names.tolist()
示例4
def evaluate(self, preds):
    """Score predictions against ``self.label_dict``.

    Args:
        preds: Mapping from source id to its predicted labels.

    Returns:
        Dict with 'accuracy' (sequential accuracy), 'f1' (weighted F1) and
        'macro_f1', the F1 scores being computed on multi-hot encodings.
    """
    srcids = list(preds.keys())
    true = [self.label_dict[srcid] for srcid in srcids]
    pred = [preds[srcid] for srcid in srcids]

    # Hand the metric its own list objects, separate from those binarized
    # below.
    acc = eval_func.sequential_accuracy(list(true), list(pred))

    # Fit on predictions and truths together so both share one column set.
    mlb = MultiLabelBinarizer()
    mlb.fit(pred + true)
    encoded_true = mlb.transform(true)
    encoded_pred = mlb.transform(pred)

    macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
    weighted_f1 = f1_score(encoded_true, encoded_pred, average='weighted')
    return {
        'accuracy': acc,
        'f1': weighted_f1,
        'macro_f1': macro_f1
    }
示例5
def feature_vectorizer(X_train, X_test, y_train, y_test):
    """Vectorize text with TF-IDF and labels with a multi-label binarizer.

    Both transformers are fitted on the training split only and saved via
    ``_save_data`` before being applied to the train and test splits.

    Returns:
        ``(X_train_features, X_test_features, y_train_features,
        y_test_features)``.
    """
    tfidf_options = dict(
        analyzer="word", min_df=0.0,
        max_df=1.0, strip_accents=None,
        encoding="utf-8", preprocessor=None,
        token_pattern=r"(?u)\S\S+", max_features=1000,
    )
    vectorizer = TfidfVectorizer(**tfidf_options)

    # The vocabulary comes from the training texts only.
    vectorizer.fit(X_train)
    _save_data(vectorizer, "/workdir/models/X_vectorizer.pk")
    X_train_features = vectorizer.transform(X_train)
    X_test_features = vectorizer.transform(X_test)

    # The label columns likewise come from the training labels only.
    mlb = MultiLabelBinarizer()
    mlb.fit(y_train)
    _save_data(mlb, "/workdir/models/label_binarizer.pk")
    y_train_features = mlb.transform(y_train)
    y_test_features = mlb.transform(y_test)

    return X_train_features, X_test_features, y_train_features, y_test_features
示例6
def _build_label_dict(self,
                      labels: List[str]):
    """Build the label/index lookup tables and the multi-label binarizer.

    Args:
        labels: One entry per sample. When ``self.multi_label`` is true each
            entry is iterated and its items are the labels; otherwise each
            entry is itself a label.

    Side effects:
        Sets ``self.label2idx`` (label -> index in sorted label order),
        ``self.idx2label`` (the inverse mapping),
        ``self.dataset_info['label_count']`` and
        ``self.multi_label_binarizer``.
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    if self.multi_label:
        label_set = set()
        for sample_labels in labels:
            # Update in place rather than rebuilding the set for every sample.
            label_set.update(sample_labels)
    else:
        label_set = set(labels)

    # Indices follow the sorted order of the distinct labels.
    self.label2idx = {label: idx for idx, label in enumerate(sorted(label_set))}
    self.idx2label = {idx: label for label, idx in self.label2idx.items()}
    self.dataset_info['label_count'] = len(self.label2idx)
    self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))
示例7
def text_similarity(df, col):
    """One-hot encode the characters that occur in each distinct string.

    Every unique value of ``df[col]`` is turned into its list of code points
    and multi-hot encoded, giving one ``<col>_<codepoint>`` column per
    distinct character. This can help when similarity between strings
    matters.

    Returns:
        DataFrame indexed by the unique values of ``col``.
    """
    unique = pd.DataFrame(df[col].unique(), columns=[col])

    # Represent each string by the code points of its characters.
    codepoints = unique[col].apply(lambda text: list(map(ord, text)))

    mlb = preprocessing.MultiLabelBinarizer()
    indicator = pd.DataFrame(
        mlb.fit_transform(codepoints),
        columns=mlb.classes_,
        index=unique.index,
    )
    indicator = indicator.add_prefix(col + "_")

    return unique.join(indicator).set_index(col)
示例8
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None,
             multilabel=False):
    """Fit a word ``Tokenizer`` on sampled captions.

    Args:
        min_df: Forwarded to ``Tokenizer``.
        max_features: Forwarded to ``Tokenizer``.
        n_captions: Forwarded to ``coco`` and ``sampleCaptions``.
        n_sbu: When truthy, the second value returned by
            ``sbuXYFilenames(n_sbu)`` is appended to the COCO captions.
        multilabel: When truthy, a ``MultiLabelBinarizer`` is also fitted on
            the tokenized captions.

    Returns:
        ``vect`` alone, or ``(vect, mlb)`` when ``multilabel`` is truthy.
    """
    # A parenthesised single argument prints the same text on Python 2 and
    # is valid syntax on Python 3.
    print("prepping the Word Tokenizer...")

    _, _, trY, _ = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _, sbuY, _ = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)

    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)

    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    return vect
示例9
def test_objectmapper(self):
    """Check that ``df.preprocessing`` exposes the very same objects as ``pp``."""
    df = pdml.ModelFrame([])

    # Checked in this order; the first mismatch fails the test.
    expected_names = (
        'Binarizer',
        'FunctionTransformer',
        'Imputer',
        'KernelCenterer',
        'LabelBinarizer',
        'LabelEncoder',
        'MultiLabelBinarizer',
        'MaxAbsScaler',
        'MinMaxScaler',
        'Normalizer',
        'OneHotEncoder',
        'PolynomialFeatures',
        'RobustScaler',
        'StandardScaler',
    )
    for name in expected_names:
        self.assertIs(getattr(df.preprocessing, name), getattr(pp, name))
示例10
def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed):
    """Train a one-vs-rest logistic regression on node embeddings and score it.

    Args:
        embedding_look_up: Forwarded to ``split_train_test_classify``.
        node_list: Forwarded to ``split_train_test_classify``.
        labels: Forwarded to ``split_train_test_classify``.
        testing_ratio: Forwarded to ``split_train_test_classify``.
        seed: Seed for the split and for ``LogisticRegression``.

    Returns:
        ``(accuracy, micro_f1, macro_f1)`` on the test split; the same
        figures are also printed.
    """
    X_train, y_train, X_test, y_test = split_train_test_classify(embedding_look_up, node_list, labels,
                                                                 testing_ratio=testing_ratio, seed=seed)
    binarizer = MultiLabelBinarizer(sparse_output=True)
    # Fit on train and test labels together so both share one column set.
    y_all = np.append(y_train, y_test)
    binarizer.fit(y_all)
    # toarray() gives plain ndarrays; todense() would give np.matrix, which
    # recent scikit-learn estimators refuse as input.
    y_train = binarizer.transform(y_train).toarray()
    y_test = binarizer.transform(y_test).toarray()

    model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs'))
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)

    ## small trick : we assume that we know how many label to predict
    y_pred = get_y_pred(y_test, y_pred_prob)

    accuracy = accuracy_score(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print('#' * 9 + ' Node Classification Performance ' + '#' * 9)
    print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}')
    print('#' * 50)
    return accuracy, micro_f1, macro_f1
示例11
def build_input_label_data(labels, class_order):
    """Map each sample's labels to their column indices in ``class_order``.

    Args:
        labels: One collection of labels per sample.
        class_order: Classes handed to ``MultiLabelBinarizer(classes=...)``;
            their positions are the indices that get returned.

    Returns:
        A list with one list per sample, holding the column indices of the
        non-zero entries of that sample's row in the binarized matrix.
    """
    from sklearn.preprocessing import MultiLabelBinarizer

    bml = MultiLabelBinarizer(classes=class_order, sparse_output=True)
    # sp.find yields the row indices, column indices and values of the
    # non-zero entries; only the two index arrays are needed here.
    rows, cols, _ = sp.find(bml.fit_transform(labels))

    y = [[] for _ in labels]
    for row, col in zip(rows, cols):
        y[row].append(col)
    return y
# padding operation
# =========================================================
示例12
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Bundle everything needed for an experiment into one object.

    Keeping the pieces together makes it easy to serialize and deserialize
    them as a unit.

    Args:
        inputs: The raw model inputs. May be None if the value should not be
            serialized when the dataset is saved.
        labels: The raw output labels.
        test_indices: Optional test indices. Ideally generated once and
            reused across experiments so results stay comparable;
            `generate_test_indices` can produce them the first time.
        **kwargs: Additional key/value items, each stored as an attribute.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)

    # Extra keyword items become plain attributes on the instance.
    for attr_name, attr_value in kwargs.items():
        setattr(self, attr_name, attr_value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    # A set/list/tuple as the first label marks the data as multi-label.
    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    if self.is_multi_label:
        self.label_encoder = MultiLabelBinarizer()
    else:
        self.label_encoder = LabelBinarizer()

    # NOTE(review): flatten() collapses the encoder output to 1-D even when
    # it has several columns - confirm this is intended.
    encoded = self.label_encoder.fit_transform(self.y)
    self.y = encoded.flatten()
示例13
def __init__(self, vectors, clf):
    """Keep the embeddings, wrap ``clf`` in a ``TopKRanker`` and create a
    sparse-output label binarizer.

    Args:
        vectors: Stored as ``self.embeddings``.
        clf: Classifier handed to ``TopKRanker``.
    """
    ranker = TopKRanker(clf)
    label_binarizer = MultiLabelBinarizer(sparse_output=True)

    self.embeddings = vectors
    self.clf = ranker
    self.binarizer = label_binarizer
示例14
def setUp(self):
    """Load the yeast training set as dense lists with multi-hot labels."""
    module_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(module_dir, 'datasets/yeast_train.svm')

    features, label_sets = load_svmlight_file(dataset_filepath, multilabel=True)

    # Both features and labels are stored as plain nested lists.
    self.X = features.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(label_sets).tolist()
    self.quota = 10
示例15
def main(argv=sys.argv):
    """Run the discovery experiment on the recommender data set.

    Loads the recommendations file, replaces the 'Types' column with one
    indicator column per distinct label, then trains, tests and reports a
    ``Discovery`` investigation with 'Gender' as the sensitive attribute.

    Args:
        argv: Command-line arguments; ``usage`` is called unless there is
            exactly one entry.
    """
    if len(argv) != 1:
        usage(argv)

    FILENAME = "../../../data/recommender/recommendations.txt"
    OUTPUT_DIR = "."
    TARGET = 'Types'
    SENS = ['Gender']
    EXPL = []

    dropped_columns = ['RMSE', 'Avg Movie Age',
                       'Avg Recommended Rating',
                       'Avg Seen Rating', 'Occupation']
    data = prepare.data_from_csv(FILENAME, sep='\\t', to_drop=dropped_columns)

    # Parse every cell, then check each row carries exactly five labels.
    parsed_rows = [ast.literal_eval(cell) for cell in data[TARGET]]
    for row_labels in parsed_rows:
        assert len(row_labels) == 5

    label_encoder = preprocessing.MultiLabelBinarizer()
    indicator_matrix = label_encoder.fit_transform(parsed_rows)
    class_names = label_encoder.classes_
    indicator_frame = pd.DataFrame(indicator_matrix, columns=class_names)

    data = pd.concat([data.drop(TARGET, axis=1), indicator_frame], axis=1)
    TARGET = class_names.tolist()
    data_source = DataSource(data)

    # Instantiate the experiment
    inv = Discovery(data_source, SENS, TARGET, EXPL, topk=10, random_state=0)
    investigations = [inv]

    # Train the classifier
    train(investigations)

    # Evaluate on the testing set
    test(investigations)

    # Create the report
    report(investigations, "discovery", OUTPUT_DIR)
示例16
def __init__(self, embeddings, clf):
    """Keep the embeddings, wrap ``clf`` in a ``TopKRanker`` and create a
    sparse-output label binarizer.

    Args:
        embeddings: Stored as ``self.embeddings``.
        clf: Classifier handed to ``TopKRanker``.
    """
    ranker = TopKRanker(clf)
    label_binarizer = MultiLabelBinarizer(sparse_output=True)

    self.embeddings = embeddings
    self.clf = ranker
    self.binarizer = label_binarizer
示例17
def binarize_labels(labels, sparse_output=False, return_classes=False):
    """Convert labels vector to a binary label matrix.

    In the default single-label case, labels look like
    labels = [y1, y2, y3, ...].
    Also supports the multi-label format.
    In this case, labels should look something like
    labels = [[y11, y12], [y21, y22, y23], [y31], ...].

    The format is decided from the first entry only. String (and bytes)
    labels count as single labels even though they are iterable, so they
    are not split into characters.

    Parameters
    ----------
    labels : array-like, shape [num_samples]
        Array of node labels in categorical single- or multi-label format.
    sparse_output : bool, default False
        Whether return the label_matrix in CSR format.
    return_classes : bool, default False
        Whether return the classes corresponding to the columns of the label matrix.

    Returns
    -------
    label_matrix : np.ndarray or sp.csr_matrix, shape [num_samples, num_classes]
        Binary matrix of class labels, cast to float32.
        num_classes = number of unique values in "labels" array.
        label_matrix[i, k] = 1 <=> node i belongs to class k.
    classes : np.array, shape [num_classes], optional
        Classes that correspond to each column of the label_matrix.

    """
    first = labels[0]
    # On Python 3 str and bytes expose __iter__, yet each one is a single
    # label and must not be treated as a collection of labels.
    is_multilabel = hasattr(first, '__iter__') and not isinstance(first, (str, bytes))

    if is_multilabel:
        binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
    else:
        binarizer = LabelBinarizer(sparse_output=sparse_output)

    label_matrix = binarizer.fit_transform(labels).astype(np.float32)
    return (label_matrix, binarizer.classes_) if return_classes else label_matrix
示例18
def multilabel_to_indicator_df(y: List[List[str]], labels: List[str]) -> pd.DataFrame:
    """
    Turn per-observation label lists into a 0/1 indicator dataframe.

    Args:
        y: List of label lists, one per observation
        labels: List of all unique labels found in y; fixes the column order

    Returns:
        A dataframe with one row per observation and one column per label,
        holding 1 where the observation has that label and 0 otherwise.
    """
    binarizer = MultiLabelBinarizer(classes=labels)
    indicator_matrix = binarizer.fit_transform(y)
    column_names = binarizer.classes_
    return pd.DataFrame(indicator_matrix, columns=column_names)
示例19
def __init__(self, estimator: TEXT_CLASSIFIER_ESTIMATOR, multilabel=False, **params):
    """Record the estimator choice and create its placeholder attribute.

    Args:
        estimator: Which estimator to use.
        multilabel: When truthy, a ``MultiLabelBinarizer`` is created as
            ``self.y_encoder``; otherwise that attribute is not set here.
        **params: Extra options kept as ``self.params``. For the PIPELINE
            estimator, ``params["pipeline"]`` (if given) becomes
            ``self.pipeline``.
    """
    self.estimator = estimator
    self.params = params
    self.multilabel = multilabel
    if multilabel:
        self.y_encoder = MultiLabelBinarizer()

    # Each estimator kind gets its own slot, left empty where nothing was
    # supplied.
    if estimator == TEXT_CLASSIFIER_ESTIMATOR.FAST_TEXT:
        self.ft = None
    if estimator == TEXT_CLASSIFIER_ESTIMATOR.SVC:
        self.svc = None
    if estimator == TEXT_CLASSIFIER_ESTIMATOR.PIPELINE:
        self.pipeline = params.get("pipeline")
示例20
def binarize_labels(true_labels, pred_labels, excluding_labels=None):
    """Multi-hot encode true and predicted labels over one shared column set.

    Args:
        true_labels: Mapping from source id to its true labels.
        pred_labels: Mapping from source id to its predicted labels.
        excluding_labels: Labels left out of the encoding. Defaults to
            ``['building-ebu3b']`` when omitted.

    Returns:
        ``(true_mat, pred_mat)``. Row ``i`` of both matrices belongs to the
        ``i``-th key of ``pred_labels``.

    Raises:
        KeyError: If a key of ``pred_labels`` is missing from ``true_labels``.
    """
    if excluding_labels is None:
        excluding_labels = ['building-ebu3b']
    excluded = set(excluding_labels)

    def _keep(labels):
        # Drop the excluded labels from one sample's label collection.
        return [label for label in labels if label not in excluded]

    # The columns come from every label seen on either side.
    tot_labels = [_keep(labels)
                  for labels in list(pred_labels.values()) + list(true_labels.values())]
    mlb = MultiLabelBinarizer().fit(tot_labels)

    # Index both mappings by the same ids so row i of each matrix describes
    # the same source.
    srcids = list(pred_labels.keys())
    pred_mat = mlb.transform([_keep(pred_labels[srcid]) for srcid in srcids])
    true_mat = mlb.transform([_keep(true_labels[srcid]) for srcid in srcids])
    return true_mat, pred_mat
示例21
def evaluate(self, preds):
    """Score predictions against ``self.label_dict``.

    Args:
        preds: Mapping from source id to a per-metadata-type collection of
            predicted tags.

    Returns:
        Dict with 'accuracy' (sequential accuracy over the tags combined
        across ``self.available_metadata_types``), 'f1' (weighted F1) and
        'macro_f1'.
    """
    srcids = list(preds.keys())

    # Combine each source's tags across all metadata types with ``adder``.
    pred_tags_list = []
    for srcid in srcids:
        per_type = [preds[srcid][t] for t in self.available_metadata_types]
        pred_tags_list.append(reduce(adder, per_type))

    true_tags_list = []
    for srcid in srcids:
        per_type = [self.label_dict[srcid][t] for t in self.available_metadata_types]
        true_tags_list.append(reduce(adder, per_type))

    acc = eval_func.sequential_accuracy(true_tags_list,
                                        pred_tags_list)

    # NOTE(review): the F1 inputs are the raw per-source entries, not the
    # combined tag lists above; if those entries are mappings keyed by
    # metadata type, the binarizer sees only the keys - confirm intended.
    pred = [preds[srcid] for srcid in srcids]
    true = [self.label_dict[srcid] for srcid in srcids]

    mlb = MultiLabelBinarizer()
    mlb.fit(pred + true)
    encoded_true = mlb.transform(true)
    encoded_pred = mlb.transform(pred)

    macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
    weighted_f1 = f1_score(encoded_true, encoded_pred, average='weighted')
    return {
        'accuracy': acc,
        'f1': weighted_f1,
        'macro_f1': macro_f1
    }
示例22
def binarize_labels(true_labels, pred_labels):
    """Multi-hot encode true and predicted labels over one shared column set.

    Args:
        true_labels: Mapping from source id to its true labels.
        pred_labels: Mapping from source id to its predicted labels.

    Returns:
        ``(true_mat, pred_mat)``. Row ``i`` of both matrices belongs to the
        ``i``-th key of ``pred_labels``.

    Raises:
        KeyError: If a key of ``pred_labels`` is missing from ``true_labels``.
    """
    # The columns come from every label seen on either side.
    tot_labels = [list(labels) for labels in
                  list(pred_labels.values()) + list(true_labels.values())]
    mlb = MultiLabelBinarizer().fit(tot_labels)

    # Index both mappings by the same ids so row i of each matrix describes
    # the same source.
    srcids = list(pred_labels.keys())
    pred_mat = mlb.transform([pred_labels[srcid] for srcid in srcids])
    true_mat = mlb.transform([true_labels[srcid] for srcid in srcids])
    return true_mat, pred_mat
示例23
def __init__(self, vectors, clf):
    """Keep the embeddings, wrap ``clf`` in a ``TopKRanker`` and create a
    sparse-output label binarizer.

    Args:
        vectors: Stored as ``self.embeddings``.
        clf: Classifier handed to ``TopKRanker``.
    """
    ranker = TopKRanker(clf)
    label_binarizer = MultiLabelBinarizer(sparse_output=True)

    self.embeddings = vectors
    self.clf = ranker
    self.binarizer = label_binarizer
示例24
def fit(self, X, y=None):
    """Fit one ``MultiLabelBinarizer`` per column of ``X``.

    Every cell is split on ``self.sep`` and the resulting tokens are that
    cell's labels.

    Args:
        X: DataFrame whose cells provide ``split``.
        y: Unused.

    Returns:
        ``self``, with the fitted binarizers stored in ``self.mlbs`` in
        column order.
    """
    tokenised = X.applymap(lambda cell: cell.split(self.sep))

    binarizers = []
    for column in X.columns:
        binarizers.append(MultiLabelBinarizer().fit(tokenised[column]))

    self.mlbs = binarizers
    return self
示例25
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Score a factor's explicitness as the ROC-AUC of a logistic regression.

    The representations are transposed before use, so the classifier sees
    one row per point.

    Args:
        mus_train: Training representation, (num_codes, num_points)-np array.
        y_train: Ground-truth factor values for the training points.
        mus_test: Testing representation, (num_codes, num_points)-np array.
        y_test: Ground-truth factor values for the testing points.

    Returns:
        roc_train: ROC-AUC score of the classifier on training data.
        roc_test: ROC-AUC score of the classifier on testing data.
    """
    train_inputs = np.transpose(mus_train)
    test_inputs = np.transpose(mus_test)

    classifier = LogisticRegression().fit(train_inputs, y_train)
    train_scores = classifier.predict_proba(train_inputs)
    test_scores = classifier.predict_proba(test_inputs)

    # One binarizer is refitted for each split to one-hot the factor values.
    mlb = MultiLabelBinarizer()
    train_onehot = mlb.fit_transform(np.expand_dims(y_train, 1))
    roc_train = roc_auc_score(train_onehot, train_scores)
    test_onehot = mlb.fit_transform(np.expand_dims(y_test, 1))
    roc_test = roc_auc_score(test_onehot, test_scores)

    return roc_train, roc_test
示例26
def test_BRKnna_no_labels_take_closest(self):
    """Mode 'a' with 2 neighbours and threshold 0.6: the query [0, 1] is
    expected to receive only the first label column."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    label_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid2', 'lid3'], ['lid0', 'lid5']]
    targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_ids)

    classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    classifier.fit(features, targets)

    query = csr.csr_matrix([[0, 1]])
    pred = classifier.predict(query).todense()
    print(pred)
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], pred)
示例27
def test_BRKnna_predict(self):
    """Mode 'a' with 3 neighbours and threshold 0.5 on sparse targets: the
    query [1.1, 1.1] is expected to receive the 4th and 5th label columns."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_ids)

    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)

    query = csr.csr_matrix([[1.1, 1.1]])
    pred = classifier.predict(query).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
示例28
def test_BRKnna_predict_dense(self):
    """Mode 'a' with 3 neighbours and threshold 0.5 on dense targets: the
    query [1.1, 1.1] is expected to receive the 4th and 5th label columns."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    targets = MultiLabelBinarizer().fit_transform(label_ids)

    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)

    query = csr.csr_matrix([[1.1, 1.1]])
    pred = classifier.predict(query).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
示例29
def test_BRKnnb_predict(self):
    """Mode 'b' with 3 neighbours on sparse targets: the query [0, 1] is
    expected to receive the first two label columns."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    targets = MultiLabelBinarizer(sparse_output=True).fit_transform(label_ids)

    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)

    query = csr.csr_matrix([[0, 1]])
    pred = classifier.predict(query).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)
示例30
def test_BRKnnb_predict_dense(self):
    """Mode 'b' with 3 neighbours on dense targets: the query [0, 1] is
    expected to receive the first two label columns."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    label_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    targets = MultiLabelBinarizer(sparse_output=False).fit_transform(label_ids)

    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)

    query = csr.csr_matrix([[0, 1]])
    pred = classifier.predict(query).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)