Python源码示例:sklearn.datasets.make_multilabel_classification()
示例1
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
n_samples=20, random_state=0):
_, y_true = make_multilabel_classification(n_features=1,
allow_unlabeled=False,
random_state=random_state,
n_classes=n_classes,
n_samples=n_samples)
# Score with ties
y_score = sparse_random_matrix(n_components=y_true.shape[0],
n_features=y_true.shape[1],
random_state=random_state)
if hasattr(y_score, "toarray"):
y_score = y_score.toarray()
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
# Uniform score
random_state = check_random_state(random_state)
y_score = random_state.uniform(size=(n_samples, n_classes))
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
示例2
def test_multilabel_sample_weight_invariance(name):
# multilabel indicator
random_state = check_random_state(0)
_, ya = make_multilabel_classification(n_features=1, n_classes=20,
random_state=0, n_samples=100,
allow_unlabeled=False)
_, yb = make_multilabel_classification(n_features=1, n_classes=20,
random_state=1, n_samples=100,
allow_unlabeled=False)
y_true = np.vstack([ya, yb])
y_pred = np.vstack([ya, ya])
y_score = random_state.randint(1, 4, size=y_true.shape)
metric = ALL_METRICS[name]
if name in THRESHOLDED_METRICS:
check_sample_weight_invariance(name, metric, y_true, y_score)
else:
check_sample_weight_invariance(name, metric, y_true, y_pred)
示例3
def test_multilabel_classification():
# Test that multi-label classification works as expected.
# test fit method
X, y = make_multilabel_classification(n_samples=50, random_state=0,
return_indicator=True)
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
max_iter=150, random_state=0, activation='logistic',
learning_rate_init=0.2)
mlp.fit(X, y)
assert_greater(mlp.score(X, y), 0.97)
# test partial fit method
mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
random_state=0, activation='logistic', alpha=1e-5,
learning_rate_init=0.2)
for i in range(100):
mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
assert_greater(mlp.score(X, y), 0.9)
# Make sure early stopping still work now that spliting is stratified by
# default (it is disabled for multilabel classification)
mlp = MLPClassifier(early_stopping=True)
mlp.fit(X, y).predict(X)
示例4
def test_predict_proba_multilabel():
# Test that predict_proba works as expected for multilabel.
# Multilabel should not use softmax which makes probabilities sum to 1
X, Y = make_multilabel_classification(n_samples=50, random_state=0,
return_indicator=True)
n_samples, n_classes = Y.shape
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
random_state=0)
clf.fit(X, Y)
y_proba = clf.predict_proba(X)
assert_equal(y_proba.shape, (n_samples, n_classes))
assert_array_equal(y_proba > 0.5, Y)
y_log_proba = clf.predict_log_proba(X)
proba_max = y_proba.argmax(axis=1)
proba_log_max = y_log_proba.argmax(axis=1)
assert_greater((y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1), 1e-10)
assert_array_equal(proba_max, proba_log_max)
assert_array_equal(y_log_proba, np.log(y_proba))
示例5
def test_ovr_multilabel_dataset():
base_clf = MultinomialNB(alpha=1)
for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
X, Y = datasets.make_multilabel_classification(n_samples=100,
n_features=20,
n_classes=5,
n_labels=2,
length=50,
allow_unlabeled=au,
random_state=0)
X_train, Y_train = X[:80], Y[:80]
X_test, Y_test = X[80:], Y[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
assert clf.multilabel_
assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
prec,
decimal=2)
assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
recall,
decimal=2)
示例6
def test_make_multilabel_classification_return_indicator():
for allow_unlabeled, min_length in zip((True, False), (0, 1)):
X, Y = make_multilabel_classification(n_samples=25, n_features=20,
n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled)
assert_equal(X.shape, (25, 20), "X shape mismatch")
assert_equal(Y.shape, (25, 3), "Y shape mismatch")
assert np.all(np.sum(Y, axis=0) > min_length)
# Also test return_distributions and return_indicator with True
X2, Y2, p_c, p_w_c = make_multilabel_classification(
n_samples=25, n_features=20, n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled, return_distributions=True)
assert_array_almost_equal(X, X2)
assert_array_equal(Y, Y2)
assert_equal(p_c.shape, (3,))
assert_almost_equal(p_c.sum(), 1)
assert_equal(p_w_c.shape, (20, 3))
assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
示例7
def test_sparse_multilabel_targets(n_neighbors, n_jobs):
X, y_dense = make_multilabel_classification(random_state=123)
thresh = 80
knn = KNeighborsClassifier(n_neighbors=n_neighbors,
n_jobs=n_jobs, )
assert not issparse(y_dense)
knn.fit(X[:thresh], y_dense[:thresh])
y_pred = knn.predict(X[thresh:])
y_sparse = csr_matrix(y_dense)
knn = KNeighborsClassifier(n_neighbors=n_neighbors,
n_jobs=n_jobs,)
assert issparse(y_sparse)
knn.fit(X[:thresh], y_sparse[:thresh])
y_pred_sparse = knn.predict(X[thresh:, :])
# Test array equality
np.testing.assert_array_equal(y_pred, y_pred_sparse.toarray())
示例8
def generate_classification(self, num_classes, num_features, num_samples, test_split=0.1, seed=0):
"""Generate a classification task
Arguments:
num_classes {int} -- Number of classes
num_features {int} -- Number of features
num_samples {int} -- Number of samples
Keyword Arguments:
test_split {float} -- Size of test split (default: {0.1})
seed {int} -- A random seed (default: {0})
"""
#X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
X, y = make_multilabel_classification(
n_samples=num_samples, n_features=num_features, n_classes=num_classes, n_labels=0.01,
length=50, allow_unlabeled=False, sparse=False, return_indicator='dense',
return_distributions=False, random_state=seed
)
Y = np.argmax(y, axis=1)
self.categorical_features = [False] * num_features
self.problem_type = ProblemType.FeatureClassification
self.X, self.Y = X, Y
self._split_data(test_split, seed)
示例9
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
n_samples=20, random_state=0):
_, y_true = make_multilabel_classification(n_features=1,
allow_unlabeled=False,
random_state=random_state,
n_classes=n_classes,
n_samples=n_samples)
# Score with ties
y_score = sparse_random_matrix(n_components=y_true.shape[0],
n_features=y_true.shape[1],
random_state=random_state)
if hasattr(y_score, "toarray"):
y_score = y_score.toarray()
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
# Uniform score
random_state = check_random_state(random_state)
y_score = random_state.uniform(size=(n_samples, n_classes))
score_lrap = label_ranking_average_precision_score(y_true, y_score)
score_my_lrap = _my_lrap(y_true, y_score)
assert_almost_equal(score_lrap, score_my_lrap)
示例10
def test_multilabel_classification():
# Test that multi-label classification works as expected.
# test fit method
X, y = make_multilabel_classification(n_samples=50, random_state=0,
return_indicator=True)
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
max_iter=150, random_state=0, activation='logistic',
learning_rate_init=0.2)
mlp.fit(X, y)
assert_equal(mlp.score(X, y), 1)
# test partial fit method
mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
random_state=0, activation='logistic', alpha=1e-5,
learning_rate_init=0.2)
for i in range(100):
mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
assert_greater(mlp.score(X, y), 0.9)
示例11
def test_predict_proba_multilabel():
# Test that predict_proba works as expected for multilabel.
# Multilabel should not use softmax which makes probabilities sum to 1
X, Y = make_multilabel_classification(n_samples=50, random_state=0,
return_indicator=True)
n_samples, n_classes = Y.shape
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
random_state=0)
clf.fit(X, Y)
y_proba = clf.predict_proba(X)
assert_equal(y_proba.shape, (n_samples, n_classes))
assert_array_equal(y_proba > 0.5, Y)
y_log_proba = clf.predict_log_proba(X)
proba_max = y_proba.argmax(axis=1)
proba_log_max = y_log_proba.argmax(axis=1)
assert_greater((y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1), 1e-10)
assert_array_equal(proba_max, proba_log_max)
assert_array_equal(y_log_proba, np.log(y_proba))
示例12
def test_ovr_multilabel_dataset():
base_clf = MultinomialNB(alpha=1)
for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
X, Y = datasets.make_multilabel_classification(n_samples=100,
n_features=20,
n_classes=5,
n_labels=2,
length=50,
allow_unlabeled=au,
random_state=0)
X_train, Y_train = X[:80], Y[:80]
X_test, Y_test = X[80:], Y[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
assert_true(clf.multilabel_)
assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
prec,
decimal=2)
assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
recall,
decimal=2)
示例13
def test_make_multilabel_classification_return_indicator():
for allow_unlabeled, min_length in zip((True, False), (0, 1)):
X, Y = make_multilabel_classification(n_samples=25, n_features=20,
n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled)
assert_equal(X.shape, (25, 20), "X shape mismatch")
assert_equal(Y.shape, (25, 3), "Y shape mismatch")
assert_true(np.all(np.sum(Y, axis=0) > min_length))
# Also test return_distributions and return_indicator with True
X2, Y2, p_c, p_w_c = make_multilabel_classification(
n_samples=25, n_features=20, n_classes=3, random_state=0,
allow_unlabeled=allow_unlabeled, return_distributions=True)
assert_array_equal(X, X2)
assert_array_equal(Y, Y2)
assert_equal(p_c.shape, (3,))
assert_almost_equal(p_c.sum(), 1)
assert_equal(p_w_c.shape, (20, 3))
assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
示例14
def setup_mlc_dataset(self):
X, Y = datasets.make_multilabel_classification(
n_features=5, random_state=1126)
return Dataset(X, Y)
示例15
def setUp(self):
X, Y = datasets.make_multilabel_classification(random_state=1126)
self.X_train, self.X_test, self.Y_train, self.Y_test = \
train_test_split(X, Y, test_size=0.3, random_state=1126)
示例16
def split_train_test(test_size):
# choose a dataset with unbalanced class instances
data = make_multilabel_classification(
n_samples=300, n_classes=10, allow_unlabeled=False)
X = StandardScaler().fit_transform(data[0])
Y = data[1]
X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size)
trn_ds = Dataset(X_trn, Y_trn[:5].tolist() + [None] * (len(Y_trn) - 5))
tst_ds = Dataset(X_tst, Y_tst.tolist())
fully_labeled_trn_ds = Dataset(X_trn, Y_trn)
return trn_ds, tst_ds, fully_labeled_trn_ds
示例17
def _prepare_for_use(self):
self._random_state = check_random_state(self.random_state)
self.X, self.y = make_multilabel_classification(n_samples=self.n_samples,
n_features=self.n_features,
n_classes=self.n_targets,
n_labels=self.n_labels,
random_state=self._random_state)
self.target_names = ["target_" + str(i) for i in range(self.n_targets)]
self.feature_names = ["att_num_" + str(i) for i in range(self.n_num_features)]
self.target_values = np.unique(self.y).tolist() if self.n_targets == 1 else \
[np.unique(self.y[:, i]).tolist() for i in range(self.n_targets)]
示例18
def test_multilabel():
"""Check if error is raised for multilabel classification."""
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
allow_unlabeled=False,
random_state=123)
clf = OneVsRestClassifier(SVC(kernel='linear'))
eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')
try:
eclf.fit(X, y)
except NotImplementedError:
return
示例19
def test_sparse_input(EstimatorClass, sparse_matrix):
y, X = datasets.make_multilabel_classification(random_state=0,
n_samples=50,
n_features=1,
n_classes=20)
y = y[:, 0]
check_sparse_input(EstimatorClass, X, sparse_matrix(X), y)
示例20
def test_random_hasher_sparse_data():
X, y = datasets.make_multilabel_classification(random_state=0)
hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
X_transformed = hasher.fit_transform(X)
X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
示例21
def test_sparse_input(name, sparse_matrix):
X, y = datasets.make_multilabel_classification(random_state=0,
n_samples=50)
check_sparse_input(name, X, sparse_matrix(X), y)
示例22
def test_presort_sparse():
ests = (DecisionTreeClassifier(presort=True),
DecisionTreeRegressor(presort=True))
sparse_matrices = (csr_matrix, csc_matrix, coo_matrix)
y, X = datasets.make_multilabel_classification(random_state=0,
n_samples=50,
n_features=1,
n_classes=20)
y = y[:, 0]
for est, sparse_matrix in product(ests, sparse_matrices):
check_presort_sparse(est, sparse_matrix(X), y)
示例23
def test_multilabel_representation_invariance():
# Generate some data
n_classes = 4
n_samples = 50
_, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes,
random_state=0, n_samples=n_samples,
allow_unlabeled=True)
_, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes,
random_state=1, n_samples=n_samples,
allow_unlabeled=True)
# To make sure at least one empty label is present
y1 = np.vstack([y1, [[0] * n_classes]])
y2 = np.vstack([y2, [[0] * n_classes]])
y1_sparse_indicator = sp.coo_matrix(y1)
y2_sparse_indicator = sp.coo_matrix(y2)
for name in MULTILABELS_METRICS:
metric = ALL_METRICS[name]
# XXX cruel hack to work with partial functions
if isinstance(metric, partial):
metric.__module__ = 'tmp'
metric.__name__ = name
measure = metric(y1, y2)
# Check representation invariance
assert_allclose(metric(y1_sparse_indicator, y2_sparse_indicator),
measure,
err_msg="%s failed representation invariance between "
"dense and sparse indicator formats." % name)
示例24
def test_normalize_option_multilabel_classification():
# Test in the multilabel case
n_classes = 4
n_samples = 100
# for both random_state 0 and 1, y_true and y_pred has at least one
# unlabelled entry
_, y_true = make_multilabel_classification(n_features=1,
n_classes=n_classes,
random_state=0,
allow_unlabeled=True,
n_samples=n_samples)
_, y_pred = make_multilabel_classification(n_features=1,
n_classes=n_classes,
random_state=1,
allow_unlabeled=True,
n_samples=n_samples)
# To make sure at least one empty label is present
y_true += [0]*n_classes
y_pred += [0]*n_classes
for name in METRICS_WITH_NORMALIZE_OPTION:
metrics = ALL_METRICS[name]
measure = metrics(y_true, y_pred, normalize=True)
assert_array_less(-1.0 * measure, 0,
err_msg="We failed to test correctly the normalize "
"option")
assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
measure, err_msg="Failed with %s" % name)
示例25
def test_averaging_multilabel(name):
n_samples, n_classes = 40, 5
_, y = make_multilabel_classification(n_features=1, n_classes=n_classes,
random_state=5, n_samples=n_samples,
allow_unlabeled=False)
y_true = y[:20]
y_pred = y[20:]
y_score = check_random_state(0).normal(size=(20, n_classes))
y_true_binarize = y_true
y_pred_binarize = y_pred
check_averaging(name, y_true, y_true_binarize,
y_pred, y_pred_binarize, y_score)
示例26
def test_thresholded_scorers_multilabel_indicator_data():
# Test that the scorer work with multilabel-indicator format
# for multilabel and multi-output multi-class classifier
X, y = make_multilabel_classification(allow_unlabeled=False,
random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Multi-output multi-class predict_proba
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_proba = clf.predict_proba(X_test)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
assert_almost_equal(score1, score2)
# Multi-output multi-class decision_function
# TODO Is there any yet?
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf._predict_proba = clf.predict_proba
clf.predict_proba = None
clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]
y_proba = clf.decision_function(X_test)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
assert_almost_equal(score1, score2)
# Multilabel predict_proba
clf = OneVsRestClassifier(DecisionTreeClassifier())
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
assert_almost_equal(score1, score2)
# Multilabel decision function
clf = OneVsRestClassifier(LinearSVC(random_state=0))
clf.fit(X_train, y_train)
score1 = get_scorer('roc_auc')(clf, X_test, y_test)
score2 = roc_auc_score(y_test, clf.decision_function(X_test))
assert_almost_equal(score1, score2)
示例27
def test_cross_val_predict_sparse_prediction():
# check that cross_val_predict gives same result for sparse and dense input
X, y = make_multilabel_classification(n_classes=2, n_labels=1,
allow_unlabeled=False,
return_indicator=True,
random_state=1)
X_sparse = csr_matrix(X)
y_sparse = csr_matrix(y)
classif = OneVsRestClassifier(SVC(kernel='linear'))
preds = cross_val_predict(classif, X, y, cv=10)
preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10)
preds_sparse = preds_sparse.toarray()
assert_array_almost_equal(preds_sparse, preds)
示例28
def test_cross_val_predict_with_method_multilabel_rf():
# The RandomForest allows multiple classes in each label.
# Output of predict_proba is a list of outputs of predict_proba
# for each individual label.
n_classes = 4
X, y = make_multilabel_classification(n_samples=100, n_labels=3,
n_classes=n_classes, n_features=5,
random_state=42)
y[:, 0] += y[:, 1] # Put three classes in the first column
for method in ['predict_proba', 'predict_log_proba', 'decision_function']:
est = RFWithDecisionFunction(n_estimators=5, random_state=0)
with warnings.catch_warnings():
# Suppress "RuntimeWarning: divide by zero encountered in log"
warnings.simplefilter('ignore')
check_cross_val_predict_multilabel(est, X, y, method=method)
示例29
def test_ovr_fit_predict_sparse():
for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
sp.lil_matrix]:
base_clf = MultinomialNB(alpha=1)
X, Y = datasets.make_multilabel_classification(n_samples=100,
n_features=20,
n_classes=5,
n_labels=3,
length=50,
allow_unlabeled=True,
random_state=0)
X_train, Y_train = X[:80], Y[:80]
X_test = X[80:]
clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
Y_pred_sprs = clf_sprs.predict(X_test)
assert clf.multilabel_
assert sp.issparse(Y_pred_sprs)
assert_array_equal(Y_pred_sprs.toarray(), Y_pred)
# Test predict_proba
Y_proba = clf_sprs.predict_proba(X_test)
# predict assigns a label if the probability that the
# sample has the label is greater than 0.5.
pred = Y_proba > .5
assert_array_equal(pred, Y_pred_sprs.toarray())
# Test decision_function
clf = svm.SVC(gamma="scale")
clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
示例30
def test_ridge_classifier_no_support_multilabel():
X, y = make_multilabel_classification(n_samples=10, random_state=0)
assert_raises(ValueError, RidgeClassifier().fit, X, y)