Python source code examples: sklearn.datasets.make_blobs()
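The examples below are collected from various open-source test suites. For orientation, here is a minimal, self-contained sketch of the make_blobs API itself (parameter names and defaults follow the scikit-learn documentation; the shapes shown assume the default two features):

import numpy as np
from sklearn.datasets import make_blobs

# Draw 100 points around 3 Gaussian centers; random_state makes the
# draw reproducible.
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, shuffle=True, random_state=0)

print(X.shape)       # (100, 2) -- feature matrix
print(np.unique(y))  # [0 1 2]  -- one integer cluster label per sample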
Example 1
def test_js_divergence():
    n_samples = 1000
    blobs, _ = datasets.make_blobs(n_samples=n_samples, random_state=8)
    one_component_a = ml.cluster.GaussianMixture(1)
    one_component_b = ml.cluster.GaussianMixture(1)
    two_component = ml.cluster.GaussianMixture(2)
    one_component_a.fit(blobs)
    one_component_b.fit(blobs)
    two_component.fit(blobs)

    confidence_2v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, two_component)
    confidence_1v1 = ml.confidence.jensen_shannon_divergence(
        one_component_a, one_component_b)

    assert confidence_2v1 > confidence_1v1
Example 2
def produce_XOR(sampleSize):
    import sklearn.datasets as dt

    # centers of the blobs
    centers = [(0, 0), (3, 0), (3, 3), (0, 3)]
    # create the sample
    x, y = dt.make_blobs(n_samples=sampleSize, n_features=2,
                         cluster_std=0.8, centers=centers, shuffle=False)
    # and make it XOR like
    y[y == 2] = 0
    y[y == 3] = 1
    return x, y
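A minimal way to exercise produce_XOR (the sample size of 400 is an arbitrary choice for illustration): after the relabeling step only classes 0 and 1 remain, arranged in the XOR pattern of the four blob quadrants.

import numpy as np

x, y = produce_XOR(400)
print(x.shape)       # (400, 2)
print(np.unique(y))  # [0 1] -- four blobs collapsed into two XOR classes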
Example 3
def test_svc():
    """Check that sparse SVC gives the same result as SVC"""
    # many class dataset:
    X_blobs, y_blobs = make_blobs(n_samples=100, centers=10, random_state=0)
    X_blobs = sparse.csr_matrix(X_blobs)

    datasets = [[X_sp, Y, T], [X2_sp, Y2, T2],
                [X_blobs[:80], y_blobs[:80], X_blobs[80:]],
                [iris.data, iris.target, iris.data]]
    kernels = ["linear", "poly", "rbf", "sigmoid"]
    for dataset in datasets:
        for kernel in kernels:
            clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                          random_state=0, decision_function_shape='ovo')
            sp_clf = svm.SVC(gamma=1, kernel=kernel, probability=True,
                             random_state=0, decision_function_shape='ovo')
            check_svm_model_equal(clf, sp_clf, *dataset)
Example 4
def test_grid_search_no_score():
    # Test grid-search on classifier that has no score function.
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]
    clf_no_score = LinearSVCNoScore(random_state=0)

    grid_search = GridSearchCV(clf, {'C': Cs}, scoring='accuracy')
    grid_search.fit(X, y)

    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs},
                                        scoring='accuracy')
    # smoketest grid search
    grid_search_no_score.fit(X, y)

    # check that best params are equal
    assert_equal(grid_search_no_score.best_params_, grid_search.best_params_)
    # check that we can call score and that it gives the correct result
    assert_equal(grid_search.score(X, y), grid_search_no_score.score(X, y))

    # giving no scoring function raises an error
    grid_search_no_score = GridSearchCV(clf_no_score, {'C': Cs})
    assert_raise_message(TypeError, "no scoring", grid_search_no_score.fit,
                         [[1]])
Example 5
def test_unsupervised_grid_search():
    # test grid-search with unsupervised estimator
    X, y = make_blobs(random_state=0)
    km = KMeans(random_state=0)

    # Multi-metric evaluation unsupervised
    scoring = ['adjusted_rand_score', 'fowlkes_mallows_score']
    for refit in ['adjusted_rand_score', 'fowlkes_mallows_score']:
        grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                                   scoring=scoring, refit=refit)
        grid_search.fit(X, y)
        # Both ARI and FMS can find the right number :)
        assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Single metric evaluation unsupervised
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]),
                               scoring='fowlkes_mallows_score')
    grid_search.fit(X, y)
    assert_equal(grid_search.best_params_["n_clusters"], 3)

    # Now without a score, and without y
    grid_search = GridSearchCV(km, param_grid=dict(n_clusters=[2, 3, 4]))
    grid_search.fit(X)
    assert_equal(grid_search.best_params_["n_clusters"], 4)
Example 6
def test_deprecated_grid_search_iid():
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y)
Example 7
def test_lda_coefs():
    # Test if the coefficients of the solvers are approximately the same.
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_lsqr = LinearDiscriminantAnalysis(solver="lsqr")
    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")

    clf_lda_svd.fit(X, y)
    clf_lda_lsqr.fit(X, y)
    clf_lda_eigen.fit(X, y)

    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_lsqr.coef_, 1)
    assert_array_almost_equal(clf_lda_svd.coef_, clf_lda_eigen.coef_, 1)
    assert_array_almost_equal(clf_lda_eigen.coef_, clf_lda_lsqr.coef_, 1)
Example 8
def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
Example 9
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X)
Example 10
def test_branching_factor():
    # Test that nodes have at max branching_factor number of subclusters
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X)
Example 11
def test_make_blobs_error():
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    wrong_centers_msg = ("Length of `n_samples` not consistent "
                         "with number of centers. Got n_samples = {} "
                         "and centers = {}".format(n_samples, centers[:-1]))
    assert_raise_message(ValueError, wrong_centers_msg,
                         make_blobs, n_samples, centers=centers[:-1])

    wrong_std_msg = ("Length of `clusters_std` not consistent with "
                     "number of centers. Got centers = {} "
                     "and cluster_std = {}".format(centers, cluster_stds[:-1]))
    assert_raise_message(ValueError, wrong_std_msg,
                         make_blobs, n_samples,
                         centers=centers, cluster_std=cluster_stds[:-1])

    wrong_type_msg = ("Parameter `centers` must be array-like. "
                      "Got {!r} instead".format(3))
    assert_raise_message(ValueError, wrong_type_msg,
                         make_blobs, n_samples, centers=3)
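For contrast with the error cases above, a consistent call passes one sample count and one standard deviation per center (a sketch assuming scikit-learn >= 0.20, where n_samples may be array-like):

import numpy as np
from sklearn.datasets import make_blobs

centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])

# Lengths of n_samples and cluster_std both match the number of
# centers, so no ValueError is raised.
X, y = make_blobs(n_samples=[20, 20, 20], centers=centers,
                  cluster_std=[0.05, 0.2, 0.4], random_state=0)
print(X.shape)  # (60, 2)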
Example 12
def test_linear_kernel(ax, cost):
    train_x, train_y = make_blobs(
        n_samples=500, centers=2, n_features=2, random_state=1
    )
    train_y[train_y == 0] = -1
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x, train_y)
    train_data = np.hstack((train_y.reshape(500, 1), train_x_scaled))
    mykernel = Kernel(kernel="linear", degree=5, coef0=1, gamma=0.5)
    mysvm = SmoSVM(
        train=train_data,
        kernel_func=mykernel,
        cost=cost,
        tolerance=0.001,
        auto_norm=False,
    )
    mysvm.fit()
    plot_partition_boundary(mysvm, train_data, ax=ax)
Example 13
def test_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="l1",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Example 14
def test_string_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    labels = np.array(["this", "that", "other"])[labels]
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="string",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
Example 15
def test_discrete_metric_supervised_umap_trustworthiness():
    data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
    embedding = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        target_metric="ordinal",
        target_weight=0.8,
        n_epochs=100,
        random_state=42,
    ).fit_transform(data, labels)
    trust = trustworthiness(data, embedding, 10)
    assert_greater_equal(
        trust,
        0.95,
        "Insufficiently trustworthy embedding for blobs dataset: {}".format(trust),
    )
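The three tests above all score the embedding with trustworthiness, which measures how well each point's nearest neighbors in the original space are preserved in the embedding (1.0 is perfect preservation). A dependency-light sketch of the metric itself, using PCA instead of UMAP and assuming the sklearn.manifold.trustworthiness signature with a keyword n_neighbors:

from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.manifold import trustworthiness

data, labels = make_blobs(50, cluster_std=0.5, random_state=42)
embedding = PCA(n_components=2).fit_transform(data)

# Tight, well-separated blobs embed almost losslessly, so the score
# should be close to 1.0.
print(trustworthiness(data, embedding, n_neighbors=10))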
Example 16
def check_estimators_partial_fit_n_features(name, estimator_orig):
    # check if number of features changes between calls to partial_fit.
    if not hasattr(estimator_orig, 'partial_fit'):
        return
    estimator = clone(estimator_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X -= X.min()

    try:
        if is_classifier(estimator):
            classes = np.unique(y)
            estimator.partial_fit(X, y, classes=classes)
        else:
            estimator.partial_fit(X, y)
    except NotImplementedError:
        return

    assert_raises(ValueError, estimator.partial_fit, X[:, :-1], y)
Example 17
def check_decision_proba_consistency(name, estimator_orig):
    # Check whether an estimator having both decision_function and
    # predict_proba methods has outputs with perfect rank correlation.

    centers = [(2, 2), (4, 4)]
    X, y = make_blobs(n_samples=100, random_state=0, n_features=4,
                      centers=centers, cluster_std=1.0, shuffle=True)
    X_test = np.random.randn(20, 2) + 4
    estimator = clone(estimator_orig)

    if (hasattr(estimator, "decision_function") and
            hasattr(estimator, "predict_proba")):
        estimator.fit(X, y)
        a = estimator.predict_proba(X_test)[:, 1]
        b = estimator.decision_function(X_test)
        assert_array_equal(rankdata(a), rankdata(b))
Example 18
def test_fit_transform(self):
    """ Tests fit_transform against scikit-learn.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)

    ds_arr = ds.array(x, block_size=(300, 2))

    sc1 = SKScaler()
    scaled_x = sc1.fit_transform(x)
    sc2 = StandardScaler()
    ds_scaled = sc2.fit_transform(ds_arr)

    self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
    self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect()))
    self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect()))
    self.assertEqual(ds_scaled._top_left_shape,
                     ds_scaled._blocks[0][0].shape)
    self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
    self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
    self.assertEqual(ds_arr.shape, ds_scaled.shape)
    self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)
Example 19
def test_irregular(self):
    """ Test with an irregular array """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)

    ds_arr = ds.array(x, block_size=(300, 2))
    ds_arr = ds_arr[297:602]
    x = x[297:602]

    sc1 = SKScaler()
    scaled_x = sc1.fit_transform(x)
    sc2 = StandardScaler()
    ds_scaled = sc2.fit_transform(ds_arr)

    self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
    self.assertTrue(np.allclose(sc1.mean_, sc2.mean_.collect()))
    self.assertTrue(np.allclose(sc1.var_, sc2.var_.collect()))
    self.assertEqual(ds_scaled._top_left_shape,
                     compss_wait_on(ds_scaled._blocks[0][0]).shape)
    self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
    self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
    self.assertEqual(ds_arr.shape, ds_scaled.shape)
    self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)
Example 20
def test_n_clusters_aniso(self):
    """ Tests that DBSCAN finds the correct number of clusters with
    anisotropically distributed data.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=1, eps=.15)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    y_pred = dbscan.fit_predict(ds_x).collect()

    true_sizes = {19, 496, 491, 488, 6}
    cluster_sizes = {y_pred[y_pred == -1].size,
                     y_pred[y_pred == 0].size,
                     y_pred[y_pred == 1].size,
                     y_pred[y_pred == 2].size,
                     y_pred[y_pred == 3].size}

    self.assertEqual(dbscan.n_clusters, 4)
    self.assertEqual(true_sizes, cluster_sizes)
Example 21
def test_n_clusters_aniso_max_samples(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    defining max_samples with anisotropically distributed data.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=1, eps=.15, max_samples=500)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    y_pred = dbscan.fit_predict(ds_x).collect()

    true_sizes = {19, 496, 491, 488, 6}
    cluster_sizes = {y_pred[y_pred == -1].size,
                     y_pred[y_pred == 0].size,
                     y_pred[y_pred == 1].size,
                     y_pred[y_pred == 2].size,
                     y_pred[y_pred == 3].size}

    self.assertEqual(dbscan.n_clusters, 4)
    self.assertEqual(true_sizes, cluster_sizes)
Example 22
def test_n_clusters_aniso_grid(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    setting n_regions > 1 with anisotropically distributed data.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=4, eps=.15, max_samples=500)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    y_pred = dbscan.fit_predict(ds_x).collect()

    true_sizes = {19, 496, 491, 488, 6}
    cluster_sizes = {y_pred[y_pred == -1].size,
                     y_pred[y_pred == 0].size,
                     y_pred[y_pred == 1].size,
                     y_pred[y_pred == 2].size,
                     y_pred[y_pred == 3].size}

    self.assertEqual(dbscan.n_clusters, 4)
    self.assertEqual(true_sizes, cluster_sizes)
Example 23
def test_n_clusters_aniso_dimensions(self):
    """ Tests that DBSCAN finds the correct number of clusters when
    dimensions is not None.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=5, dimensions=[1], eps=.15)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)
    ds_x = ds.array(x, block_size=(300, 2))

    y_pred = dbscan.fit_predict(ds_x).collect()

    true_sizes = {19, 496, 491, 488, 6}
    cluster_sizes = {y_pred[y_pred == -1].size,
                     y_pred[y_pred == 0].size,
                     y_pred[y_pred == 1].size,
                     y_pred[y_pred == 2].size,
                     y_pred[y_pred == 3].size}

    self.assertEqual(dbscan.n_clusters, 4)
    self.assertEqual(true_sizes, cluster_sizes)
Example 24
def test_sparse(self):
    """ Tests that DBSCAN produces the same results with sparse and
    dense data.
    """
    n_samples = 1500
    x, y = make_blobs(n_samples=n_samples, random_state=170)
    dbscan = DBSCAN(n_regions=1, eps=.15)

    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    x = StandardScaler().fit_transform(x)

    dense = ds.array(x, block_size=(300, 2))
    sparse = ds.array(csr_matrix(x), block_size=(300, 2))

    y_dense = dbscan.fit_predict(dense).collect()
    y_sparse = dbscan.fit_predict(sparse).collect()

    self.assertTrue(np.array_equal(y_dense, y_sparse))
Example 25
def test_fit_predict(self):
    """ Tests fit_predict."""
    x, y = make_blobs(n_samples=1500, random_state=170)
    x_filtered = np.vstack(
        (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
    x_train = ds.array(x_filtered, block_size=(300, 2))

    kmeans = KMeans(n_clusters=3, random_state=170)
    labels = kmeans.fit_predict(x_train).collect()

    skmeans = SKMeans(n_clusters=3, random_state=170)
    sklabels = skmeans.fit_predict(x_filtered)

    centers = np.array([[-8.941375656533449, -5.481371322614891],
                        [-4.524023204953875, 0.06235042593214654],
                        [2.332994701667008, 0.37681003933082696]])

    self.assertTrue(np.allclose(centers, kmeans.centers))
    self.assertTrue(np.allclose(labels, sklabels))
Example 26
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
Example 27
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
Example 28
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant wrt
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balancing class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
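The "balanced" heuristic the test relies on weights each class by n_samples / (n_classes * bincount(y)), so duplicating a class changes the weights but not the weighted class totals. A small sketch with scikit-learn's compute_class_weight utility (keyword arguments as required by recent versions):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.utils.class_weight import compute_class_weight

X, y = make_blobs(centers=2, random_state=0)   # 50 points per class
y_1 = np.hstack([y] + [y[y == 1]] * 2)         # class 1 duplicated twice

# 200 samples, 2 classes: weights = 200 / (2 * [50, 150]) = [2.0, 0.667]
print(compute_class_weight(class_weight="balanced",
                           classes=np.unique(y_1), y=y_1))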
Example 29
def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    from sklearn import pipeline, datasets
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('filter', manifold.LocallyLinearEmbedding(random_state=0)),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert_less(.9, clf.score(X, y))
Example 30
def test_pipeline():
    # check that Isomap works fine as a transformer in a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('isomap', manifold.Isomap()),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert_less(.9, clf.score(X, y))