Python源码示例:sklearn.datasets.load_digits()
示例1
def split_train_test(n_classes):
    """Split the digits data into a partially labeled train set and a test set.

    Only the first ``n_labeled`` training labels are kept; the remaining
    training labels are replaced with ``None``.  The random split is redrawn
    until those leading labels contain ``n_classes`` distinct values.

    Args:
        n_classes: Number of digit classes to load; forwarded to
            ``load_digits(n_class=...)``.

    Returns:
        Tuple ``(trn_ds, tst_ds, digits)``: the partially labeled training
        ``Dataset``, the test ``Dataset`` and the object returned by
        ``load_digits``.

    Raises:
        ValueError: If ``n_classes`` is larger than the number of labeled
            samples, in which case the labeled prefix can never contain
            every class.
    """
    n_labeled = 5
    # Without this guard the resampling loop below would spin forever:
    # n_labeled samples can hold at most n_labeled distinct classes.
    if n_classes > n_labeled:
        raise ValueError(
            "n_classes ({}) must not exceed the number of labeled "
            "samples ({})".format(n_classes, n_labeled))

    from sklearn.datasets import load_digits

    digits = load_digits(n_class=n_classes)  # consider binary case
    X = digits.data
    y = digits.target
    print(np.shape(X))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # Redraw until every class shows up among the labeled samples.
    while len(np.unique(y_train[:n_labeled])) < n_classes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33)

    n_unlabeled = len(y_train) - n_labeled
    partial_labels = np.concatenate(
        [y_train[:n_labeled], [None] * n_unlabeled])
    trn_ds = Dataset(X_train, partial_labels)
    tst_ds = Dataset(X_test, y_test)

    return trn_ds, tst_ds, digits
示例2
def test_pca_score_with_different_solvers(self):
    """Fit one PCA per solver in ``self.solver_list`` and compare scores."""
    X_digits = mt.tensor(datasets.load_digits().data)

    estimators = {}
    for solver_name in self.solver_list:
        estimators[solver_name] = PCA(n_components=30,
                                      svd_solver=solver_name,
                                      random_state=0)

    for estimator in estimators.values():
        estimator.fit(X_digits)
        # Sanity check for the noise_variance_. For more details see
        # https://github.com/scikit-learn/scikit-learn/issues/7568
        # https://github.com/scikit-learn/scikit-learn/issues/8541
        # https://github.com/scikit-learn/scikit-learn/issues/8544
        variance_gap = (estimator.explained_variance_
                        - estimator.noise_variance_)
        assert mt.all(variance_gap >= 0).to_numpy()

    # Compare scores with different svd_solvers
    scores = {}
    for solver_name, estimator in estimators.items():
        scores[solver_name] = estimator.score(X_digits).to_numpy()
    assert_almost_equal(scores['full'], scores['randomized'], decimal=3)
示例3
def get_mnist_data():
    """Load the digits data set with binarized labels.

    Despite the function name, the data is obtained from ``load_digits``.

    Returns
    -------
    X : array-like
        The ``data`` attribute of the loaded digits set.
    y : array-like
        The ``target`` labels after ``LabelBinarizer().fit_transform``.
    """
    dataset = load_digits()
    features = dataset.data
    binarized_labels = LabelBinarizer().fit_transform(dataset.target)
    return features, binarized_labels
示例4
def _get_mnist_data(seed=None):
    """Return shuffled, flattened digit images split into train and test.

    Args:
        seed: Optional seed for the ``RandomState`` used to shuffle the rows.

    Returns:
        Tuple ``(X_train, X_test)``; the first 100 shuffled rows form the
        test part and the remaining rows the training part.
    """
    images = load_digits()["images"]
    # RandomState(seed=None) is the unseeded default, so a single call
    # covers both the seeded and the unseeded case.
    rng = np.random.RandomState(seed=seed)

    n_images, n_rows, n_cols = images.shape
    flat = np.ascontiguousarray(images.reshape((n_images, n_rows * n_cols)))
    rng.shuffle(flat)

    return flat[100:], flat[:100]
示例5
def digits_reduced():
    """Reduce every digit image to three summary features.

    Returns:
        Tuple ``(X, y)``.  ``X`` has one row per sample holding the results
        of ``simetria_hor`` and ``simetria_ver`` on the 8x8 image followed
        by the mean of the image values; ``y`` is the ``target`` entry of
        the loaded data.
    """
    data = load_digits()
    y = data['target']
    flat = data['data']
    nn = flat.shape[0]
    images = flat.reshape([nn, 8, 8])

    X = np.empty([nn, 3])
    # enumerate replaces the Python-2-only ``xrange`` index loop, so the
    # function runs on Python 2 and Python 3 alike.
    for i, image in enumerate(images):
        X[i, 0] = simetria_hor(image)
        X[i, 1] = simetria_ver(image)
        X[i, 2] = np.mean(image)
    return X, y
### ARFF dataframes ###
示例6
def digits_reduced():
    """Build a three-feature representation of the digits data.

    Returns:
        Tuple ``(X, y)``.  Each row of ``X`` contains the values of
        ``simetria_hor`` and ``simetria_ver`` for the 8x8 image and the
        mean of the image values; ``y`` is the ``target`` entry of the
        loaded data.
    """
    data = load_digits()
    y = data['target']
    pixels = data['data']
    nn = pixels.shape[0]
    images = pixels.reshape([nn, 8, 8])

    X = np.empty([nn, 3])
    # ``xrange`` only exists on Python 2; iterating with enumerate works on
    # both major versions.
    for i, image in enumerate(images):
        X[i, 0] = simetria_hor(image)
        X[i, 1] = simetria_ver(image)
        X[i, 2] = np.mean(image)
    return X, y
### ARFF dataframes ###
示例7
def test_pca_default_int_randomised(self):
    """Convert a randomized-solver PCA fitted on digits and dump it."""
    digits = load_digits()
    X_train, X_test, *_ = train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=42)

    estimator = PCA(random_state=42, svd_solver='randomized',
                    iterated_power=3)
    model = estimator.fit(X_train)

    # The converter is told to expect int64 inputs of the test width.
    input_spec = [("input", Int64TensorType([None, X_test.shape[1]]))]
    model_onnx = convert_sklearn(model, initial_types=input_spec)
    self.assertTrue(model_onnx is not None)

    int_inputs = X_test.astype(np.int64)
    dump_data_and_model(
        int_inputs,
        model,
        model_onnx,
        basename="SklearnPCADefaultIntRandomised",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
示例8
def test_dummy_identity(self):
    """Convert a pipeline of two identity steps and count Identity nodes."""
    digits = datasets.load_digits(n_class=6)
    Xd, yd = digits.data[:20], digits.target[:20]
    _, n_features = Xd.shape

    pipeline = make_pipeline(IdentityTransformer(), identity())
    pipeline.fit(Xd, yd)

    # Both custom steps are registered with the same dummy shape
    # calculator and converter.
    registrations = (
        (IdentityTransformer, "IdentityTransformer"),
        (identity, "identity"),
    )
    for operator, alias in registrations:
        update_registered_converter(operator, alias,
                                    dummy_shape_calculator, dummy_converter)

    input_spec = [("input", FloatTensorType([None, n_features]))]
    model_onnx = convert_sklearn(pipeline, "idtr", input_spec,
                                 target_opset=TARGET_OPSET)

    identity_nodes = []
    for node in model_onnx.graph.node:
        if node.op_type == "Identity":
            identity_nodes.append(node)
    assert len(identity_nodes) == 2
示例9
def test_kmeans_clustering_int(self):
    """Convert a 4-cluster KMeans fitted on digits and dump int64 inputs."""
    X = load_digits().data
    model = KMeans(n_clusters=4)
    model.fit(X)

    n_features = X.shape[1]
    input_spec = [("input", Int64TensorType([None, n_features]))]
    model_onnx = convert_sklearn(model, "kmeans", input_spec,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    # Only rows 40..59 of the int64-cast data are dumped.
    sample = X.astype(numpy.int64)[40:60]
    dump_data_and_model(
        sample,
        model,
        model_onnx,
        basename="SklearnKMeansInt-Dec4",
        # Operator gemm is not implemented in onnxruntime
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__) "
                      "<= StrictVersion('0.2.1')",
    )
示例10
def test_batchkmeans_clustering_int(self):
    """Convert a 4-cluster MiniBatchKMeans fitted on digits and dump it."""
    X = load_digits().data
    model = MiniBatchKMeans(n_clusters=4)
    model.fit(X)

    n_features = X.shape[1]
    input_spec = [("input", Int64TensorType([None, n_features]))]
    model_onnx = convert_sklearn(model, "kmeans", input_spec,
                                 target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)

    # Only rows 40..59 of the int64-cast data are dumped.
    sample = X.astype(numpy.int64)[40:60]
    dump_data_and_model(
        sample,
        model,
        model_onnx,
        basename="SklearnBatchKMeansInt-Dec4",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__) "
                      "<= StrictVersion('0.2.1')",
    )
示例11
def test_model_calibrated_classifier_cv_int(self):
    """Convert a sigmoid-calibrated MultinomialNB fitted on digits."""
    digits = load_digits()
    X = digits.data
    y = digits.target

    base_classifier = MultinomialNB().fit(X, y)
    calibrated = CalibratedClassifierCV(base_classifier, cv=2,
                                        method="sigmoid")
    model = calibrated.fit(X, y)

    input_spec = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "scikit-learn CalibratedClassifierCVMNB",
        input_spec,
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)

    int_inputs = X.astype(np.int64)
    dump_data_and_model(
        int_inputs,
        model,
        model_onnx,
        basename="SklearnCalibratedClassifierCVInt-Dec4",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
示例12
def test_feature_union_transformer_weights_1(self):
    """Convert a weighted PCA/TruncatedSVD FeatureUnion on int64 digits."""
    digits = load_digits()
    X = digits.data.astype(np.int64)
    y = digits.target
    X_train, X_test, *_ = train_test_split(X, y, test_size=0.5,
                                           random_state=42)

    steps = [('pca', PCA()), ('svd', TruncatedSVD())]
    weights = {'pca': 10, 'svd': 3}
    union = FeatureUnion(steps, transformer_weights=weights)
    model = union.fit(X_train)

    input_spec = [('input', Int64TensorType([None, X_test.shape[1]]))]
    model_onnx = convert_sklearn(model, 'feature union', input_spec)
    self.assertTrue(model_onnx is not None)

    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnFeatureUnionTransformerWeights1-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
示例13
def test_feature_union_transformer_weights_2(self):
    """Convert a PCA/TruncatedSVD FeatureUnion fitted on float32 digits.

    The weight keys ('pca1', 'svd2') differ from the step names
    ('pca', 'svd') exactly as written here.
    """
    digits = load_digits()
    X = digits.data.astype(np.float32)
    y = digits.target
    X_train, X_test, *_ = train_test_split(X, y, test_size=0.5,
                                           random_state=42)

    steps = [('pca', PCA()), ('svd', TruncatedSVD())]
    weights = {'pca1': 10, 'svd2': 3}
    union = FeatureUnion(steps, transformer_weights=weights)
    model = union.fit(X_train)

    input_spec = [('input', FloatTensorType([None, X_test.shape[1]]))]
    model_onnx = convert_sklearn(model, 'feature union', input_spec)
    self.assertTrue(model_onnx is not None)

    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnFeatureUnionTransformerWeights2-Dec4",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
示例14
def setup_method(self):
    """Fit LinearSVC through the ModelFrame and directly; store fixtures."""
    import sklearn.svm as svm

    digits = datasets.load_digits()
    self.data, self.target = digits.data, digits.target

    # Estimator obtained through the ModelFrame accessor and fitted on it.
    self.df = pdml.ModelFrame(digits)
    frame_estimator = self.df.svm.LinearSVC(C=1.0,
                                            random_state=self.random_state)
    self.df.fit(frame_estimator)

    # Plain scikit-learn estimator with the same settings, used to produce
    # the stored predictions and decision values.
    reference = svm.LinearSVC(C=1.0, random_state=self.random_state)
    reference.fit(self.data, self.target)
    self.pred = reference.predict(self.data)
    self.decision = reference.decision_function(self.data)

    # argument for classification reports
    self.labels = np.arange(9, -1, -1)
示例15
def test_train_test_split(self):
    """Check columns, sizes and target name after train_test_split."""

    def check_columns(frame, parts):
        # Every part must carry the same columns as the source frame.
        for part in parts:
            tm.assert_index_equal(frame.columns, part.columns)

    df = pdml.ModelFrame(datasets.load_digits())
    self.assertIsInstance(df, pdml.ModelFrame)

    train_df, test_df = df.model_selection.train_test_split()
    check_columns(df, (train_df, test_df))

    self.assertEqual(len(df), len(train_df) + len(test_df))
    n_columns = df.shape[1]
    self.assertEqual(n_columns, train_df.shape[1])
    self.assertEqual(n_columns, test_df.shape[1])
    check_columns(df, (train_df, test_df))

    # Repeat with a renamed target; the name must be kept by both parts.
    df = pdml.ModelFrame(datasets.load_digits())
    df.target_name = 'xxx'
    train_df, test_df = df.model_selection.train_test_split()
    check_columns(df, (train_df, test_df))
    self.assertEqual(train_df.target_name, 'xxx')
    self.assertEqual(test_df.target_name, 'xxx')
示例16
def test_validation_curve(self):
    """Compare the ModelFrame validation_curve with ``ms.validation_curve``."""
    digits = datasets.load_digits()
    df = pdml.ModelFrame(digits)
    gammas = np.logspace(-2, -1, 2)

    frame_svc = df.svm.SVC(random_state=self.random_state)
    result = df.model_selection.validation_curve(frame_svc, 'gamma', gammas)

    reference_svc = svm.SVC(random_state=self.random_state)
    expected = ms.validation_curve(reference_svc,
                                   digits.data, digits.target,
                                   'gamma', gammas)

    self.assertEqual(len(result), 2)
    for position in (0, 1):
        self.assert_numpy_array_almost_equal(result[position],
                                             expected[position])
示例17
def main():
    """Train a multilayer perceptron on the digits data and plot the result.

    Prints the accuracy on the held-out split and plots the test samples
    with their predicted classes.
    """
    data = datasets.load_digits()
    X = normalize(data.data)

    # Legend labels must come from the original integer targets.  ``y`` is
    # replaced by its encoded form below, so ``np.unique(y)`` would yield the
    # values of the encoded matrix instead of the digit classes.
    class_labels = np.unique(data.target)

    # Convert the nominal y values to binary
    y = to_categorical(data.target)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, seed=1)

    # MLP
    clf = MultilayerPerceptron(n_hidden=16,
                               n_iterations=1000,
                               learning_rate=0.01)
    clf.fit(X_train, y_train)

    # Map encoded rows back to class indices before scoring.
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, y_pred, title="Multilayer Perceptron",
                      accuracy=accuracy, legend_labels=class_labels)
示例18
def main():
    """Train a random forest on the digits data, print accuracy and plot."""
    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, test_size=0.4, seed=2)

    forest = RandomForest(n_estimators=100)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test, predictions, title="Random Forest",
                      accuracy=accuracy, legend_labels=digits.target_names)
示例19
def main():
    """Train naive Bayes on normalized digits, print accuracy and plot."""
    digits = datasets.load_digits()
    features = normalize(digits.data)
    X_train, X_test, y_train, y_test = train_test_split(
        features, digits.target, test_size=0.4)

    classifier = NaiveBayes()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, predictions, title="Naive Bayes",
                      accuracy=accuracy, legend_labels=digits.target_names)
示例20
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators. The ``'type'`` entry selects
            the dataset; for ``'multiclass'`` the remaining entries are forwarded to
            ``datasets.make_classification``.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.

    Raises:
        exceptions.UserError: If the dataset type is unknown, or if building the
            ``'multiclass'`` dataset fails.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')

    # The splitters are built without ``random_state``: shuffling is left at
    # its default (off), where a seed has no effect on the folds, and newer
    # scikit-learn versions raise ValueError when a seed is given without
    # ``shuffle=True``.
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
示例21
def extract_main_dataset():
    """Return the ``(X, y)`` pair from ``load_digits(return_X_y=True)``."""
    features, labels = load_digits(return_X_y=True)
    return features, labels
示例22
def test_correct_dataset(self):
    """Check the shapes reported by ``functions.verify_dataset`` for digits."""
    features, labels = load_digits(return_X_y=True)
    report = functions.verify_dataset(features, labels)

    expected_shapes = (
        ('features_shape', (1797, 64)),
        ('labels_shape', (1797,)),
    )
    for key, shape in expected_shapes:
        assert report[key] == shape
示例23
def setUp(self):
    """Prepare normalized digit splits, label tensors and kernel lists."""
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data, digits.target, shuffle=True, train_size=.15)

    # Copies of the raw splits are stored before normalization is applied.
    self.Xtr_numpy = X_train.copy()
    self.Xte_numpy = X_test.copy()
    self.Xtr = preprocessing.normalization(X_train)
    self.Xte = preprocessing.normalization(X_test)

    self.Ytr = torch.Tensor(y_train)
    self.Yte = torch.Tensor(y_test)

    # One homogeneous polynomial kernel per degree 1..10.
    degrees = range(1, 11)
    self.KLtr = [
        pairwise_mk.homogeneous_polynomial_kernel(self.Xtr, degree=degree)
        for degree in degrees
    ]
    self.KLte = [
        pairwise_mk.homogeneous_polynomial_kernel(self.Xte, self.Xtr,
                                                  degree=degree)
        for degree in degrees
    ]
示例24
def test_scikit_learn(df):
    """Fit an SVC on all digits but the last and predict the last one."""
    sklearn = import_module('sklearn')  # noqa
    from sklearn import svm, datasets

    digits = datasets.load_digits()
    train_features = digits.data[:-1]
    train_labels = digits.target[:-1]
    held_out = digits.data[-1:]

    classifier = svm.SVC(gamma=0.001, C=100.)
    classifier.fit(train_features, train_labels)
    classifier.predict(held_out)
# Cython import warning and traitlets
示例25
def digits_dataload():
    """Return the digits data divided by 16 together with its targets."""
    from sklearn import datasets

    bunch = datasets.load_digits()
    scaled = bunch.data / 16.
    return scaled, bunch.target
示例26
def test_scikit_learn(df):
    """Train an SVC on every digit sample except the last, then predict it."""
    sklearn = import_module('sklearn')  # noqa
    from sklearn import svm, datasets

    digits = datasets.load_digits()
    fit_data, fit_target = digits.data[:-1], digits.target[:-1]
    last_sample = digits.data[-1:]

    model = svm.SVC(gamma=0.001, C=100.)
    model.fit(fit_data, fit_target)
    model.predict(last_sample)
示例27
def test_unsorted_indices():
    """Check SVC results agree for sorted and unsorted CSR indices."""
    # test that the result with sorted and unsorted indices in csr is the same
    # we use a subset of digits as iris, blobs or make_classification didn't
    # show the problem
    digits = load_digits()
    X, y = digits.data[:50], digits.target[:50]
    X_test = sparse.csr_matrix(digits.data[50:100])
    X_sparse = sparse.csr_matrix(X)

    def make_svc():
        # Every classifier in this test uses the same configuration.
        return svm.SVC(kernel='linear', probability=True, random_state=0)

    coef_dense = make_svc().fit(X, y).coef_
    sparse_svc = make_svc().fit(X_sparse, y)
    coef_sorted = sparse_svc.coef_
    # make sure dense and sparse SVM give the same result
    assert_array_almost_equal(coef_dense, coef_sorted.toarray())

    # reverse each row's indices
    def scramble_indices(X):
        reversed_data = []
        reversed_indices = []
        # Consecutive indptr entries delimit one row of the CSR matrix.
        for start, stop in zip(X.indptr[:-1], X.indptr[1:]):
            reversed_data.extend(X.data[start:stop][::-1])
            reversed_indices.extend(X.indices[start:stop][::-1])
        return sparse.csr_matrix(
            (reversed_data, reversed_indices, X.indptr), shape=X.shape)

    X_sparse_unsorted = scramble_indices(X_sparse)
    X_test_unsorted = scramble_indices(X_test)

    assert not X_sparse_unsorted.has_sorted_indices
    assert not X_test_unsorted.has_sorted_indices

    unsorted_svc = make_svc().fit(X_sparse_unsorted, y)
    coef_unsorted = unsorted_svc.coef_
    # make sure unsorted indices give same result
    assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray())
    assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted),
                              sparse_svc.predict_proba(X_test))
示例28
def test_check_accuracy_on_digits():
    """Check mean 10-fold accuracy of the NB models against fixed floors."""
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    is_3_or_8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[is_3_or_8], y[is_3_or_8]

    def mean_cv_score(estimator, features, labels):
        # Mean score over 10 cross-validation folds.
        return cross_val_score(estimator, features, labels, cv=10).mean()

    # Multinomial NB
    assert_greater(mean_cv_score(MultinomialNB(alpha=10), X, y), 0.86)
    assert_greater(mean_cv_score(MultinomialNB(alpha=10), X_3v8, y_3v8),
                   0.94)

    # Bernoulli NB
    assert_greater(mean_cv_score(BernoulliNB(alpha=10), X > 4, y), 0.83)
    assert_greater(mean_cv_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8),
                   0.92)

    # Gaussian NB
    assert_greater(mean_cv_score(GaussianNB(), X, y), 0.77)
    assert_greater(mean_cv_score(GaussianNB(var_smoothing=0.1), X, y), 0.89)
    assert_greater(mean_cv_score(GaussianNB(), X_3v8, y_3v8), 0.86)
示例29
def test_load_digits():
    """Check the shape and class count of the default digits data."""
    digits = load_digits()
    data_shape = digits.data.shape
    n_distinct_targets = numpy.unique(digits.target).size

    assert_equal(data_shape, (1797, 64))
    assert_equal(n_distinct_targets, 10)

    # test return_X_y option
    check_return_X_y(digits, partial(load_digits))
示例30
def test_load_digits_n_class_lt_10():
    """Check the shape and class count when only nine classes are loaded."""
    # ``n_class`` is passed by keyword: newer scikit-learn releases accept
    # it only as a keyword argument, and older ones accept both forms.
    digits = load_digits(n_class=9)
    assert_equal(digits.data.shape, (1617, 64))
    assert_equal(numpy.unique(digits.target).size, 9)