Python源码示例:sklearn.datasets.load_diabetes()
示例1
def test_lasso_cv_with_some_model_selection():
    """Smoke-test LassoCV inside a pipeline with an explicit CV splitter."""
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    # KFold, not StratifiedKFold: stratification requires discrete class
    # labels, and the diabetes target is continuous, so StratifiedKFold
    # would raise a ValueError when asked to split it.
    from sklearn.model_selection import KFold
    from sklearn import datasets
    from sklearn.linear_model import LassoCV

    diabetes = datasets.load_diabetes()
    X = diabetes.data
    y = diabetes.target

    pipe = make_pipeline(
        StandardScaler(),
        LassoCV(cv=KFold(n_splits=5)),
    )
    pipe.fit(X, y)
示例2
def test_lasso_path(self):
    """lasso_path via the ModelFrame accessor matches sklearn's lasso_path."""
    diabetes = datasets.load_diabetes()
    df = pdml.ModelFrame(diabetes)

    result = df.linear_model.lasso_path()
    expected = lm.lasso_path(diabetes.data, diabetes.target)
    self.assertEqual(len(result), 3)
    tm.assert_numpy_array_equal(result[0], expected[0])
    self.assertIsInstance(result[1], pdml.ModelFrame)
    tm.assert_index_equal(result[1].index, df.data.columns)
    self.assert_numpy_array_almost_equal(result[1].values, expected[1])
    self.assert_numpy_array_almost_equal(result[2], expected[2])

    result = df.linear_model.lasso_path(return_models=True)
    expected = lm.lasso_path(diabetes.data, diabetes.target,
                             return_models=True)
    self.assertEqual(len(result), len(expected))
    self.assertIsInstance(result, tuple)
    # Compare against the reference output; the original compared each
    # element of `result` to itself, a tautology that tested nothing.
    tm.assert_numpy_array_equal(result[0], expected[0])
    tm.assert_numpy_array_equal(result[1], expected[1])
    tm.assert_numpy_array_equal(result[2], expected[2])
示例3
def test_LassoCV(self, criterion):
    """LassoLarsIC fitted through pdml matches a direct sklearn fit."""
    diabetes = datasets.load_diabetes()
    X = pp.normalize(diabetes.data)
    y = diabetes.target

    df = pdml.ModelFrame(diabetes)
    df.data = df.data.pp.normalize()

    mod1 = lm.LassoLarsIC(criterion=criterion)
    mod1.fit(X, y)

    mod2 = df.lm.LassoLarsIC(criterion=criterion)
    df.fit(mod2)

    # Both routes must select the same regularization strength.
    self.assertAlmostEqual(mod1.alpha_, mod2.alpha_)

    expected = mod1.predict(X)
    predicted = df.predict(mod2)
    self.assertIsInstance(predicted, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(predicted.values, expected)
示例4
def test_MixedLM(self):
    """StatsModelsRegressor wraps MixedLM; fit returns a results wrapper."""
    import statsmodels.regression.mixed_linear_model as mlm

    diabetes = datasets.load_diabetes()
    data, target = diabetes.data[:100, :], diabetes.target[:100]
    # Two equal-sized groups for the mixed-effects structure.
    groups = np.array([0] * 50 + [1] * 50)

    for name in ['MixedLM']:
        klass = getattr(sm, name)
        estimator = base.StatsModelsRegressor(klass, groups=groups)
        fitted = estimator.fit(data, target)
        # predict() currently raises NotImplementedError for MixedLM:
        # result = estimator.predict(diabetes.data)
        self.assertIsInstance(fitted, mlm.MixedLMResultsWrapper)
        # expected = klass(target, data, groups=groups).fit().predict(diabetes.data)
        # self.assert_numpy_array_almost_equal(result, expected)
示例5
def test_pipeline(self):
    """StatsModelsRegressor works as the final step of a sklearn Pipeline."""
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline

    diabetes = datasets.load_diabetes()
    for name in ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']:
        klass = getattr(sm, name)
        estimator = Pipeline([
            ('selector', SelectKBest(f_regression, k=5)),
            ('reg', base.StatsModelsRegressor(klass)),
        ])
        estimator.fit(diabetes.data, diabetes.target)
        result = estimator.predict(diabetes.data)

        # Reference: run the same feature selection and model fit by hand.
        data = SelectKBest(f_regression, k=5).fit_transform(
            diabetes.data, diabetes.target)
        expected = klass(diabetes.target, data).fit().predict(data)
        self.assert_numpy_array_almost_equal(result, expected)
示例6
def _timeseries_generated_data(self):
    """Build a diabetes-based time-series frame and split it train/test."""
    # Load diabetes data into a DataFrame with stringified column names.
    x, y = datasets.load_diabetes(return_X_y=True)
    nrows, ncols = x.shape
    X = pd.DataFrame(x, columns=[str(i) for i in range(ncols)])

    # Attach an arbitrary monthly date index as the time axis; the
    # randomized column name avoids collisions with real features.
    time_column_name = "Date" + str(uuid.uuid4())
    X[time_column_name] = pd.date_range('1980-01-01', periods=nrows,
                                        freq='MS')
    X.set_index([time_column_name], inplace=True)

    # Hold out the final 20% of rows as the test set.
    test_frac = 0.2
    cutoff_index = int(np.floor((1.0 - test_frac) * nrows))
    X_train, X_test = X.iloc[:cutoff_index], X.iloc[cutoff_index:]
    y_train, y_test = y[:cutoff_index], y[cutoff_index:]
    return X_train, X_test, y_train, y_test, time_column_name
示例7
def main():
    """Fit a one-feature linear regression on diabetes and print metrics."""
    diabetes = datasets.load_diabetes()

    # Use only the third feature, keeping a 2-D column shape for sklearn.
    diabetes_X = diabetes.data[:, np.newaxis, 2]

    # The last 20 samples form the test split.
    diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
    diabetes_y_train = diabetes.target[:-20]
    diabetes_y_test = diabetes.target[-20:]

    regr = linear_model.LinearRegression()
    regr.fit(diabetes_X_train, diabetes_y_train)

    print('Coefficients: \n', regr.coef_)
    mse = np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2)
    print("Mean squared error: %.2f" % mse)
    print('Variance score: %.2f' % regr.score(diabetes_X_test,
                                              diabetes_y_test))
示例8
def test_svr():
    """Smoke-test several support vector regression variants on diabetes."""
    diabetes = datasets.load_diabetes()
    regressors = (
        svm.NuSVR(kernel='linear', nu=.4, C=1.0),
        svm.NuSVR(kernel='linear', nu=.4, C=10.),
        svm.SVR(kernel='linear', C=10.),
        svm.LinearSVR(C=10.),
        svm.LinearSVR(C=10.),
    )
    for clf in regressors:
        clf.fit(diabetes.data, diabetes.target)
        assert_greater(clf.score(diabetes.data, diabetes.target), 0.02)

    # non-regression test; previously, BaseLibSVM would check that
    # len(np.unique(y)) < 2, which must only be done for SVC
    constant_target = np.ones(len(diabetes.data))
    svm.SVR().fit(diabetes.data, constant_target)
    svm.LinearSVR().fit(diabetes.data, constant_target)
示例9
def test_bayesian_on_diabetes():
    """BayesianRidge on diabetes; currently skipped (expected failure)."""
    # Everything below is intentionally unreachable: the test is marked
    # as expected-failing by skipping before any work happens.
    raise SkipTest("XFailed Test")
    diabetes = datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target
    clf = BayesianRidge(compute_score=True)

    # More samples than features: scores must increase each iteration.
    clf.fit(X, y)
    assert_array_equal(np.diff(clf.scores_) > 0, True)

    # More features than samples: same monotonicity check.
    X, y = X[:5, :], y[:5]
    clf.fit(X, y)
    assert_array_equal(np.diff(clf.scores_) > 0, True)
示例10
def test_xgb_regressor(self):
    """Convert a fitted XGBRegressor to ONNX and dump the round-trip model."""
    # Renamed from `iris`: the loaded data is the diabetes regression
    # dataset, and the old name was actively misleading.
    diabetes = load_diabetes()
    x = diabetes.data
    y = diabetes.target
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
                                                   random_state=42)
    xgb = XGBRegressor()
    xgb.fit(x_train, y_train)
    conv_model = convert_xgboost(
        xgb, initial_types=[('input',
                             FloatTensorType(shape=['None', 'None']))])
    self.assertTrue(conv_model is not None)
    dump_data_and_model(
        x_test.astype("float32"),
        xgb,
        conv_model,
        basename="SklearnXGBRegressor-Dec3",
        allow_failure="StrictVersion("
                      "onnx.__version__)"
                      "< StrictVersion('1.3.0')",
    )
示例11
def test_h2o_regressor(self):
    """H2O GBM MOJOs convert to ONNX across several loss distributions."""
    diabetes = load_diabetes()
    train, test = _train_test_split_as_frames(diabetes.data,
                                              diabetes.target)
    for dist in ["auto", "gaussian", "huber", "laplace", "quantile"]:
        gbm = H2OGradientBoostingEstimator(ntrees=7, max_depth=5,
                                           distribution=dist)
        mojo_path = _make_mojo(gbm, train)
        onnx_model = _convert_mojo(mojo_path)
        self.assertIsNot(onnx_model, None)
        dump_data_and_model(
            test,
            H2OMojoWrapper(mojo_path),
            onnx_model,
            basename="H2OReg-Dec4",
            allow_failure="StrictVersion("
                          "onnx.__version__)"
                          "< StrictVersion('1.3.0')",
        )
示例12
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties
            of the dataset used to verify the estimator and metric generators.
            Must contain a 'type' key; for the 'multiclass' type the remaining
            keys are forwarded to ``datasets.make_classification``.

    Returns:
        X (array-like): Features array
        y (array-like): Labels array
        splits (iterator): This is an iterator that returns train test splits
            for cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')

    # The original body repeated the loader + splitter pair once per
    # dataset; table-driven dispatch keeps behavior identical with far
    # less duplication.
    classification_loaders = {
        'iris': datasets.load_iris,
        'mnist': datasets.load_digits,
        'breast_cancer': datasets.load_breast_cancer,
    }
    regression_loaders = {
        'boston': datasets.load_boston,
        'diabetes': datasets.load_diabetes,
    }

    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(
                n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type in classification_loaders:
        X, y = classification_loaders[data_type](return_X_y=True)
        splits = model_selection.StratifiedKFold(
            n_splits=2, random_state=8).split(X, y)
    elif data_type in regression_loaders:
        # Regression targets cannot be stratified; use a plain KFold.
        X, y = regression_loaders[data_type](return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError(
            'Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
示例13
def test_regression_scorers():
    """The 'r2' scorer agrees with r2_score on a fitted Ridge model."""
    diabetes = load_diabetes()
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=0)

    clf = Ridge()
    clf.fit(X_train, y_train)

    via_scorer = get_scorer('r2')(clf, X_test, y_test)
    direct = r2_score(y_test, clf.predict(X_test))
    assert_almost_equal(via_scorer, direct)
示例14
def test_svr():
    """Smoke-test several support vector regression variants on diabetes."""
    diabetes = datasets.load_diabetes()
    for regressor in (svm.NuSVR(kernel='linear', nu=.4, C=1.0),
                      svm.NuSVR(kernel='linear', nu=.4, C=10.),
                      svm.SVR(kernel='linear', C=10.),
                      svm.LinearSVR(C=10.),
                      svm.LinearSVR(C=10.)):
        regressor.fit(diabetes.data, diabetes.target)
        assert_greater(regressor.score(diabetes.data, diabetes.target), 0.02)

    # non-regression test; previously, BaseLibSVM would check that
    # len(np.unique(y)) < 2, which must only be done for SVC
    constant_target = np.ones(len(diabetes.data))
    svm.SVR(gamma='scale').fit(diabetes.data, constant_target)
    svm.LinearSVR().fit(diabetes.data, constant_target)
示例15
def test_linearsvr():
    """LinearSVR and SVR(kernel='linear') produce comparable fits."""
    diabetes = datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target

    lsvr = svm.LinearSVR(C=1e3).fit(X, y)
    score1 = lsvr.score(X, y)

    svr = svm.SVR(kernel='linear', C=1e3).fit(X, y)
    score2 = svr.score(X, y)

    # Coefficient norms and scores should roughly agree between the two.
    assert_allclose(np.linalg.norm(lsvr.coef_),
                    np.linalg.norm(svr.coef_), 1, 0.0001)
    assert_almost_equal(score1, score2, 2)
示例16
def test_linearsvr_fit_sampleweight():
    """LinearSVR treats sample_weight consistently."""
    diabetes = datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target
    n_samples = len(y)

    # A weight of 1 everywhere must reproduce the unweighted fit.
    unit_weight = np.ones(n_samples)
    lsvr = svm.LinearSVR(C=1e3).fit(X, y, sample_weight=unit_weight)
    score1 = lsvr.score(X, y)

    lsvr_no_weight = svm.LinearSVR(C=1e3).fit(X, y)
    score2 = lsvr_no_weight.score(X, y)

    assert_allclose(np.linalg.norm(lsvr.coef_),
                    np.linalg.norm(lsvr_no_weight.coef_), 1, 0.0001)
    assert_almost_equal(score1, score2, 2)

    # check that fit(X) = fit([X1, X2, X3],sample_weight = [n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    random_weight = random_state.randint(0, 10, n_samples)
    lsvr_unflat = svm.LinearSVR(C=1e3).fit(X, y,
                                           sample_weight=random_weight)
    score3 = lsvr_unflat.score(X, y, sample_weight=random_weight)

    X_flat = np.repeat(X, random_weight, axis=0)
    y_flat = np.repeat(y, random_weight, axis=0)
    lsvr_flat = svm.LinearSVR(C=1e3).fit(X_flat, y_flat)
    score4 = lsvr_flat.score(X_flat, y_flat)
    assert_almost_equal(score3, score4, 2)
示例17
def setUp(self):
    """Load the diabetes dataset and its canonical feature labels."""
    self._data = datasets.load_diabetes()
    # Feature names in dataset column order.
    self._labels = [
        'age', 'sex', 'bmi', 'bp', 's1',
        's2', 's3', 's4', 's5', 's6',
    ]
示例18
def test_replicability():
    """Make sure running fit twice in a row finds the same parameters."""
    diabetes = load_diabetes()
    X_diabetes, y_diabetes = diabetes.data, diabetes.target

    # Shuffle samples with a fixed seed so both fits see the same order.
    rng = np.random.RandomState(0)
    ind = np.arange(X_diabetes.shape[0])
    rng.shuffle(ind)
    X_diabetes, y_diabetes = X_diabetes[ind], y_diabetes[ind]

    clf = MLPRegressor(keep_prob=0.9, random_state=42, n_epochs=100)
    # Just predict on the training set, for simplicity.
    pred1 = clf.fit(X_diabetes, y_diabetes).predict(X_diabetes)
    pred2 = clf.fit(X_diabetes, y_diabetes).predict(X_diabetes)
    assert_array_almost_equal(pred1, pred2)
示例19
def test_partial_fit():
    """Repeated partial_fit calls reach a usefully correlated fit."""
    dataset = load_diabetes()
    X, y = dataset['data'], dataset['target']
    clf = MLPRegressor(n_epochs=1)
    # 30 incremental passes over the whole dataset.
    for _ in range(30):
        clf.partial_fit(X, y)
    assert pearsonr(clf.predict(X), y)[0] > 0.5
示例20
def test_embedding_default():
    """By default the transform (embedding) output has 256 units."""
    dataset = load_diabetes()
    X, y = dataset['data'], dataset['target']
    clf = MLPRegressor(n_epochs=1)
    clf.fit(X, y)
    assert clf.transform(X).shape[1] == 256
示例21
def test_embedding_no_layers():
    """With no hidden layers the embedding collapses to a single unit."""
    dataset = load_diabetes()
    X, y = dataset['data'], dataset['target']
    clf = MLPRegressor(n_epochs=1, hidden_units=[])
    clf.fit(X, y)
    assert clf.transform(X).shape[1] == 1
示例22
def test_embedding_specific_layer():
    """transform_layer_index selects which hidden layer is the embedding."""
    dataset = load_diabetes()
    X, y = dataset['data'], dataset['target']
    # Index 1 picks the middle (8-unit) layer of the bottleneck.
    clf = MLPRegressor(
        n_epochs=1,
        hidden_units=(256, 8, 256),
        transform_layer_index=1)
    clf.fit(X, y)
    assert clf.transform(X).shape[1] == 8
示例23
def create_sample_data_csv(file_name: str = "diabetes.csv",
                           for_scoring: bool = False):
    """Write the diabetes dataset to a CSV file.

    :param file_name: destination CSV path.
    :param for_scoring: when True, omit the 'Y' target column so the
        file is suitable as scoring input.
    """
    sample_data = load_diabetes()
    df = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names)
    if not for_scoring:
        df['Y'] = sample_data.target
    # Hard code to diabetes so we fail fast if the project has been
    # bootstrapped.
    df.to_csv(file_name, index=False)
示例24
def get_data(n_clients):
    """
    Import the dataset via sklearn, shuffle and split train/test.
    Return training, target lists for `n_clients` and a holdout test set
    """
    print("Loading data")
    diabetes = load_diabetes()
    y = diabetes.target
    # Add constant to emulate intercept
    X = np.c_[diabetes.data, np.ones(diabetes.data.shape[0])]

    # The features are already preprocessed; just shuffle the rows.
    perm = np.random.permutation(X.shape[0])
    X, y = X[perm, :], y[perm]

    # Hold out 50 randomly chosen samples as the test set.
    test_size = 50
    test_idx = np.random.choice(X.shape[0], size=test_size, replace=False)
    train_idx = np.ones(X.shape[0], dtype=bool)
    train_idx[test_idx] = False
    X_test, y_test = X[test_idx, :], y[test_idx]
    X_train, y_train = X[train_idx, :], y[train_idx]

    # Split train among multiple clients.
    # The selection is not at random. We simulate the fact that each client
    # sees a potentially very different sample of patients.
    step = int(X_train.shape[0] / n_clients)
    X, y = [], []
    for c in range(n_clients):
        X.append(X_train[step * c: step * (c + 1), :])
        y.append(y_train[step * c: step * (c + 1)])
    return X, y, X_test, y_test
示例25
def _fit_model_pca(model):
    """Fit `model` on a diabetes train split; return it plus float32 test data."""
    dataset = load_diabetes()
    # Only the feature splits are needed; targets are discarded.
    X_train, X_test, *_ = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42)
    model.fit(X_train)
    return model, X_test.astype(np.float32)
示例26
def test_pipeline(self):
    """GradientDescent network trains inside a sklearn Pipeline."""
    dataset = datasets.load_diabetes()
    target_scaler = preprocessing.MinMaxScaler()
    scaled_target = target_scaler.fit_transform(
        dataset.target.reshape(-1, 1))

    x_train, x_test, y_train, y_test = train_test_split(
        asfloat(dataset.data),
        asfloat(scaled_target),
        test_size=0.15
    )

    network = algorithms.GradientDescent(
        network=[
            layers.Input(10),
            layers.Sigmoid(25),
            layers.Sigmoid(1),
        ],
        batch_size=None,
        show_epoch=100,
        verbose=False,
    )
    pipeline = Pipeline([
        ('min_max_scaler', preprocessing.MinMaxScaler()),
        ('gd', network),
    ])
    pipeline.fit(x_train, y_train, gd__epochs=50)
    y_predict = pipeline.predict(x_test)

    # Evaluate RMSLE in the original (unscaled) target units.
    error = self.eval(objectives.rmsle(
        target_scaler.inverse_transform(y_test),
        target_scaler.inverse_transform(y_predict).round()
    ))
    self.assertGreater(0.5, error)
示例27
def test_grid_search(self):
    """GRNN std can be tuned with RandomizedSearchCV and a custom scorer."""
    def scorer(network, X, y):
        # Score a fitted network by RMSLE against the true targets.
        result = asfloat(network.predict(X))
        return self.eval(objectives.rmsle(result[:, 0], asfloat(y)))

    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3
    )

    grnnet = algorithms.GRNN(std=0.5, verbose=False)
    grnnet.train(x_train, y_train)
    error = scorer(grnnet, x_test, y_test)
    self.assertAlmostEqual(0.513, error, places=3)

    random_search = model_selection.RandomizedSearchCV(
        grnnet,
        param_distributions={'std': np.arange(1e-2, 0.1, 1e-4)},
        n_iter=10,
        scoring=scorer,
        random_state=self.random_seed,
        cv=3,
    )
    random_search.fit(dataset.data, dataset.target)
    # RMSLE is an error: the best candidate has the smallest mean score.
    best_score = min(random_search.cv_results_['mean_test_score'])
    self.assertAlmostEqual(0.4266, best_score, places=3)
示例28
def test_transfrom_method(self):
    """GRNN.transform produces the same output as GRNN.predict."""
    # NOTE(review): the method name has a typo ("transfrom"); it is kept
    # unchanged because test discovery relies on the existing name.
    dataset = datasets.load_diabetes()
    grnnet = algorithms.GRNN(std=0.5, verbose=False)
    grnnet.train(dataset.data, dataset.target)
    np.testing.assert_array_almost_equal(
        grnnet.predict(dataset.data),
        grnnet.transform(dataset.data),
    )
示例29
def test_pandas_for_bp(self):
    """GradientDescent accepts pandas DataFrame/Series training data."""
    dataset = datasets.load_diabetes()
    input_scaler = preprocessing.MinMaxScaler()
    target_scaler = preprocessing.MinMaxScaler()

    n_features = dataset.data.shape[1]
    input_columns = ['column_' + str(i) for i in range(n_features)]

    # Build a single DataFrame holding scaled features and target.
    pandas_data = pd.DataFrame(dataset.data, columns=input_columns)
    pandas_data['target'] = target_scaler.fit_transform(
        dataset.target.reshape(-1, 1))
    pandas_data[input_columns] = input_scaler.fit_transform(
        pandas_data[input_columns]
    )

    x_train, x_test, y_train, y_test = train_test_split(
        asfloat(pandas_data[input_columns]),
        asfloat(pandas_data['target']),
        test_size=0.15
    )

    bpnet = algorithms.GradientDescent(
        [
            layers.Input(10),
            layers.Sigmoid(30),
            layers.Sigmoid(1),
        ],
        batch_size=None,
    )
    bpnet.train(x_train, y_train, epochs=50)

    # Evaluate RMSLE in the original (unscaled) target units.
    y_predict = bpnet.predict(x_test).reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)
    error = self.eval(objectives.rmsle(
        target_scaler.inverse_transform(y_test),
        target_scaler.inverse_transform(y_predict).round()
    ))
    self.assertGreater(0.5, error)
示例30
def test_simple_grnn(self):
    """GRNN trains, predicts, and does not mutate its training data."""
    dataset = datasets.load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3
    )

    # Snapshot the inputs to verify the network keeps its own copies.
    x_train_before = x_train.copy()
    x_test_before = x_test.copy()
    y_train_before = y_train.copy()

    grnnet = algorithms.GRNN(std=0.1, verbose=False)
    grnnet.train(x_train, y_train)
    result = grnnet.predict(x_test)
    error = metrics.mean_absolute_error(result, y_test)
    old_result = result.copy()
    self.assertAlmostEqual(error, 46.3358, places=4)

    # Test problem with variable links
    np.testing.assert_array_equal(x_train, x_train_before)
    np.testing.assert_array_equal(x_test, x_test_before)
    np.testing.assert_array_equal(y_train, y_train_before)

    # Zeroing the caller's array must not change stored training data.
    x_train[:, :] = 0
    result = grnnet.predict(x_test)
    np.testing.assert_array_almost_equal(result, old_result)

    self.assertPickledNetwork(grnnet, x_test)