Python源码示例:sklearn.datasets.make_regression()
示例1
def test_multi_target_regression_partial_fit():
    """Check ``MultiOutputRegressor.partial_fit`` against per-target fits.

    One ``SGDRegressor`` per target column is trained with two successive
    ``partial_fit`` calls to build reference predictions; the multi-output
    wrapper trained on the same two batches must predict the same values.
    """
    n_outputs = 3
    split_at = 25
    X, y = datasets.make_regression(n_targets=n_outputs)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]
    first_X, second_X = X_train[:split_at], X_train[split_at:]
    first_y, second_y = y_train[:split_at], y_train[split_at:]

    # Reference predictions: an independent regressor for every target column.
    references = np.zeros_like(y_test)
    for target in range(n_outputs):
        single = SGDRegressor(random_state=0, max_iter=5)
        single.partial_fit(first_X, first_y[:, target])
        single.partial_fit(second_X, second_y[:, target])
        references[:, target] = single.predict(X_test)

    multi = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    multi.partial_fit(first_X, first_y)
    multi.partial_fit(second_X, second_y)
    assert_almost_equal(references, multi.predict(X_test))

    # Wrapping Lasso must not expose a ``partial_fit`` attribute.
    assert not hasattr(MultiOutputRegressor(Lasso), 'partial_fit')
示例2
def test_early_stopping_regression(scoring, validation_split,
                                   n_iter_no_change, tol):
    """Fit a gradient boosting regressor and check how many iterations ran.

    The four arguments are forwarded unchanged to the estimator (presumably
    supplied by a pytest parametrization that is not visible here). When
    ``n_iter_no_change`` is set, training must stop before ``max_iter``;
    otherwise it must run for exactly ``max_iter`` iterations.
    """
    max_iter = 500
    X, y = make_regression(random_state=0)

    settings = dict(
        verbose=1,  # just for coverage
        scoring=scoring,
        tol=tol,
        validation_split=validation_split,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb = GradientBoostingRegressor(**settings)
    gb.fit(X, y)

    if n_iter_no_change is None:
        assert gb.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
示例3
def test_fixed_effect_contrast_nonzero_effect():
    """Compare fixed-effect contrast sizes with plain least-squares coefficients.

    For every column of ``X`` a one-hot contrast is evaluated, once with a
    single (labels, results, contrast) triple and once with the same triple
    repeated three times; both effect sizes must match the coefficient that
    ``LinearRegression(fit_intercept=False)`` assigns to that column.
    """
    X, y = make_regression(n_features=5, n_samples=20, random_state=0)
    y = y[:, None]
    labels, results = run_glm(y, X, 'ols')

    ols = LinearRegression(fit_intercept=False).fit(X, y)
    expected = ols.coef_.ravel()
    n_regressors = X.shape[1]

    for idx in range(n_regressors):
        contrast = np.zeros(n_regressors)
        contrast[idx] = 1.
        # A single run first, then the same run repeated three times.
        for repeats in (1, 3):
            fixed_effect = _compute_fixed_effect_contrast(
                [labels] * repeats, [results] * repeats, [contrast] * repeats)
            assert_almost_equal(fixed_effect.effect_size(), expected[idx])
示例4
def make_regression_df(n_samples: int = 1024,
                       n_num_features: int = 20,
                       n_cat_features: int = 0,
                       feature_name: str = 'col_{}',
                       target_name: str = 'target',
                       random_state: int = 0,
                       id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]:
    """Build a regression problem as a pandas DataFrame / Series pair.

    Args:
        n_samples: Number of rows to generate.
        n_num_features: Number of numeric columns produced by
            ``make_regression``.
        n_cat_features: Number of extra categorical columns named ``cat_<i>``.
        feature_name: Format string used to name the numeric columns.
        target_name: Name given to the returned target series.
        random_state: Seed passed to ``make_regression`` and also used to
            seed NumPy's global random generator.
        id_column: When not ``None``, name of a column filled with
            ``range(n_samples)``.

    Returns:
        ``(X, y)`` where ``X`` is the feature frame and ``y`` the target.
    """
    # NOTE: this seeds the process-wide NumPy generator; the categorical
    # columns below are drawn from it.
    np.random.seed(random_state)

    features, target = make_regression(n_samples=n_samples,
                                       n_features=n_num_features,
                                       random_state=random_state)
    numeric_names = [feature_name.format(idx) for idx in range(n_num_features)]
    X = pd.DataFrame(features, columns=numeric_names)
    y = pd.Series(target, name=target_name)

    if id_column is not None:
        X[id_column] = range(n_samples)

    for idx in range(n_cat_features):
        # Levels are drawn from 'A', 'B' and None, cast to str, then to category.
        levels = np.random.choice(['A', 'B', None], size=n_samples)
        X['cat_{}'.format(idx)] = pd.Series(levels).astype(str).astype('category')

    return X, y
示例5
def test_early_stopping_regression(scoring, validation_fraction,
                                   n_iter_no_change, tol):
    """Fit a histogram gradient boosting regressor and check its iteration count.

    The four arguments are forwarded unchanged to the estimator (presumably
    supplied by a pytest parametrization that is not visible here). When
    ``n_iter_no_change`` is set, training must stop before ``max_iter``;
    otherwise it must run for exactly ``max_iter`` iterations.
    """
    max_iter = 200
    X, y = make_regression(random_state=0)

    settings = dict(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb = HistGradientBoostingRegressor(**settings)
    gb.fit(X, y)

    if n_iter_no_change is None:
        assert gb.n_iter_ == max_iter
    else:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
示例6
def test_shuffle():
    """Test that the shuffle parameter affects the training process."""
    X, y = make_regression(n_samples=50, n_features=5, n_targets=1,
                           random_state=0)

    def fitted(shuffle):
        # Identically configured one-epoch regressor; only ``shuffle`` varies.
        mlp = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                           random_state=0, shuffle=shuffle)
        mlp.fit(X, y)
        return mlp

    # The coefficients will be identical if both do or do not shuffle
    for shuffle in (True, False):
        first, second = fitted(shuffle), fitted(shuffle)
        assert np.array_equal(first.coefs_[0], second.coefs_[0])

    # The coefficients will be slightly different if shuffle=True
    shuffled, ordered = fitted(True), fitted(False)
    assert not np.array_equal(shuffled.coefs_[0], ordered.coefs_[0])
示例7
def test_cross_val_score_with_score_func_regression():
    """Check ``cross_val_score`` on Ridge with several scoring choices.

    The default score, the ``"r2"`` scorer and an explained-variance scorer
    are all expected to give the same five fold scores; the negated mean
    squared error is compared with its own expected values.
    """
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    expected_fold_scores = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_fold_scores, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, expected_fold_scores, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    neg_mse_scores = cross_val_score(reg, X, y, cv=5,
                                     scoring="neg_mean_squared_error")
    assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_fold_scores, 2)
示例8
def test_multi_target_regression():
    """Check ``MultiOutputRegressor.fit`` against one regressor per target.

    A separate ``GradientBoostingRegressor`` is fitted on each target column
    to build reference predictions; the multi-output wrapper fitted on all
    columns at once must predict the same values.
    """
    n_outputs = 3
    X, y = datasets.make_regression(n_targets=n_outputs)
    X_train, X_test = X[:50], X[50:]
    y_train, y_test = y[:50], y[50:]

    references = np.zeros_like(y_test)
    for target in range(n_outputs):
        single = GradientBoostingRegressor(random_state=0)
        single.fit(X_train, y_train[:, target])
        references[:, target] = single.predict(X_test)

    multi = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    multi.fit(X_train, y_train)
    assert_almost_equal(references, multi.predict(X_test))
# 0.23. warning about tol not having its correct default value.
示例9
def test_ridge_fit_intercept_sparse():
    """Check that Ridge fits an intercept consistently on dense and CSR input.

    For the ``sag`` and ``sparse_cg`` solvers, fitting on the CSR matrix must
    emit no warning and give the same intercept and coefficients as fitting
    on the dense array. For ``saga`` and ``lsqr``, fitting on the CSR matrix
    is expected to raise a ``ValueError`` whose message matches "In Ridge,".
    """
    # Local import keeps the block self-contained; the file's import section
    # is not visible from here.
    import warnings

    X, y = make_regression(n_samples=1000, n_features=2, n_informative=2,
                           bias=10., random_state=42)
    X_csr = sp.csr_matrix(X)

    for solver in ['sag', 'sparse_cg']:
        dense = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        sparse = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        dense.fit(X, y)
        # ``pytest.warns(None)`` is deprecated in pytest 7 and removed in
        # pytest 8, so warnings are recorded with the standard library.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            sparse.fit(X_csr, y)
        assert len(record) == 0
        assert_almost_equal(dense.intercept_, sparse.intercept_)
        assert_array_almost_equal(dense.coef_, sparse.coef_)

    # test the solver switch and the corresponding warning
    for solver in ['saga', 'lsqr']:
        sparse = Ridge(alpha=1., tol=1.e-15, solver=solver, fit_intercept=True)
        assert_raises_regex(ValueError, "In Ridge,", sparse.fit, X_csr, y)
示例10
def test_make_regression():
    """Check shapes, sparsity of the coefficients and noise level of the output."""
    n_samples, n_features, n_informative = 100, 10, 3
    X, y, c = make_regression(n_samples=n_samples, n_features=n_features,
                              n_informative=n_informative, effective_rank=5,
                              coef=True, bias=0.0, noise=1.0, random_state=0)

    assert_equal(X.shape, (100, 10), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(c.shape, (10,), "coef shape mismatch")
    assert_equal(sum(c != 0.0), 3, "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residuals = y - np.dot(X, c)
    assert_almost_equal(np.std(residuals), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert_equal(X.shape, (100, 1))
示例11
def test_prediction_gradient():
    """Test computation of prediction gradients."""
    mlp = MLPRegressor(n_epochs=100, random_state=42, hidden_units=(5,))
    X, y = make_regression(
        n_samples=1000, n_features=10, n_informative=1, shuffle=False)
    mlp.fit(X, y)

    grad = mlp.prediction_gradient(X)
    assert grad.shape == X.shape

    # Check that only the informative feature has a large gradient.
    mean_per_feature = grad.mean(axis=0)
    informative_mean, other_means = mean_per_feature[0], mean_per_feature[1:]
    assert np.abs(informative_mean) > 0.5
    for value in other_means:
        assert np.abs(value) < 0.1

    # Raise an exception for sparse inputs, which are not yet supported.
    X_sp = sp.csr_matrix(X)
    mlp.fit(X_sp, y)
    with pytest.raises(NotImplementedError):
        mlp.prediction_gradient(X_sp)
示例12
def test_smoke_multiout_regression_methods(n_jobs):
    """Construct, fit, predict and score a stacked regressor on two targets.

    ``n_jobs`` is forwarded to ``StackedRegressor``. ``predict_proba`` is
    expected to raise ``AttributeError``.
    """
    X, y = make_regression(random_state=7, n_samples=100, n_features=10,
                           n_informative=4, n_targets=2)
    rng = np.random.RandomState(17)

    forest = RandomForestRegressor(random_state=rng, n_estimators=10)
    est_list = [
        ('lr', LinearRegression()),
        ('rf', forest),
        ('metalr', LinearRegression()),
    ]
    sm = StackedRegressor(est_list, n_jobs=n_jobs)
    sm.fit(X, y)
    sm.predict(X)
    sm.score(X, y)

    with pytest.raises(AttributeError):
        sm.predict_proba(X)
示例13
def test_cv():
    """Simple CV check."""
    # XXX: don't use scikit-learn for tests.
    X, y = make_regression()
    n_folds = 5
    cv = KFold(n_splits=n_folds)
    glm_normal = GLM(distr='gaussian', alpha=0.01, reg_lambda=0.1)

    # check that it returns 5 scores
    scores = cross_val_score(glm_normal, X, y, cv=cv)
    assert len(scores) == 5

    # Two separate grids: one over alpha, one over reg_lambda.
    alpha_grid = {'alpha': np.linspace(0.01, 0.99, 2)}
    lambda_grid = {'reg_lambda': np.logspace(np.log(0.5), np.log(0.01),
                                             10, base=np.exp(1))}
    glmcv = GridSearchCV(glm_normal, [alpha_grid, lambda_grid], cv=cv)
    glmcv.fit(X, y)
示例14
def setUpClass(cls):
    """Prepare shared fixtures: features, a scaled target and model parameters.

    Stores the generated features on ``cls.X``, the target passed through
    ``preprocessing.scale`` on ``cls.y``, and a parameter dictionary on
    ``cls.params``.
    """
    features, target = datasets.make_regression(
        n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
    )
    cls.X = features
    cls.params = dict(
        dense_layers=2,
        dense_1_size=8,
        dense_2_size=4,
        dropout=0,
        learning_rate=0.01,
        momentum=0.9,
        decay=0.001,
        ml_task="regression",
    )
    cls.y = preprocessing.scale(target)
示例15
def test_kmeans(self):
    """Check ``final_types`` handling when converting a fitted KMeans model.

    A single output name and duplicated output names are both expected to
    raise ``RuntimeError``; two distinct names must be used as the names of
    the first two outputs of the resulting inference session.
    """
    model = KMeans()
    X, y = make_regression(n_features=4, random_state=42)
    model.fit(X, y)
    initial_types = [('input', FloatTensorType((None, X.shape[1])))]

    # Output specifications that the converter is expected to reject.
    rejected = [
        dict(final_types=[('output4', None)]),
        dict(final_types=[('dup1', None), ('dup1', None)],
             target_opset=TARGET_OPSET),
    ]
    for options in rejected:
        with self.assertRaises(RuntimeError):
            convert_sklearn(model, initial_types=initial_types, **options)

    model_onnx = convert_sklearn(
        model, initial_types=initial_types,
        final_types=[('output4', None), ('output5', None)],
        target_opset=TARGET_OPSET)
    assert model_onnx is not None

    sess = InferenceSession(model_onnx.SerializeToString())
    assert sess.get_outputs()[0].name == 'output4'
    assert sess.get_outputs()[1].name == 'output5'
示例16
def make_linear_transitive(
    self,
    n_instances=1000,
    n_objects=5,
    noise=0.0,
    n_features=100,
    n_informative=10,
    seed=42,
    **kwd,
):
    """Generate instances of objects labelled by the highest-scoring object.

    ``n_instances * n_objects`` regression samples are drawn and reshaped to
    ``(n_instances, n_objects, n_features)``. For each instance the index of
    the object with the largest regression target is passed through
    ``convert_to_label_encoding``. Extra keyword arguments are accepted but
    not used.

    Returns:
        ``(X, Y)``: the reshaped features and the encoded labels.
    """
    rng = check_random_state(seed=seed)
    # coef=True makes make_regression return a third value; it is not used.
    features, targets, _ = make_regression(
        n_samples=n_instances * n_objects,
        n_features=n_features,
        n_informative=n_informative,
        coef=True,
        noise=noise,
        random_state=rng,
    )
    X = features.reshape(n_instances, n_objects, n_features)
    scores = targets.reshape(n_instances, n_objects)
    best_object = scores.argmax(axis=1)
    return X, convert_to_label_encoding(best_object, n_objects)
示例17
def make_linear_transitive(
    self,
    n_instances=1000,
    n_objects=5,
    noise=0.0,
    n_features=100,
    n_informative=10,
    seed=42,
    **kwd,
):
    """Generate instances of objects together with rankings of their scores.

    ``n_instances * n_objects`` regression samples are drawn and reshaped to
    ``(n_instances, n_objects, n_features)``. The regression targets,
    reshaped to ``(n_instances, n_objects)``, are passed through
    ``scores_to_rankings``. Extra keyword arguments are accepted but not
    used.

    Returns:
        ``(X, Y)``: the reshaped features and the rankings.
    """
    rng = check_random_state(seed=seed)
    # coef=True makes make_regression return a third value; it is not used.
    features, targets, _ = make_regression(
        n_samples=n_instances * n_objects,
        n_features=n_features,
        n_informative=n_informative,
        coef=True,
        noise=noise,
        random_state=rng,
    )
    X = features.reshape(n_instances, n_objects, n_features)
    scores = targets.reshape(n_instances, n_objects)
    return X, scores_to_rankings(scores)
示例18
def generate_regression(self, num_features, num_samples, test_split=0.1, seed=0):
    """Generate a regression task and store it on this object.

    Arguments:
        num_features {int} -- Number of features
        num_samples {int} -- Number of samples

    Keyword Arguments:
        test_split {float} -- Size of test split (default: {0.1})
        seed {int} -- a random seed (default: {0})
    """
    dataset = make_regression(n_samples=num_samples, n_features=num_features,
                              random_state=seed)
    # Every generated feature is flagged as non-categorical.
    self.categorical_features = [False for _ in range(num_features)]
    self.problem_type = ProblemType.FeatureRegression
    self.X, self.Y = dataset
    self._split_data(test_split, seed)
示例19
def test_decision_tree_regressor(self):
    """Compare local and Elasticsearch predictions of a decision tree regressor.

    The model is trained on generated data, imported under a fixed model id,
    and both prediction sets must agree to two decimals. The imported model
    is deleted afterwards, even when the comparison fails.
    """
    # Train model
    features, target = datasets.make_regression(n_features=5)
    regressor = DecisionTreeRegressor()
    regressor.fit(features, target)

    # Get some test results
    test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
    test_results = regressor.predict(test_data)

    # Serialise the models to Elasticsearch
    feature_names = ["f0", "f1", "f2", "f3", "f4"]
    model_id = "test_decision_tree_regressor"
    es_model = ImportedMLModel(
        ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
    )
    try:
        es_results = es_model.predict(test_data)
        np.testing.assert_almost_equal(test_results, es_results, decimal=2)
    finally:
        # Clean up; run in ``finally`` so a failed assertion does not leave
        # the model behind.
        es_model.delete_model()
示例20
def test_random_forest_regressor(self):
    """Compare local and Elasticsearch predictions of a random forest regressor.

    The model is trained on generated data, imported under a fixed model id,
    and both prediction sets must agree to two decimals. The imported model
    is deleted afterwards, even when the comparison fails.
    """
    # Train model
    features, target = datasets.make_regression(n_features=5)
    regressor = RandomForestRegressor()
    regressor.fit(features, target)

    # Get some test results
    test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
    test_results = regressor.predict(test_data)

    # Serialise the models to Elasticsearch
    feature_names = ["f0", "f1", "f2", "f3", "f4"]
    model_id = "test_random_forest_regressor"
    es_model = ImportedMLModel(
        ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
    )
    try:
        es_results = es_model.predict(test_data)
        np.testing.assert_almost_equal(test_results, es_results, decimal=2)
    finally:
        # Clean up; run in ``finally`` so a failed assertion does not leave
        # the model behind.
        es_model.delete_model()
示例21
def test_xgb_regressor(self):
    """Compare local and Elasticsearch predictions of an XGBoost regressor.

    The model is trained on generated data, imported under a fixed model id,
    and both prediction sets must agree to two decimals. The imported model
    is deleted afterwards, even when the comparison fails.
    """
    # Train model
    features, target = datasets.make_regression(n_features=5)
    regressor = XGBRegressor()
    regressor.fit(features, target)

    # Get some test results
    test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
    test_results = regressor.predict(np.asarray(test_data))

    # Serialise the models to Elasticsearch
    feature_names = ["f0", "f1", "f2", "f3", "f4"]
    model_id = "test_xgb_regressor"
    es_model = ImportedMLModel(
        ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
    )
    try:
        es_results = es_model.predict(test_data)
        np.testing.assert_almost_equal(test_results, es_results, decimal=2)
    finally:
        # Clean up; run in ``finally`` so a failed assertion does not leave
        # the model behind.
        es_model.delete_model()
示例22
def test_predict_single_feature_vector(self):
    """Check prediction when a single flat feature vector is passed.

    A one-feature XGBoost regressor is trained and imported; the imported
    model is then called with ``test_data[0]`` (one flat vector rather than a
    list of rows) and must agree with the local prediction to two decimals.
    The imported model is deleted afterwards, even when the comparison fails.
    """
    # Train model
    features, target = datasets.make_regression(n_features=1)
    regressor = XGBRegressor()
    regressor.fit(features, target)

    # Get some test results
    test_data = [[0.1]]
    test_results = regressor.predict(np.asarray(test_data))

    # Serialise the models to Elasticsearch
    feature_names = ["f0"]
    model_id = "test_xgb_regressor"
    es_model = ImportedMLModel(
        ES_TEST_CLIENT, model_id, regressor, feature_names, overwrite=True
    )
    try:
        # Single feature
        es_results = es_model.predict(test_data[0])
        np.testing.assert_almost_equal(test_results, es_results, decimal=2)
    finally:
        # Clean up; run in ``finally`` so a failed assertion does not leave
        # the model behind.
        es_model.delete_model()
示例23
def test_cross_val_score_with_score_func_regression():
    """Check ``cross_val_score`` on Ridge with several scoring choices.

    The default score, the ``"r2"`` scorer and an explained-variance scorer
    are all expected to give the same five fold scores; the negated mean
    squared error is compared with its own expected values.
    """
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()
    expected_fold_scores = [0.94, 0.97, 0.97, 0.99, 0.92]

    # Default score of the Ridge regression estimator
    default_scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(default_scores, expected_fold_scores, 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, expected_fold_scores, 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    neg_mse_scores = cross_val_score(reg, X, y, cv=5,
                                     scoring="neg_mean_squared_error")
    assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)

    # Explained variance
    ev_scorer = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=ev_scorer)
    assert_array_almost_equal(ev_scores, expected_fold_scores, 2)
示例24
def test_automl():
    """Check that tuned AutoLGB and AutoXGB models beat a random baseline.

    Each model is tuned and fitted on the training split; its mean absolute
    error on the test split must be lower than that of predictions drawn
    uniformly between the minimum and maximum of the training target.
    """
    X, y = make_regression(n_samples=N_OBS,
                           n_features=N_FEATURE,
                           n_informative=N_IMP_FEATURE,
                           random_state=RANDOM_SEED)
    X = pd.DataFrame(X, columns=['x{}'.format(i) for i in range(X.shape[1])])
    y = pd.Series(y)
    # The first positional argument of logging.info is the format string;
    # passing the shapes directly made the record fail to format.
    logging.info('%s %s', X.shape, y.shape)

    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=.2, random_state=RANDOM_SEED)

    model = AutoLGB(objective='regression', metric='l1')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    # Baseline: uniform random values within the range of the training target.
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (LGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)

    model = AutoXGB(objective='reg:linear', metric='rmse')
    model.tune(X_trn, y_trn)
    model.fit(X_trn, y_trn)
    p = model.predict(X_tst)
    r = (np.random.rand(X_tst.shape[0]) * (y_trn.max() - y_trn.min()) + y_trn.min())
    logging.info('MAE (XGB): {:.4f}'.format(mae(y_tst, p)))
    assert mae(y_tst, p) < mae(y_tst, r)
示例25
def test_pre_binned_data():
    """Check fitting and predicting with numerical versus pre-binned data."""
    # Make sure that:
    # - training on numerical data and predicting on numerical data is the
    #   same as training on binned data and predicting on binned data
    # - training on numerical data and predicting on numerical data is the
    #   same as training on numerical data and predicting on binned data
    # - training on binned data and predicting on numerical data is not
    #   possible.
    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    X_binned = BinMapper(random_state=0).fit_transform(X)

    # The same estimator object is refitted for every scenario.
    num_then_num = gbdt.fit(X, y).predict(X)
    binned_then_binned = gbdt.fit(X_binned, y).predict(X_binned)
    num_then_binned = gbdt.fit(X, y).predict(X_binned)

    assert_allclose(num_then_num, binned_then_binned)
    assert_allclose(num_then_num, num_then_binned)

    refitted_on_binned = gbdt.fit(X_binned, y)
    assert_raises_regex(
        ValueError,
        'This estimator was fitted with pre-binned data ',
        refitted_on_binned.predict, X
    )
示例26
def test_pre_binned_data():
    """Check the input validation errors raised by a grown tree's predictor."""
    # Make sure ValueError is raised when predictor.predict() is called while
    # the predictor does not have any numerical thresholds.
    X, y = make_regression()

    # Init gradients and hessians to that of least squares loss
    gradients = -y.astype(np.float32)
    hessians = np.ones(1, dtype=np.float32)

    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)

    grower = TreeGrower(X_binned, gradients, hessians,
                        n_bins_per_feature=mapper.n_bins_per_feature_)
    grower.grow()

    # Predictor built without thresholds: both calls on raw X must fail.
    predictor = grower.make_predictor(numerical_thresholds=None)
    expected_failures = [
        ('This predictor does not have numerical thresholds', predictor.predict),
        ('binned_data dtype should be uint8', predictor.predict_binned),
    ]
    for message, method in expected_failures:
        assert_raises_regex(ValueError, message, method, X)

    predictor.predict_binned(X_binned)  # No error

    # Predictor built with thresholds: binned input to predict() must fail.
    predictor = grower.make_predictor(
        numerical_thresholds=mapper.numerical_thresholds_
    )
    assert_raises_regex(
        ValueError,
        'X has uint8 dtype',
        predictor.predict, X_binned
    )
示例27
def create_regression_dataset(n_samples, n_features, n_informative, effective_rank, tail_strength,
                              noise, random_state=None, shuffle=True):
    """
    Creates a single-target regression dataset as a data frame

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param effective_rank: approximate number of singular vectors required to explain data
    :param tail_strength: relative importance of the fat noisy tail of the singular values profile
    :param noise: standard deviation of the gaussian noise applied to the output
    :param random_state: value handed to ``get_random_state`` before being passed to ``make_regression``
    :param shuffle: shuffle the samples and the features.
    :return: the data frame of features, after ``rename_columns``, with the target in column ``'y'``
    """
    rng = get_random_state(random_state)
    # The number of targets is fixed to one; it is not a parameter here.
    features, target = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_targets=1,
        effective_rank=effective_rank,
        tail_strength=tail_strength,
        noise=noise,
        random_state=rng,
        shuffle=shuffle,
    )
    # cast to a data frame and rename the X columns
    df = rename_columns(pd.DataFrame(features))
    # and add the Y
    df['y'] = target
    return df
示例28
def gen_regression(params):
    """Generate a regression problem with make_regression.

    ``params`` is unpacked as keyword arguments and ``random_state`` is
    always set to 1.
    """
    dataset = make_regression(random_state=1, **params)
    return dataset
示例29
def test_eigenpro_regression_duplicate_data():
    """Test the performance when some data is repeated"""
    X, y = make_regression(random_state=1)
    # Every sample appears twice in the training set.
    X = np.concatenate([X, X])
    y = np.concatenate([y, y])

    regressor = EigenProRegressor(
        kernel="rbf", n_epoch=100, gamma=0.02, random_state=1
    )
    prediction = regressor.fit(X, y).predict(X)
    assert_allclose(prediction, y, rtol=5e-3)
示例30
def test_eigenpro_regression_conflict_data():
    """Make sure the regressor doesn't crash when conflicting
    data is given"""
    X, y = make_regression(random_state=1)
    # Two target columns: the original target and the same target shifted by 2.
    column = np.reshape(y, (-1, 1))
    targets = np.hstack([column, column + 2])

    # Make sure we don't throw an error when fitting or predicting
    regressor = EigenProRegressor(
        kernel="linear", n_epoch=5, gamma=0.5, random_state=1
    )
    regressor.fit(X, targets).predict(X)
# Tests for FastKernelClassification