Python source code examples: sklearn.datasets.make_classification()
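make_classification() generates a random n-class classification dataset as a NumPy feature matrix X and label vector y. Before the collected examples, a minimal usage sketch (the parameter values here are illustrative only, not taken from the examples below):

from sklearn.datasets import make_classification

# 500 samples, 10 features (4 informative, 2 redundant), binary labels
X, y = make_classification(n_samples=500, n_features=10, n_informative=4,
                           n_redundant=2, n_classes=2, random_state=0)
print(X.shape, y.shape)  # (500, 10) (500,)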
Example 1
def test_plot_estimator_and_lightgbm(tmpdir):
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150, n_classes=n_classes,
                               n_features=5, n_informative=3, n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        filename = tmpdir.join('plot_mixed_predictors.pdf')
        plot_tree(est_pygbm, est_lightgbm=est_lightgbm, tree_index=i,
                  view=False, filename=filename)
        assert filename.exists()
Example 2
def make_classification_df(n_samples: int = 1024,
                           n_num_features: int = 20,
                           n_cat_features: int = 0,
                           class_sep: float = 1.0,
                           n_classes: int = 2,
                           feature_name: str = 'col_{}',
                           target_name: str = 'target',
                           random_state: int = 0,
                           id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]:
    np.random.seed(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep,
                               random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2))

    X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)])
    y = pd.Series(y, name=target_name)

    if id_column is not None:
        X[id_column] = range(n_samples)

    for i in range(n_cat_features):
        X['cat_{}'.format(i)] = \
            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category')

    return X, y
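A minimal usage sketch for the helper above, assuming the imports its module relies on (numpy, pandas, typing.Tuple, and sklearn's make_classification); the argument values are illustrative only:

import numpy as np
import pandas as pd
from typing import Tuple
from sklearn.datasets import make_classification

# with make_classification_df defined as above
X, y = make_classification_df(n_samples=256, n_num_features=5,
                              n_cat_features=2, id_column='id')
print(list(X.columns))  # ['col_0', ..., 'col_4', 'id', 'cat_0', 'cat_1']
print(y.name)           # 'target'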
Example 3
def test_cv_lgbm():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                             eval_func=roc_auc_score,
                                                             fit_params={'early_stopping_rounds': 200})

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20
Example 4
def test_cv_partial_evaluate():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = RidgeClassifier(alpha=1.0)

    n = 0

    def _fold_count(*args):
        nonlocal n
        n += 1

    cv = Take(2, KFold(5))

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
                                                    on_each_fold=_fold_count)

    assert len(scores) == 2 + 1
    assert scores[-1] >= 0.8  # overall auc
    assert n == 2
Example 5
def test_fit_params_callback():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    sample_weights = np.random.randint(1, 10, size=len(X_train))
    sample_weights = sample_weights / sample_weights.sum()

    def fit_params(n: int, train_index: List[int], valid_index: List[int]):
        return {
            'early_stopping_rounds': 100,
            'sample_weight': list(sample_weights[train_index]),
            'eval_sample_weight': [list(sample_weights[valid_index])]
        }

    result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                     eval_func=roc_auc_score, fit_params=fit_params)
    result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                      eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})

    assert result_w_weight.scores[-1] != result_wo_weight.scores[-1]
Example 6
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
Example 7
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True)
Example 8
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for a binary output variable
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
Example 9
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)
Example 10
def test_max_features_tiebreak():
    # Test if max_features can break tie among feature importance
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
Example 11
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Example 12
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
Example 13
def test_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])
Example 14
def test_weight():
    # Test class weights
    clf = svm.SVC(gamma='scale', class_weight={1: 0.1})
    # we give a small weight to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200, n_features=10,
                                 weights=[0.833, 0.167], random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC(gamma="scale")):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3
Example 15
def test_cross_val_score_predict_groups():
    # Check if ValueError (when groups is None) propagates to cross_val_score
    # and cross_val_predict
    # And also check if groups is correctly passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv)
Example 16
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12)
Example 17
def test_learning_curve_verbose():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        train_sizes, train_scores, test_scores = \
            learning_curve(estimator, X, y, cv=3, verbose=1)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    assert "[learning_curve]" in out
Example 18
def test_learning_curve_incremental_learning_unsupervised():
    X, _ = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockIncrementalImprovingEstimator(20)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y=None, cv=3, exploit_incremental_learning=True,
        train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
Example 19
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
                                            shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1))
Example 20
def test_grid_search_groups():
    # Check if ValueError (when groups is None) propagates to GridSearchCV
    # And also check if groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Example 21
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2)
Example 22
def test_refit_callable_invalid_type():
    """
    Test that the implementation catches the errors when 'best_index_'
    returns an invalid result.
    """
    def refit_callable_invalid_type(cv_results):
        """
        A dummy function testing when the returned 'best_index_' is not
        an integer.
        """
        return None

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)
    clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]},
                       scoring='precision', refit=refit_callable_invalid_type,
                       cv=5)

    with pytest.raises(TypeError,
                       match='best_index_ returned is not an integer'):
        clf.fit(X, y)
Example 23
def test_refit_callable_out_bound(out_bound_value, search_cv):
    """
    Test implementation catches the errors when 'best_index_' returns an
    out of bound result.
    """
    def refit_callable_out_bound(cv_results):
        """
        A dummy function tests when returned 'best_index_' is out of bounds.
        """
        return out_bound_value

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)
    clf = search_cv(LinearSVC(random_state=42), {'C': [0.1, 1]},
                    scoring='precision', refit=refit_callable_out_bound, cv=5)
    with pytest.raises(IndexError, match='best_index_ index out of range'):
        clf.fit(X, y)
Example 24
def test_refit_callable_multi_metric():
    """
    Test refit=callable in a multiple-metric evaluation setting
    """
    def refit_callable(cv_results):
        """
        A dummy function tests `refit=callable` interface.
        Return the index of a model that has the least
        `mean_test_prec`.
        """
        assert 'mean_test_prec' in cv_results
        return cv_results['mean_test_prec'].argmin()

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)
    scoring = {'Accuracy': make_scorer(accuracy_score), 'prec': 'precision'}
    clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.01, 0.1, 1]},
                       scoring=scoring, refit=refit_callable, cv=5)
    clf.fit(X, y)

    assert clf.best_index_ == 0
    # Ensure `best_score_` is disabled when using `refit=callable`
    assert not hasattr(clf, 'best_score_')
Example 25
def test_grid_search_cv_results_multimetric():
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]),
              dict(kernel=['poly', ], degree=[1, 2])]

    for iid in (False, True):
        grid_searches = []
        for scoring in ({'accuracy': make_scorer(accuracy_score),
                         'recall': make_scorer(recall_score)},
                        'accuracy', 'recall'):
            grid_search = GridSearchCV(SVC(gamma='scale'), cv=n_splits,
                                       iid=iid, param_grid=params,
                                       scoring=scoring, refit=False)
            grid_search.fit(X, y)
            assert_equal(grid_search.iid, iid)
            grid_searches.append(grid_search)

        compare_cv_results_multimetric_with_single(*grid_searches, iid=iid)
Example 26
def test_sample_weight():
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    for method in ['sigmoid', 'isotonic']:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
        calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
        probs_with_sw = calibrated_clf.predict_proba(X_test)

        # As the weights are used for the calibration, they should still
        # yield different predictions
        calibrated_clf.fit(X_train, y_train)
        probs_without_sw = calibrated_clf.predict_proba(X_test)

        diff = np.linalg.norm(probs_with_sw - probs_without_sw)
        assert_greater(diff, 0.1)
Example 27
def test_l1_ratio():
    # Test if l1 ratio extremes match L1 and L2 penalty settings.
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=100, n_informative=20,
                                        random_state=1234)

    # test if elasticnet with l1_ratio near 1 gives same result as pure l1
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.9999999999,
                           random_state=42).fit(X, y)
    est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l1.coef_)

    # test if elasticnet with l1_ratio near 0 gives same result as pure l2
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.0000000001,
                           random_state=42).fit(X, y)
    est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l2.coef_)
Example 28
def test_sag_classifier_raises_error(solver):
    # Following #13316, the error handling behavior changed in cython sag. This
    # is simply a non-regression test to make sure numerical errors are
    # properly raised.

    # Train a classifier on a simple problem
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
    clf.fit(X, y)

    # Trigger a numerical error by:
    # - corrupting the fitted coefficients of the classifier
    # - fit it again starting from its current state thanks to warm_start
    clf.coef_[:] = np.nan

    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
        clf.fit(X, y)
Example 29
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))

    return X, y, splits
Example 30
def test_early_stopping_loss():
    # Make sure that when scoring is None, early stopping is done w.r.t.
    # the loss. Using scoring='neg_log_loss' and scoring=None should be
    # equivalent since the loss is precisely the negative log likelihood
    n_samples = int(1e3)
    max_iter = 100
    n_iter_no_change = 5

    X, y = make_classification(n_samples, random_state=0)

    clf_scoring = GradientBoostingClassifier(max_iter=max_iter,
                                             scoring='neg_log_loss',
                                             validation_split=.1,
                                             n_iter_no_change=n_iter_no_change,
                                             tol=1e-4,
                                             verbose=1,
                                             random_state=0)
    clf_scoring.fit(X, y)

    clf_loss = GradientBoostingClassifier(max_iter=max_iter,
                                          scoring=None,
                                          validation_split=.1,
                                          n_iter_no_change=n_iter_no_change,
                                          tol=1e-4,
                                          verbose=1,
                                          random_state=0)
    clf_loss.fit(X, y)

    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_