Python source code examples: sklearn.datasets.make_hastie_10_2()
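make_hastie_10_2 generates the synthetic binary classification problem from Hastie et al., The Elements of Statistical Learning, Example 10.2: ten independent standard-Gaussian features, with label y = 1 when the sum of squared features exceeds 9.34 and y = -1 otherwise. Before the collected test functions, here is a minimal usage sketch; the variable names and parameter values are illustrative assumptions, not taken from any of the examples below.

# Minimal sketch (illustrative values): generate the Hastie 10.2 data and
# fit a small gradient-boosting ensemble on one half, scoring on the other.
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# X has shape (n_samples, 10); y holds the labels -1 and +1.
X, y = make_hastie_10_2(n_samples=2000, random_state=0)

clf = GradientBoostingClassifier(n_estimators=50, max_depth=1, random_state=0)
clf.fit(X[:1000], y[:1000])
print(clf.score(X[1000:], y[1000:]))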
Example 1
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)

        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = BaggingClassifier(n_estimators=10, random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))
Example 2
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators does not",
                         clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example 3
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False,
                            random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
Example 4
def check_classification_synthetic(presort, loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.09)

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0,
                                      presort=presort)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.08)
Example 5
def test_check_inputs_predict_stages():
    # check that predict_stages raises an error if the type of X is not
    # supported
    x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    x_sparse_csc = csc_matrix(x)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(x, y)
    score = np.zeros((y.shape)).reshape(-1, 1)
    assert_raise_message(ValueError,
                         "When X is a sparse matrix, a CSR format is expected",
                         predict_stages, clf.estimators_, x_sparse_csc,
                         clf.learning_rate, score)
    x_fortran = np.asfortranarray(x)
    assert_raise_message(ValueError,
                         "X should be C-ordered np.ndarray",
                         predict_stages, clf.estimators_, x_fortran,
                         clf.learning_rate, score)
Example 6
def test_warm_start(Cls):
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = Cls(n_estimators=200, max_depth=1)
    est.fit(X, y)

    est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    if Cls is GradientBoostingRegressor:
        assert_array_almost_equal(est_ws.predict(X), est.predict(X))
    else:
        # Random state is preserved and hence predict_proba must also be
        # same
        assert_array_equal(est_ws.predict(X), est.predict(X))
        assert_array_almost_equal(est_ws.predict_proba(X),
                                  est.predict_proba(X))
Example 7
def test_warm_start_fortran(Cls):
    # Test that feeding X in Fortran order gives the same results as
    # feeding it in C order
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est_c = Cls(n_estimators=1, random_state=1, warm_start=True)
    est_fortran = Cls(n_estimators=1, random_state=1, warm_start=True)

    est_c.fit(X, y)
    est_c.set_params(n_estimators=11)
    est_c.fit(X, y)

    X_fortran = np.asfortranarray(X)
    est_fortran.fit(X_fortran, y)
    est_fortran.set_params(n_estimators=11)
    est_fortran.fit(X_fortran, y)

    assert_array_almost_equal(est_c.predict(X), est_fortran.predict(X))
Example 8
def test_warm_start_oob():
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    for Cls in [GradientBoostingRegressor, GradientBoostingClassifier]:
        est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
                  random_state=1)
        est.fit(X, y)

        est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                     random_state=1, warm_start=True)
        est_ws.fit(X, y)
        est_ws.set_params(n_estimators=200)
        est_ws.fit(X, y)

        assert_array_almost_equal(est_ws.oob_improvement_[:100],
                                  est.oob_improvement_[:100])
Example 9
def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
Example 10
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    assert_raises(ValueError, clf.fit, X, y)
Example 11
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5, oob_score=True,
                                random_state=1)
    assert_equal(bagging.fit(X, y).oob_score_, bagging.fit(X, y).oob_score_)
Example 12
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(LogisticRegression(), max_samples=0.5,
                                max_features=0.5, random_state=1,
                                bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert_equal(len(estimators_samples), len(estimators))
    assert_equal(len(estimators_samples[0]), len(X) // 2)
    assert_equal(estimators_samples[0].dtype.kind, 'i')

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)
Example 13
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=max_samples,
                                max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert_equal(bagging._max_samples, max_samples)
Example 14
def test_max_feature_auto():
    # Test if max features is set properly for floats and str.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train = X[:2000]
    y_train = y[:2000]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, n_features)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(n_features * 0.3))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='sqrt')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features='log2')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.log2(n_features)))

    gbrt = GradientBoostingRegressor(n_estimators=1,
                                     max_features=0.01 / X.shape[1])
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, 1)
Example 15
def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200,
                                     random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not fitted
    assert_raises(NotFittedError, lambda X: np.fromiter(
        clf.staged_predict_proba(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
Example 16
def test_warm_start_n_estimators(Cls):
    # Test if warm start equals fit - set n_estimators.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=300, max_depth=1)
    est.fit(X, y)

    est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=300)
    est_ws.fit(X, y)

    assert_array_almost_equal(est_ws.predict(X), est.predict(X))
Example 17
def test_warm_start_max_depth(Cls):
    # Test if possible to fit trees of different depth in ensemble.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, max_depth=2)
    est.fit(X, y)

    # last 10 trees have different depth
    assert_equal(est.estimators_[0, 0].max_depth, 1)
    for i in range(1, 11):
        assert_equal(est.estimators_[-i, 0].max_depth, 2)
Example 18
def test_warm_start_zero_n_estimators(Cls):
    # Test if warm start with zero n_estimators raises error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=0)
    assert_raises(ValueError, est.fit, X, y)
Example 19
def test_warm_start_smaller_n_estimators(Cls):
    # Test if warm start with smaller n_estimators raises error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=99)
    assert_raises(ValueError, est.fit, X, y)
Example 20
def test_warm_start_equal_n_estimators(Cls):
    # Test if warm start with equal n_estimators does nothing
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est2 = clone(est)
    est2.set_params(n_estimators=est.n_estimators, warm_start=True)
    est2.fit(X, y)

    assert_array_almost_equal(est2.predict(X), est.predict(X))
Example 21
def test_warm_start_oob_switch(Cls):
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, subsample=0.5)
    est.fit(X, y)

    assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
    # the last 10 are not zeros
    assert_array_equal(est.oob_improvement_[-10:] == 0.0,
                       np.zeros(10, dtype=np.bool))
Example 22
def test_warm_start_oob(Cls):
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=200, max_depth=1, subsample=0.5,
              random_state=1)
    est.fit(X, y)

    est_ws = Cls(n_estimators=100, max_depth=1, subsample=0.5,
                 random_state=1, warm_start=True)
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    assert_array_almost_equal(est_ws.oob_improvement_[:100],
                              est.oob_improvement_[:100])
Example 23
def test_monitor_early_stopping(Cls):
    # Test if monitor return value works.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)  # this is not altered
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.train_score_.shape[0], 30)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5,
              warm_start=True)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert_equal(est.n_estimators, 20)
    assert_equal(est.estimators_.shape[0], 10)
    assert_equal(est.train_score_.shape[0], 10)
    assert_equal(est.oob_improvement_.shape[0], 10)

    # try refit
    est.set_params(n_estimators=30, warm_start=False)
    est.fit(X, y)
    assert_equal(est.n_estimators, 30)
    assert_equal(est.train_score_.shape[0], 30)
    assert_equal(est.estimators_.shape[0], 30)
    assert_equal(est.oob_improvement_.shape[0], 30)
Example 24
def test_complete_classification():
    # Test greedy trees with max_depth + 1 leaves.
    from sklearn.tree._tree import TREE_LEAF
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GradientBoostingClassifier(n_estimators=20, max_depth=None,
                                     random_state=1, max_leaf_nodes=k + 1)
    est.fit(X, y)

    tree = est.estimators_[0, 0].tree_
    assert_equal(tree.max_depth, k)
    assert_equal(tree.children_left[tree.children_left == TREE_LEAF].shape[0],
                 k + 1)
Example 25
def test_max_leaf_nodes_max_depth(GBEstimator):
    # Test precedence of max_leaf_nodes over max_depth.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert_equal(tree.max_depth, 1)

    est = GBEstimator(max_depth=1).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert_equal(tree.max_depth, 1)
Example 26
def test_min_impurity_split(GBEstimator):
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = GBEstimator(min_impurity_split=0.1)
    est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                               est.fit, X, y)
    for tree in est.estimators_.flat:
        assert_equal(tree.min_impurity_split, 0.1)