Python source code examples: sklearn.ensemble.GradientBoostingRegressor()
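Before the collected examples, a minimal fit/predict sketch on synthetic data; the parameter values here are illustrative and not taken from any example below.

from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# synthetic regression data (illustrative)
X, y = make_friedman1(n_samples=1200, noise=1.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=3, random_state=0)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))  # R^2 on the held-out split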
Example 1
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)

    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
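A hedged usage sketch for build_ensemble: the snippet assumes mlens' SuperLearner plus the scikit-learn estimators and a SEED constant are already imported; the fold count and data below are illustrative.

from mlens.ensemble import SuperLearner  # assumed source of SuperLearner

SEED = 2017  # illustrative; the snippet above only references SEED
ensemble = build_ensemble(folds=5, random_state=SEED)
ensemble.fit(X_train, y_train)    # X_train, y_train: any regression data
y_pred = ensemble.predict(X_test)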
Example 2
def test_partial_dependence_sample_weight():
    # Test near perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    N = 1000
    rng = np.random.RandomState(123456)

    mask = rng.randint(2, size=N, dtype=bool)
    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    grid = np.arange(0, 1, 0.01)
    pdp = partial_dependence(clf, [1], grid=grid)

    assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99
Example 3
def test_regressor_parameter_checks():
    # Check input parameter validation for GradientBoostingRegressor
    assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2",
                         GradientBoostingRegressor(loss='huber', alpha=1.2)
                         .fit, X, y)
    assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2",
                         GradientBoostingRegressor(loss='quantile', alpha=1.2)
                         .fit, X, y)
    assert_raise_message(ValueError, "Invalid value for max_features: "
                         "'invalid'. Allowed string values are 'auto', 'sqrt'"
                         " or 'log2'.",
                         GradientBoostingRegressor(max_features='invalid').fit,
                         X, y)
    assert_raise_message(ValueError, "n_iter_no_change should either be None"
                         " or an integer. 'invalid' was passed",
                         GradientBoostingRegressor(n_iter_no_change='invalid')
                         .fit, X, y)
    allowed_presort = ('auto', True, False)
    assert_raise_message(ValueError,
                         "'presort' should be in {}. "
                         "Got 'invalid' instead.".format(allowed_presort),
                         GradientBoostingRegressor(presort='invalid')
                         .fit, X, y)
Example 4
def test_check_inputs_predict():
    # X has wrong shape
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y)

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, rng.rand(len(X)))

    x = np.array([1.0, 2.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)

    x = np.array([[]])
    assert_raises(ValueError, clf.predict, x)

    x = np.array([1.0, 2.0, 3.0])[:, np.newaxis]
    assert_raises(ValueError, clf.predict, x)
Example 5
def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    assert_raises(ValueError, lambda X: np.fromiter(
        clf.staged_predict(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)

    assert_array_almost_equal(y_pred, y)
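A hedged follow-on sketch: staged_predict also makes it easy to track held-out error as boosting progresses and pick the best number of stages; reg, X_test and y_test below stand for any fitted GradientBoostingRegressor and a held-out split, not objects from the test above.

import numpy as np
from sklearn.metrics import mean_squared_error

# held-out MSE after each boosting stage
stage_mse = [mean_squared_error(y_test, y_stage)
             for y_stage in reg.staged_predict(X_test)]
best_n_estimators = int(np.argmin(stage_mse)) + 1  # stages are 1-based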
Example 6
def test_warm_start(Cls):
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=200, max_depth=1)
    est.fit(X, y)

    est_ws = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    if Cls is GradientBoostingRegressor:
        assert_array_almost_equal(est_ws.predict(X), est.predict(X))
    else:
        # Random state is preserved and hence predict_proba must also be
        # same
        assert_array_equal(est_ws.predict(X), est.predict(X))
        assert_array_almost_equal(est_ws.predict_proba(X),
                                  est.predict_proba(X))
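A hedged sketch of the warm-start pattern this test exercises: with warm_start=True, raising n_estimators and calling fit again keeps the existing trees and only grows the new ones (X and y stand for any regression data).

reg = GradientBoostingRegressor(n_estimators=50, warm_start=True, random_state=0)
reg.fit(X, y)                     # fits the first 50 trees

reg.set_params(n_estimators=100)  # request 50 additional trees
reg.fit(X, y)                     # existing trees are kept, new ones are added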
Example 7
def test_gradient_boosting_with_init(gb, dataset_maker, init_estimator):
    # Check that GradientBoostingRegressor works when init is a sklearn
    # estimator.
    # Check that an error is raised if trying to fit with sample weight but
    # initial estimator does not support sample weight
    X, y = dataset_maker()
    sample_weight = np.random.RandomState(42).rand(100)

    # init supports sample weights
    init_est = init_estimator()
    gb(init=init_est).fit(X, y, sample_weight=sample_weight)

    # init does not support sample weights
    init_est = _NoSampleWeightWrapper(init_estimator())
    gb(init=init_est).fit(X, y)  # ok no sample weights
    with pytest.raises(ValueError,
                       match="estimator.*does not support sample weights"):
        gb(init=init_est).fit(X, y, sample_weight=sample_weight)
Example 8
def test_multi_target_regression():
    X, y = datasets.make_regression(n_targets=3)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    for n in range(3):
        rgr = GradientBoostingRegressor(random_state=0)
        rgr.fit(X_train, y_train[:, n])
        references[:, n] = rgr.predict(X_test)

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    assert_almost_equal(references, y_pred)
Example 9
def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2., 1.]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))
Example 10
def __init__(self, q1=.16, q2=.84, **params):
    """
    Gradient boosted trees as surrogate model for Bayesian Optimization.

    Uses quantile regression for an estimate of the 'posterior' variance.
    In practice, the std is computed as (`q2` - `q1`) / 2.
    Relies on `sklearn.ensemble.GradientBoostingRegressor`.

    Parameters
    ----------
    q1: float
        First quantile.
    q2: float
        Second quantile.
    params: dict
        Extra parameters to pass to `GradientBoostingRegressor`.
    """
    self.params = params
    self.q1 = q1
    self.q2 = q2
    self.eps = 1e-1
Example 11
def fit(self, X, y):
    """
    Fit a GBM model to data `X` and targets `y`.

    Parameters
    ----------
    X : array-like
        Input values.
    y : array-like
        Target values.
    """
    self.X = X
    self.y = y
    self.n = self.X.shape[0]

    self.modq1 = GradientBoostingRegressor(loss='quantile', alpha=self.q1, **self.params)
    self.modq2 = GradientBoostingRegressor(loss='quantile', alpha=self.q2, **self.params)
    self.mod = GradientBoostingRegressor(loss='ls', **self.params)

    self.modq1.fit(self.X, self.y)
    self.modq2.fit(self.X, self.y)
    self.mod.fit(self.X, self.y)
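The matching predict method is not part of these snippets; a hedged sketch of what it could look like, following the docstring in Example 10 (std estimated as (q2 - q1) / 2, floored by self.eps, numpy assumed imported as np), might be:

def predict(self, X_new, return_std=True):
    """Hypothetical counterpart to fit() above (not from the original project)."""
    mean = self.mod.predict(X_new)
    # spread between the upper and lower quantile models as a std proxy
    std = (self.modq2.predict(X_new) - self.modq1.predict(X_new)) / 2
    std = np.maximum(std, self.eps)  # keep the 'posterior' std strictly positive
    return (mean, std) if return_std else mean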
Example 12
def test_boston_OHE_plus_trees(self):
    data = load_boston()

    pl = Pipeline(
        [
            ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
            ("Trees", GradientBoostingRegressor(random_state=1)),
        ]
    )
    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, "target")

    if _is_macos() and _macos_version() >= (10, 13):
        # Get predictions
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df["prediction"] = pl.predict(data.data)

        # Evaluate it
        result = evaluate_regressor(spec, df, "target", verbose=False)
        assert result["max_error"] < 0.0001
Example 13
def train_model(self, train_file_path, model_path):
    print("==> Load the data ...")
    X_train, Y_train = self.load_file(train_file_path)
    print(train_file_path, shape(X_train))

    print("==> Train the model ...")
    min_max_scaler = preprocessing.MaxAbsScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    clf = GradientBoostingRegressor(n_estimators=self.n_estimators)
    clf.fit(X_train_minmax.toarray(), Y_train)

    print("==> Save the model ...")
    pickle.dump(clf, open(model_path, 'wb'))
    scaler_path = model_path.replace('.pkl', '.scaler.pkl')
    pickle.dump(min_max_scaler, open(scaler_path, 'wb'))
    return clf
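A hedged inference-side counterpart to train_model: the method name, the self.load_file helper and the sparse-matrix toarray() call are assumptions carried over from the snippet above.

def predict_with_model(self, test_file_path, model_path):
    # load the persisted estimator and its matching scaler
    clf = pickle.load(open(model_path, 'rb'))
    scaler = pickle.load(open(model_path.replace('.pkl', '.scaler.pkl'), 'rb'))

    X_test, _ = self.load_file(test_file_path)
    X_test_scaled = scaler.transform(X_test)
    return clf.predict(X_test_scaled.toarray())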
Example 14
def test_same_prediction(self):
    from sklearn.ensemble import GradientBoostingRegressor
    params = {'n_estimators': 1, 'max_depth': 2, 'min_samples_split': 2,
              'learning_rate': 0.8, 'loss': 'ls'}
    sklearn_model = GradientBoostingRegressor(**params)
    sklearn_model.fit(self.data.X.values, self.data.y.values)
    sklearn_tree = sklearn_model.estimators_[0][0].tree_

    bartpy_tree = Tree([LeafNode(Split(self.data))])
    map_sklearn_tree_into_bartpy(bartpy_tree, sklearn_tree)

    sklearn_predictions = sklearn_tree.predict(self.data.X.values.astype(np.float32))
    sklearn_predictions = [round(x, 2) for x in sklearn_predictions.reshape(-1)]

    bartpy_tree.cache_up_to_date = False
    bartpy_tree_predictions = bartpy_tree.predict(self.data.X.values)
    bartpy_tree_predictions = [round(x, 2) for x in bartpy_tree_predictions]

    self.assertListEqual(sklearn_predictions, bartpy_tree_predictions)
Example 15
def grid_search(X, y, split, learn=[.01], samples_leaf=[250, 350, 500],
                depth=[10, 15]):
    '''
    Runs a grid search for GBM on split data
    '''
    for l in learn:
        for s in samples_leaf:
            for d in depth:
                model = GradientBoostingRegressor(n_estimators=250,
                                                  learning_rate=l,
                                                  min_samples_leaf=s,
                                                  max_depth=d,
                                                  random_state=42)
                model.fit(X.values[:split], y.values[:split])
                in_score = model.score(X.values[:split], y.values[:split])
                out_score = model.score(X.values[split:], y.values[split:])

                print('learning_rate: {}, min_samples_leaf: {}, max_depth: {}'
                      .format(l, s, d))
                print('in-sample score:', in_score)
                print('out-sample score:', out_score)
                print('')
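For comparison, a hedged sketch of the same search written with sklearn's built-in GridSearchCV; the cv and scoring choices are illustrative, while X, y and split are the same objects passed to grid_search above.

from sklearn.model_selection import GridSearchCV

param_grid = {'learning_rate': [.01],
              'min_samples_leaf': [250, 350, 500],
              'max_depth': [10, 15]}
search = GridSearchCV(GradientBoostingRegressor(n_estimators=250, random_state=42),
                      param_grid, cv=3, scoring='r2')
search.fit(X.values[:split], y.values[:split])
print(search.best_params_, search.best_score_)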
Example 16
def grid_search(X, y, split, learn=[.01], samples_leaf=[250, 350, 500],
                depth=[10, 15]):
    '''
    Runs a grid search for GBM on split data
    '''
    for l in learn:
        for s in samples_leaf:
            for d in depth:
                model = GradientBoostingRegressor(n_estimators=250,
                                                  learning_rate=l,
                                                  min_samples_leaf=s,
                                                  max_depth=d,
                                                  random_state=42)
                model.fit(X.values[:split], y.values[:split])
                in_score = model.score(X.values[:split], y.values[:split])
                out_score = model.score(X.values[split:], y.values[split:])

                print('learning_rate: {}, min_samples_leaf: {}, max_depth: {}'
                      .format(l, s, d))
                print('in-sample score:', in_score)
                print('out-sample score:', out_score)
                print('')
Example 17
def para_adaboost(data):
    ''' para_adaboost(data)
        kernel function for parallel computing adaboost classifier
        data: training data containing features and labels in a tuple
        Return: adaboost classifier model '''
    model = GradientBoostingRegressor(
        learning_rate=1,
        n_estimators=1000,
        max_depth=1,
        random_state=0
    )
    patch, label = data
    model = model.fit(patch, label)
    return model
Example 18
def __init__(self, options):
    self.handle_options(options)

    params = options.get('params', {})
    out_params = convert_params(
        params,
        strs=['loss', 'max_features'],
        floats=['learning_rate', 'min_weight_fraction_leaf', 'alpha', 'subsample'],
        ints=['n_estimators', 'max_depth', 'min_samples_split',
              'min_samples_leaf', 'max_leaf_nodes', 'random_state'],
    )

    valid_loss = ['ls', 'lad', 'huber', 'quantile']
    if 'loss' in out_params:
        if out_params['loss'] not in valid_loss:
            msg = "loss must be one of: {}".format(', '.join(valid_loss))
            raise RuntimeError(msg)

    if 'max_features' in out_params:
        out_params['max_features'] = handle_max_features(out_params['max_features'])

    self.estimator = _GradientBoostingRegressor(**out_params)
Example 19
def test_register_model(self, boston_dataset):
    pytest.importorskip('sklearn')
    from sasctl import register_model
    from sklearn.ensemble import GradientBoostingRegressor

    TARGET = 'Price'

    X = boston_dataset.drop(TARGET, axis=1)
    y = boston_dataset[TARGET]

    model = GradientBoostingRegressor()
    model.fit(X, y)

    model = register_model(model, self.MODEL_NAME, self.PROJECT_NAME, input=X, force=True)

    assert model.name == self.MODEL_NAME
    assert model.projectName == self.PROJECT_NAME
    assert model.function.lower() == 'prediction'
    assert model.algorithm.lower() == 'gradient boosting'
    assert model.targetLevel.lower() == 'interval'
    assert model.tool.lower().startswith('python')
Example 20
def ensure_many_models(self):
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.linear_model import ElasticNet, RANSACRegressor, HuberRegressor, PassiveAggressiveRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.svm import SVR, LinearSVR

    import warnings
    from sklearn.exceptions import ConvergenceWarning
    warnings.filterwarnings('ignore', category=ConvergenceWarning)

    for learner in [GradientBoostingRegressor, RandomForestRegressor, MLPRegressor,
                    ElasticNet, RANSACRegressor, HuberRegressor, PassiveAggressiveRegressor,
                    KNeighborsRegressor, SVR, LinearSVR]:
        learner = learner()
        learner_name = str(learner).split("(", maxsplit=1)[0]
        with self.subTest("Test fit using {learner}".format(learner=learner_name)):
            model = self.estimator.__class__(learner)
            model.fit(self.data_lin["X"], self.data_lin["a"], self.data_lin["y"])
            self.assertTrue(True)  # Fit did not crash
Example 21
def __init__(self, numFeatures, numSamples, randomSeed):
    """
    :param numFeatures: total number of features to be used (at least 5)
    :param numSamples: number of samples in dataset
    :param randomSeed: random seed value used for reproducible results
    """
    self.numFeatures = numFeatures
    self.numSamples = numSamples
    self.randomSeed = randomSeed

    # generate test data:
    self.X, self.y = datasets.make_friedman1(n_samples=self.numSamples, n_features=self.numFeatures,
                                             noise=self.NOISE, random_state=self.randomSeed)

    # divide the data to a training set and a validation set:
    self.X_train, self.X_validation, self.y_train, self.y_validation = \
        model_selection.train_test_split(self.X, self.y, test_size=self.VALIDATION_SIZE,
                                         random_state=self.randomSeed)

    self.regressor = GradientBoostingRegressor(random_state=self.randomSeed)
Example 22
def test_model_ransac_regressor_tree(self):
    model, X = fit_regression_model(
        linear_model.RANSACRegressor(
            base_estimator=GradientBoostingRegressor()))
    model_onnx = convert_sklearn(
        model, "ransac regressor",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        verbose=False,
        basename="SklearnRANSACRegressorTree-Dec3",
        allow_failure="StrictVersion("
                      "onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example 23
def test_GradientBoostingRegression(self):
    boston = datasets.load_boston()
    df = pdml.ModelFrame(boston)

    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 0.9,
              'learning_rate': 0.01, 'loss': 'ls', 'random_state': self.random_state}
    clf1 = ensemble.GradientBoostingRegressor(**params)
    clf2 = df.ensemble.GradientBoostingRegressor(**params)

    clf1.fit(boston.data, boston.target)
    df.fit(clf2)

    expected = clf1.predict(boston.data)
    predicted = df.predict(clf2)
    self.assertIsInstance(predicted, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(predicted.values, expected)

    self.assertAlmostEqual(df.metrics.mean_squared_error(),
                           metrics.mean_squared_error(boston.target, expected))
Example 24
def GBDT_First(self, data, max_depth=17, n_estimators=57):
    model = GradientBoostingRegressor(loss='ls', n_estimators=n_estimators, max_depth=max_depth,
                                      learning_rate=0.12, subsample=0.8)
    model.fit(data['train'][:, :-1], data['train'][:, -1])
    # Note the different storage for validation-set and prediction-set results
    # Predictions on the training set
    xul = model.predict(data['train'][:, :-1])
    # Predictions on the validation set
    yanre = model.predict(data['test'][:, :-1])
    # Predictions on the prediction set
    prer = model.predict(data['predict'][:, :-1])
    # Store them
    self.yanzhneg_pr.append(yanre)
    self.predi.append(prer)
    # Compute the training, validation and prediction errors separately
    # After each fold, compute the errors for the training, validation and prediction data
    xx = self.RMSE(xul, data['train'][:, -1])
    yy = self.RMSE(yanre, data['test'][:, -1])
    pp = self.RMSE(prer, data['predict'][:, -1])
    # Store the errors
    self.error_dict['GBDT'] = [xx, yy, pp]
    # True outputs of the validation set
    self.yanzhneg_real = data['test'][:, -1]
    # True outputs of the prediction set
    self.preal = data['predict'][:, -1]
    return print('GBDT in layer 1 finished')
Example 25
def Train(data, modelcount, censhu, yanzhgdata):
    model = GradientBoostingRegressor(loss='ls', n_estimators=modelcount, max_depth=censhu,
                                      learning_rate=0.12, subsample=0.8)
    model.fit(data[:, :-1], data[:, -1])
    # Predictions on the training data
    train_out = model.predict(data[:, :-1])
    # Compute the MSE
    train_mse = mse(data[:, -1], train_out)
    # Predictions on the validation data
    add_yan = model.predict(yanzhgdata[:, :-1])
    # Compute the MSE
    add_mse = mse(yanzhgdata[:, -1], add_yan)
    print(train_mse, add_mse)
    return train_mse, add_mse
Example 26
def recspre(exstr, predata, datadict, zhe, count=100):
    tree, te = exstr.split('-')
    model = GradientBoostingRegressor(loss='ls', n_estimators=int(tree), max_depth=int(te),
                                      learning_rate=0.12, subsample=0.8)
    model.fit(datadict[zhe]['train'][:, :-1], datadict[zhe]['train'][:, -1])

    # Predict
    yucede = model.predict(predata[:, :-1])
    # For readability, show only `count` randomly chosen samples
    zongleng = np.arange(len(yucede))
    randomnum = np.random.choice(zongleng, count, replace=False)

    yucede_se = list(np.array(yucede)[randomnum])
    yuce_re = list(np.array(predata[:, -1])[randomnum])

    # Plot the comparison
    plt.figure(figsize=(17, 9))
    plt.subplot(2, 1, 1)
    plt.plot(list(range(len(yucede_se))), yucede_se, 'r--', label='predicted', lw=2)
    plt.scatter(list(range(len(yuce_re))), yuce_re, c='b', marker='.', label='actual', lw=2)
    plt.xlim(-1, count + 1)
    plt.legend()
    plt.title('Predicted vs. actual values [max number of trees: %d]' % int(tree))

    plt.subplot(2, 1, 2)
    plt.plot(list(range(len(yucede_se))), np.array(yuce_re) - np.array(yucede_se), 'k--',
             marker='s', label='actual - predicted', lw=2)
    plt.legend()
    plt.title('Error between predicted and actual values')
    plt.savefig(r'C:\Users\GWT9\Desktop\duibi.jpg')
    return 'Prediction vs. actual comparison finished'
Example 27
def test_partial_dependence_regressor():
    # Test partial dependence for regressor
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    pdp, axes = partial_dependence(
        clf, [0], X=boston.data, grid_resolution=grid_resolution)

    assert pdp.shape == (1, grid_resolution)
    assert axes[0].shape[0] == grid_resolution
Example 28
def test_plot_partial_dependence(pyplot):
    # Test partial dependence plot function.
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with str features and array feature names
    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
                                                          ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with list feature_names
    feature_names = boston.feature_names.tolist()
    fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN',
                                                          ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)
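Examples 27 to 29 rely on the old ensemble.partial_dependence helpers, which newer scikit-learn releases deprecate in favour of sklearn.inspection; a hedged sketch of the equivalent calls on a fitted regressor clf and feature matrix X:

from sklearn.inspection import partial_dependence, PartialDependenceDisplay

# numerical partial dependence of feature 0
pd_result = partial_dependence(clf, X, features=[0], grid_resolution=25)

# one-way and two-way partial dependence plots
PartialDependenceDisplay.from_estimator(clf, X, features=[0, 1, (0, 1)])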
Example 29
def test_raise_deprecation_warning(pyplot, func, params):
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)
    grid_resolution = 25

    warn_msg = "The function ensemble.{} has been deprecated".format(
        func.__name__
    )
    with pytest.warns(DeprecationWarning, match=warn_msg):
        func(clf, **params, grid_resolution=grid_resolution)
Example 30
def check_boston(presort, loss, subsample):
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    ones = np.ones(len(boston.target))
    last_y_pred = None
    for sample_weight in None, ones, 2 * ones:
        clf = GradientBoostingRegressor(n_estimators=100,
                                        loss=loss,
                                        max_depth=4,
                                        subsample=subsample,
                                        min_samples_split=2,
                                        random_state=1,
                                        presort=presort)

        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target,
                sample_weight=sample_weight)
        leaves = clf.apply(boston.data)
        assert_equal(leaves.shape, (506, 100))

        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert_less(mse, 6.0)

        if last_y_pred is not None:
            assert_array_almost_equal(last_y_pred, y_pred)

        last_y_pred = y_pred