Python source code examples: sklearn.ensemble.ExtraTreesRegressor()
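As a quick orientation before the project-level examples, here is a minimal, self-contained sketch of the fit/score pattern that each example below embeds in a larger codebase; the synthetic data and parameter values are illustrative, not taken from any of the projects:

from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = ExtraTreesRegressor(n_estimators=100, random_state=0, n_jobs=-1)
reg.fit(X_train, y_train)
print(reg.score(X_test, y_test))          # R^2 on held-out data
print(reg.feature_importances_.round(3))  # impurity-based importances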
Example 1
def fit(self, X, y):
    """
    Fit an extra-trees regression model to data `X` and targets `y`.

    Parameters
    ----------
    X : array-like
        Input values.
    y : array-like
        Target values.
    """
    self.X = X
    self.y = y
    self.n = self.X.shape[0]
    self.model = ExtraTreesRegressor(**self.params)
    self.model.fit(X, y)
Example 2
def __init__(self, params):
    super(ExtraTreesRegressorAlgorithm, self).__init__(params)
    logger.debug("ExtraTreesRegressorAlgorithm.__init__")
    self.library_version = sklearn.__version__
    self.trees_in_step = regression_additional.get("trees_in_step", 100)
    self.max_steps = regression_additional.get("max_steps", 50)
    self.early_stopping_rounds = regression_additional.get(
        "early_stopping_rounds", 50
    )
    self.model = ExtraTreesRegressor(
        n_estimators=self.trees_in_step,
        criterion=params.get("criterion", "mse"),
        max_features=params.get("max_features", 0.8),
        min_samples_split=params.get("min_samples_split", 4),
        warm_start=True,
        n_jobs=-1,
        random_state=params.get("seed", 1),
    )
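Example 2 relies on `warm_start=True` together with `trees_in_step` so the forest can be grown in increments and training stopped early once a validation metric stalls. A minimal sketch of such a loop follows; the split, metric, and step counts are illustrative assumptions, not the actual mljar-supervised training logic:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=10, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

trees_in_step, max_steps = 100, 5
model = ExtraTreesRegressor(n_estimators=trees_in_step, warm_start=True,
                            n_jobs=-1, random_state=1)
best_err = np.inf
for step in range(max_steps):
    model.fit(X_tr, y_tr)  # with warm_start, refitting keeps earlier trees
    err = mean_squared_error(y_val, model.predict(X_val))
    if err >= best_err:
        break  # validation error stopped improving: stop adding trees
    best_err = err
    model.n_estimators += trees_in_step  # request more trees for the next fit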
Example 3
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert_equal(tree.min_impurity_split, 0.1)
Example 4
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1)
Example 5
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        model = ExtraTreesClassifier(**self.params)
    else:
        model = ExtraTreesRegressor(**self.params)

    # Replace missing values with a value smaller than all observed values
    self.min = dict()
    for col in X.names:
        XX = X[:, col]
        self.min[col] = XX.min1()
        if self.min[col] is None or np.isnan(self.min[col]):
            self.min[col] = -1e10
        else:
            self.min[col] -= 1
        XX.replace(None, self.min[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()

    model.fit(X, y)
    importances = np.array(model.feature_importances_)
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=self.params['n_estimators'])
Example 6
def __init__(self, **params):
    """
    Wrapper around sklearn's ExtraTreesRegressor implementation for pyGPGO.
    Extra-trees forests, like random forests, can be used as surrogate models
    in Bayesian optimization. An estimate of the 'posterior' variance can be
    obtained by using the `impurity` criterion value of each subtree.

    Parameters
    ----------
    params : dict, optional
        Any parameters to pass to `ExtraTreesRegressor`. Defaults to sklearn's.
    """
    self.params = params
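The docstring above mentions a 'posterior' variance estimate for Bayesian optimization. One common way to obtain a mean/variance surrogate from a fitted forest, sketched below, is the spread of the per-tree predictions; this is an illustrative approximation, not necessarily pyGPGO's exact formula:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor

X, y = make_regression(n_samples=300, n_features=5, random_state=0)
forest = ExtraTreesRegressor(n_estimators=200, random_state=0).fit(X, y)

per_tree = np.stack([tree.predict(X[:5]) for tree in forest.estimators_])
mean = per_tree.mean(axis=0)  # agrees with forest.predict(X[:5])
var = per_tree.var(axis=0)    # spread across trees as a variance proxy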
Example 7
def __init__(self, algorithm, params=None):
    '''
    Initialize the class with a list of possible algorithms and
    recommended hyperparameter ranges.
    '''
    if algorithm == 'etr':  # Extra-trees regressor
        from sklearn.ensemble import ExtraTreesRegressor
        self.hyper_range = {"max_depth": [4, 8, 12, 16, 20],
                            "min_samples_split": np.arange(2, 11),
                            "min_samples_leaf": np.arange(1, 11),
                            "n_estimators": np.arange(10, 801, 40)}
        self.algorithm = ExtraTreesRegressor()
    elif algorithm == 'gbm':  # Gradient boosting model
        from sklearn.ensemble import GradientBoostingRegressor
        self.hyper_range = {"max_depth": [4, 8, 12, 16, 20],
                            "min_samples_split": np.arange(2, 11),
                            "min_samples_leaf": np.arange(1, 11),
                            "n_estimators": np.arange(10, 801, 40)}
        self.algorithm = GradientBoostingRegressor()
    elif algorithm == 'gam':  # Generalized additive model
        from pygam import GAM
        self.hyper_range = {'n_splines': np.arange(5, 40)}
        self.algorithm = GAM()

    # Set the scorer to R^2
    self.my_scorer = make_scorer(r2_score, greater_is_better=True)
Example 8
def __init__(self, options):
    self.handle_options(options)
    params = options.get('params', {})
    out_params = convert_params(
        params,
        floats=['max_samples', 'min_samples_split', 'min_samples_leaf',
                'min_weight_fraction_leaf', 'max_features', 'min_impurity_split'],
        bools=['bootstrap', 'oob_score', 'warm_start'],
        ints=['n_estimators', 'max_depth', 'max_leaf_nodes', 'min_impurity_decrease'],
        strs=['criterion'],
    )
    self.estimator = _ExtraTreesRegressor(**out_params)
Example 9
def test_export_pipeline_5():
    """Assert that export_pipeline() generates a compilable source file as expected, given a fixed simple pipeline with SelectFromModel."""
    pipeline_string = (
        'DecisionTreeRegressor(SelectFromModel(input_matrix, '
        'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, '
        'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,'
        'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj_reg._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.05, n_estimators=100), threshold=0.05),
    DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj_reg.operators, tpot_obj_reg._pset)
Example 10
def get_ensemble_models():
    rf = RandomForestRegressor(
        n_estimators=51, min_samples_leaf=5, min_samples_split=3, random_state=42,
        n_jobs=int(0.8 * n_cores))
    bag = BaggingRegressor(n_estimators=51, random_state=42, n_jobs=int(0.8 * n_cores))
    extra = ExtraTreesRegressor(n_estimators=71, random_state=42, n_jobs=int(0.8 * n_cores))
    ada = AdaBoostRegressor(random_state=42)
    grad = GradientBoostingRegressor(n_estimators=101, random_state=42)
    classifier_list = [rf, bag, extra, ada, grad]
    classifier_name_list = ['Random Forests', 'Bagging',
                            'Extra Trees', 'AdaBoost', 'Gradient Boost']
    return classifier_list, classifier_name_list
Example 11
def fit_predict(self, dfit, dpre, tournament):
    clf = ETC(criterion='mse',
              max_features=self.p['nfeatures'],
              max_depth=self.p['depth'],
              n_estimators=self.p['ntrees'],
              random_state=self.p['seed'],
              n_jobs=-1)
    clf.fit(dfit.x, dfit.y[tournament])
    yhat = clf.predict(dpre.x)
    return dpre.ids, yhat
Example 12
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.ensemble.AdaBoostClassifier,
                  ensemble.AdaBoostClassifier)
    self.assertIs(df.ensemble.AdaBoostRegressor,
                  ensemble.AdaBoostRegressor)
    self.assertIs(df.ensemble.BaggingClassifier,
                  ensemble.BaggingClassifier)
    self.assertIs(df.ensemble.BaggingRegressor,
                  ensemble.BaggingRegressor)
    self.assertIs(df.ensemble.ExtraTreesClassifier,
                  ensemble.ExtraTreesClassifier)
    self.assertIs(df.ensemble.ExtraTreesRegressor,
                  ensemble.ExtraTreesRegressor)
    self.assertIs(df.ensemble.GradientBoostingClassifier,
                  ensemble.GradientBoostingClassifier)
    self.assertIs(df.ensemble.GradientBoostingRegressor,
                  ensemble.GradientBoostingRegressor)
    self.assertIs(df.ensemble.IsolationForest,
                  ensemble.IsolationForest)
    self.assertIs(df.ensemble.RandomForestClassifier,
                  ensemble.RandomForestClassifier)
    self.assertIs(df.ensemble.RandomTreesEmbedding,
                  ensemble.RandomTreesEmbedding)
    self.assertIs(df.ensemble.RandomForestRegressor,
                  ensemble.RandomForestRegressor)
    self.assertIs(df.ensemble.VotingClassifier,
                  ensemble.VotingClassifier)
Example 13
def _get_learner(self):
    # xgboost
    if self.learner_name in ["reg_xgb_linear", "reg_xgb_tree", "reg_xgb_tree_best_single_model"]:
        return XGBRegressor(**self.param_dict)
    if self.learner_name in ["clf_xgb_linear", "clf_xgb_tree"]:
        return XGBClassifier(**self.param_dict)
    # sklearn
    if self.learner_name == "reg_skl_lasso":
        return Lasso(**self.param_dict)
    if self.learner_name == "reg_skl_ridge":
        return Ridge(**self.param_dict)
    if self.learner_name == "reg_skl_random_ridge":
        return RandomRidge(**self.param_dict)
    if self.learner_name == "reg_skl_bayesian_ridge":
        return BayesianRidge(**self.param_dict)
    if self.learner_name == "reg_skl_svr":
        return SVR(**self.param_dict)
    if self.learner_name == "reg_skl_lsvr":
        return LinearSVR(**self.param_dict)
    if self.learner_name == "reg_skl_knn":
        return KNNRegressor(**self.param_dict)
    if self.learner_name == "reg_skl_etr":
        return ExtraTreesRegressor(**self.param_dict)
    if self.learner_name == "reg_skl_rf":
        return RandomForestRegressor(**self.param_dict)
    if self.learner_name == "reg_skl_gbm":
        return GradientBoostingRegressor(**self.param_dict)
    if self.learner_name == "reg_skl_adaboost":
        return AdaBoostRegressor(**self.param_dict)
    # keras
    if self.learner_name == "reg_keras_dnn":
        try:
            return KerasDNNRegressor(**self.param_dict)
        except Exception:
            return None
    # rgf
    if self.learner_name == "reg_rgf":
        return RGFRegressor(**self.param_dict)
    # ensemble
    if self.learner_name == "reg_ensemble":
        return EnsembleLearner(**self.param_dict)
    return None
Example 14
def test_distribution():
    rng = check_random_state(12321)

    # Single variable with 4 values
    X = rng.randint(0, 4, size=(1000, 1))
    y = rng.rand(1000)
    n_trees = 500
    clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y)

    uniques = defaultdict(int)
    for tree in clf.estimators_:
        tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
                       for f, t in zip(tree.tree_.feature,
                                       tree.tree_.threshold))
        uniques[tree] += 1
    uniques = sorted([(1. * count / n_trees, tree)
                      for tree, count in uniques.items()])

    # On a single variable problem where X_0 has 4 equiprobable values, there
    # are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of
    # them has probability 1/3 while the 4 others have probability 1/6.
    assert_equal(len(uniques), 5)
    assert_greater(0.20, uniques[0][0])  # Rough approximation of 1/6.
    assert_greater(0.20, uniques[1][0])
    assert_greater(0.20, uniques[2][0])
    assert_greater(0.20, uniques[3][0])
    assert_greater(uniques[4][0], 0.3)
    assert_equal(uniques[4][1], "0,1/0,0/--0,2/--")

    # Two variables, one with 2 values, one with 3 values
    X = np.empty((1000, 2))
    X[:, 0] = np.random.randint(0, 2, 1000)
    X[:, 1] = np.random.randint(0, 3, 1000)
    y = rng.rand(1000)

    clf = ExtraTreesRegressor(n_estimators=100, max_features=1,
                              random_state=1).fit(X, y)

    uniques = defaultdict(int)
    for tree in clf.estimators_:
        tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-")
                       for f, t in zip(tree.tree_.feature,
                                       tree.tree_.threshold))
        uniques[tree] += 1
    uniques = [(count, tree) for tree, count in uniques.items()]
    assert_equal(len(uniques), 8)
Example 15
def _predict_forest(model, X, joint_contribution=False):
    """
    For a given RandomForestRegressor, RandomForestClassifier,
    ExtraTreesRegressor, or ExtraTreesClassifier, returns a triple of
    [prediction, bias and feature_contributions], such that prediction ≈ bias +
    feature_contributions.
    """
    if joint_contribution:
        biases = []
        contributions = []
        predictions = []

        for tree in model.estimators_:
            pred, bias, contribution = _predict_tree(tree, X, joint_contribution=joint_contribution)
            biases.append(bias)
            contributions.append(contribution)
            predictions.append(pred)

        total_contributions = []
        for i in range(len(X)):
            contr = {}
            for j, dct in enumerate(contributions):
                for k in set(dct[i]).union(set(contr.keys())):
                    # Running mean of each feature tuple's contribution over trees
                    contr[k] = (contr.get(k, 0) * j + dct[i].get(k, 0)) / (j + 1)
            total_contributions.append(contr)

        return (np.mean(predictions, axis=0), np.mean(biases, axis=0),
                total_contributions)
    else:
        mean_pred = None
        mean_bias = None
        mean_contribution = None

        for i, tree in enumerate(model.estimators_):
            pred, bias, contribution = _predict_tree(tree, X)
            if i < 1:  # first iteration
                mean_bias = bias
                mean_contribution = contribution
                mean_pred = pred
            else:
                mean_bias = _iterative_mean(i, mean_bias, bias)
                mean_contribution = _iterative_mean(i, mean_contribution, contribution)
                mean_pred = _iterative_mean(i, mean_pred, pred)

        return mean_pred, mean_bias, mean_contribution
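`_iterative_mean` is not shown in this example; a sketch consistent with how it is called here (a running mean over trees, so no list of per-tree arrays has to be kept) would be:

def _iterative_mean(iter_count, current_mean, x):
    # Running mean after `iter_count` items, updated with the new item `x`:
    # new_mean = old_mean + (x - old_mean) / (iter_count + 1)
    return current_mean + (x - current_mean) / (iter_count + 1)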
Example 16
def predict(model, X, joint_contribution=False):
    """ Returns a triple (prediction, bias, feature_contributions), such
    that prediction ≈ bias + feature_contributions.

    Parameters
    ----------
    model : DecisionTreeRegressor, DecisionTreeClassifier,
        ExtraTreeRegressor, ExtraTreeClassifier,
        RandomForestRegressor, RandomForestClassifier,
        ExtraTreesRegressor, ExtraTreesClassifier
        Scikit-learn model on which the prediction should be decomposed.
    X : array-like, shape = (n_samples, n_features)
        Test samples.
    joint_contribution : boolean
        Specifies whether contributions are given individually for each
        feature, or jointly over them.

    Returns
    -------
    decomposed prediction : triple of
        * prediction, shape = (n_samples,) for regression and
          (n_samples, n_classes) for classification
        * bias, shape = (n_samples,) for regression and (n_samples, n_classes)
          for classification
        * contributions: if joint_contribution is False, an array of
          shape = (n_samples, n_features) for regression or
          (n_samples, n_features, n_classes) for classification, denoting the
          contribution from each feature.
          If joint_contribution is True, an array of size n_samples, where
          each element is a dict mapping a tuple of feature indices to the
          contribution from that feature tuple.
    """
    # Only a single output response variable is supported.
    if model.n_outputs_ > 1:
        raise ValueError("Multilabel classification trees not supported")

    if (isinstance(model, DecisionTreeClassifier) or
            isinstance(model, DecisionTreeRegressor)):
        return _predict_tree(model, X, joint_contribution=joint_contribution)
    elif (isinstance(model, RandomForestClassifier) or
          isinstance(model, ExtraTreesClassifier) or
          isinstance(model, RandomForestRegressor) or
          isinstance(model, ExtraTreesRegressor)):
        return _predict_forest(model, X, joint_contribution=joint_contribution)
    else:
        raise ValueError("Wrong model type. Base learner needs to be a "
                         "DecisionTreeClassifier or DecisionTreeRegressor.")