Python source code examples: sklearn.ensemble.RandomForestRegressor()
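The examples below are collected from open-source projects. As a point of reference, here is a minimal, self-contained sketch of the basic fit/predict workflow on synthetic data (all names in this sketch are illustrative and not taken from the examples):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression data: 100 samples, 4 features.
rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = X[:, 0] + 2 * X[:, 1] + rng.normal(scale=0.1, size=100)

# Fit a forest and predict; n_estimators is the number of trees.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X, y)
predictions = model.predict(X[:5])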
Example 1
def __init__(self, model_type='classifier', feature_type='fingerprints',
             n_estimators=100, n_ensemble=5):
    super(RandomForestQSAR, self).__init__()
    self.n_estimators = n_estimators
    self.n_ensemble = n_ensemble
    self.model = []
    self.model_type = model_type
    if self.model_type == 'classifier':
        for i in range(n_ensemble):
            self.model.append(RFC(n_estimators=n_estimators))
    elif self.model_type == 'regressor':
        for i in range(n_ensemble):
            self.model.append(RFR(n_estimators=n_estimators))
    else:
        raise ValueError("invalid value '%s' for argument model_type; "
                         "expected 'classifier' or 'regressor'" % model_type)
    self.feature_type = feature_type
    if self.feature_type == 'descriptors':
        self.calc = Calculator(descriptors, ignore_3D=True)
        self.desc_mean = [0] * self.n_ensemble
Example 2
def test_sklearn_regression_overfit(self):
    """Test that sklearn models can overfit simple regression datasets."""
    n_samples = 10
    n_features = 3
    n_tasks = 1

    # Generate a dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    regression_metric = dc.metrics.Metric(dc.metrics.r2_score)
    sklearn_model = RandomForestRegressor()
    model = dc.models.SklearnModel(sklearn_model)

    # Fit the model
    model.fit(dataset)
    model.save()

    # Evaluate the model on its own training data
    scores = model.evaluate(dataset, [regression_metric])
    assert scores[regression_metric.name] > 0.7
Example 3
def get_regressor_fitted(file_path,
                         X_train,
                         X_test,
                         y_train,
                         y_test):
    if os.path.exists(file_path):
        try:
            regressor_fitted = load_sklearn_model(file_path)
        except EOFError as e:
            print(file_path)
            raise e
    else:
        # criterion="mse" and max_features="auto" were removed in recent
        # scikit-learn releases; "squared_error" and 1.0 are the equivalents.
        regressor = RandomForestRegressor(n_estimators=50,
                                          criterion="squared_error",
                                          max_features=1.0,
                                          n_jobs=get_threads_number())
        regressor_fitted = regressor.fit(X_train, y_train)
        store_sklearn_model(file_path, regressor_fitted)
    return regressor_fitted
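The load_sklearn_model and store_sklearn_model helpers are not defined in this snippet. A minimal sketch of what they could look like, assuming they simply wrap joblib (an assumption; the original implementation is not shown):

import joblib

def load_sklearn_model(file_path):
    # Deserialize a previously stored estimator (hypothetical helper).
    return joblib.load(file_path)

def store_sklearn_model(file_path, model):
    # Serialize the fitted estimator to disk (hypothetical helper).
    joblib.dump(model, file_path)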
Example 4
def Train(data, treecount, tezh, yanzhgdata):
    model = RF(n_estimators=treecount, max_features=tezh)
    model.fit(data[:, :-1], data[:, -1])
    # Predictions on the training data
    train_out = model.predict(data[:, :-1])
    # Compute the training MSE
    train_mse = mse(data[:, -1], train_out)
    # Predictions on the validation data
    add_yan = model.predict(yanzhgdata[:, :-1])
    # Compute the validation MSE
    add_mse = mse(yanzhgdata[:, -1], add_yan)
    print(train_mse, add_mse)
    return train_mse, add_mse

# Function for finalizing the chosen combination
Example 5
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}
    est = {'Standard Scaling':
           [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
           [SVR()],
           'No Preprocessing':
           [RandomForestRegressor(random_state=SEED),
            GradientBoostingRegressor()]}
    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
Example 6
def regression_rf(x, y):
    '''
    Estimate a random forest regressor
    '''
    # create the regressor object
    random_forest = en.RandomForestRegressor(
        min_samples_split=80, random_state=666,
        max_depth=5, n_estimators=10)
    # estimate the model
    random_forest.fit(x, y)
    # return the object
    return random_forest

# the file name of the dataset
Example 7
def test_single_condition():
    estimator = ensemble.RandomForestRegressor(n_estimators=2, random_state=1)
    estimator.fit([[1], [2]], [1, 2])
    assembler = assemblers.RandomForestModelAssembler(estimator)
    actual = assembler.assemble()
    expected = ast.BinNumExpr(
        ast.BinNumExpr(
            ast.NumVal(1.0),
            ast.IfExpr(
                ast.CompExpr(
                    ast.FeatureRef(0),
                    ast.NumVal(1.5),
                    ast.CompOpType.LTE),
                ast.NumVal(1.0),
                ast.NumVal(2.0)),
            ast.BinNumOpType.ADD),
        ast.NumVal(0.5),
        ast.BinNumOpType.MUL)
    assert utils.cmp_exprs(actual, expected)
Example 8
def generate_regression_data_and_models():
    # Build the frame from a list of rows; DataFrame.append was removed
    # in pandas 2.0.
    rows = []
    for _ in range(1000):
        a = np.random.normal(0, 1)
        b = np.random.normal(0, 3)
        c = np.random.normal(12, 4)
        rows.append({"A": a, "B": b, "C": c, "target": a + b + c})
    df = pd.DataFrame(rows)
    reg1 = tree.DecisionTreeRegressor()
    reg2 = ensemble.RandomForestRegressor()
    column_names = ["A", "B", "C"]
    target_name = "target"
    X = df[column_names]
    reg1.fit(X, df[target_name])
    reg2.fit(X, df[target_name])
    return df, column_names, target_name, reg1, reg2
Example 9
def fit(self, X, y):
    """
    Fit a Random Forest model to data `X` and targets `y`.

    Parameters
    ----------
    X : array-like
        Input values.
    y : array-like
        Target values.
    """
    self.X = X
    self.y = y
    self.n = self.X.shape[0]
    self.model = RandomForestRegressor(**self.params)
    self.model.fit(X, y)
Example 10
def test_regression(self):
    training_pt = gpd.read_file(ms.meuse)
    training = self.stack_meuse.extract_vector(gdf=training_pt)
    training["zinc"] = training_pt["zinc"]
    training["cadmium"] = training_pt["cadmium"]
    training["copper"] = training_pt["copper"]
    training["lead"] = training_pt["lead"]
    training = training.dropna()

    # single target regression
    regr = RandomForestRegressor(n_estimators=50)
    X = training.loc[:, self.stack_meuse.names]
    y = training["zinc"]
    regr.fit(X, y)
    single_regr = self.stack_meuse.predict(regr)
    self.assertIsInstance(single_regr, Raster)
    self.assertEqual(single_regr.count, 1)

    # multi-target regression
    y = training.loc[:, ["zinc", "cadmium", "copper", "lead"]]
    regr.fit(X, y)
    multi_regr = self.stack_meuse.predict(regr)
    self.assertIsInstance(multi_regr, Raster)
    self.assertEqual(multi_regr.count, 4)
Example 11
def fit(self, losses, configs=None):
    if configs is None:
        configs = [[]] * len(losses)
    # convert learning curves into X and y data
    X = []
    y = []
    for l, c in zip(losses, configs):
        l = self.apply_differencing(l)
        for i in range(self.order, len(l)):
            X.append(np.hstack([l[i - self.order:i], c]))
            y.append(l[i])
    self.X = np.array(X)
    self.y = np.array(y)
    self.rfr = rfr().fit(self.X, self.y)
Example 12
def extend_partial(self, obs_losses, num_steps, config=None):
    # TODO: add variance predictions
    if config is None:
        config = []
    d_losses = self.apply_differencing(obs_losses)
    for t in range(num_steps):
        x = np.hstack([d_losses[-self.order:], config])
        y = self.rfr.predict([x])
        d_losses = np.hstack([d_losses, y])
    prediction = self.invert_differencing(obs_losses, d_losses[-num_steps:])
    return prediction
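Examples 11 and 12 call apply_differencing and invert_differencing, which are not shown. A minimal sketch consistent with how they are used, assuming first-order differencing (hypothetical helpers, not the original code):

import numpy as np

def apply_differencing(losses, order=1):
    # Turn a loss curve into successive differences (order-1 case).
    return np.diff(np.asarray(losses), n=order)

def invert_differencing(obs_losses, d_preds):
    # Reconstruct absolute losses from the last observed value plus the
    # cumulative sum of the predicted differences (order-1 case).
    return obs_losses[-1] + np.cumsum(d_preds)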
Example 13
def test_random_forest_regressor(self):
    for dtype in self.number_data_type.keys():
        scikit_model = RandomForestRegressor(random_state=1)
        data = self.scikit_data["data"].astype(dtype)
        target = self.scikit_data["target"].astype(dtype)
        scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
        test_data = data[0].reshape(1, -1)
        self._check_tree_model(spec, "multiArrayType", "doubleType", 1)
        coreml_model = create_model(spec)
        try:
            self.assertEqual(
                scikit_model.predict(test_data)[0].dtype,
                type(coreml_model.predict({"data": test_data})["target"]),
            )
            self.assertAlmostEqual(
                scikit_model.predict(test_data)[0],
                coreml_model.predict({"data": test_data})["target"],
                msg="{} != {} for Dtype: {}".format(
                    scikit_model.predict(test_data)[0],
                    coreml_model.predict({"data": test_data})["target"],
                    dtype,
                ),
            )
        except RuntimeError:
            print("{} not supported. ".format(dtype))
Example 14
def _train_convert_evaluate_assert(self, **scikit_params):
    """
    Train a scikit-learn model, convert it and then evaluate it with CoreML
    """
    scikit_model = RandomForestRegressor(random_state=1, **scikit_params)
    scikit_model.fit(self.X, self.target)

    # Convert the model
    spec = skl_converter.convert(scikit_model, self.feature_names, self.output_name)

    if _is_macos() and _macos_version() >= (10, 13):
        # Get predictions
        df = pd.DataFrame(self.X, columns=self.feature_names)
        df["prediction"] = scikit_model.predict(self.X)

        # Evaluate it
        metrics = evaluate_regressor(spec, df, verbose=False)
        self._check_metrics(metrics, scikit_params)
Example 15
def test_smoke_regression_methods(regression_test_data, n_jobs):
    """Construct, fit, and predict on a realistic problem."""
    xtrain = regression_test_data['x']
    ytrain = regression_test_data['y']
    rng = np.random.RandomState(17)
    est_list = [('lr', LinearRegression()),
                ('rf', RandomForestRegressor(random_state=rng,
                                             n_estimators=10)),
                ('nnls', NonNegativeLinearRegression())]
    sm = StackedRegressor(est_list, n_jobs=n_jobs)
    sm.fit(xtrain, ytrain)
    sm.predict(xtrain)
    sm.score(xtrain, ytrain)
    with pytest.raises(AttributeError):
        sm.predict_proba(xtrain)
Example 16
def test_smoke_multiout_regression_methods(n_jobs):
    """Construct, fit, and predict on a realistic problem."""
    X, y = make_regression(random_state=7, n_samples=100, n_features=10,
                           n_informative=4, n_targets=2)
    rng = np.random.RandomState(17)
    est_list = [('lr', LinearRegression()),
                ('rf', RandomForestRegressor(random_state=rng,
                                             n_estimators=10)),
                ('metalr', LinearRegression())]
    sm = StackedRegressor(est_list, n_jobs=n_jobs)
    sm.fit(X, y)
    sm.predict(X)
    sm.score(X, y)
    with pytest.raises(AttributeError):
        sm.predict_proba(X)
Example 17
def test_integration_regression(regression_test_data, n_jobs):
    """Construct, fit, and predict on a realistic problem. Compare goodness
    of fit of the stacked model vs. its individual base estimators.
    """
    xtrain = regression_test_data['x']
    ytrain = regression_test_data['y']
    xtest = regression_test_data['xtest']
    ytest = regression_test_data['ytest']
    sr = StackedRegressor([('rf', RandomForestRegressor(random_state=7,
                                                        n_estimators=10)),
                           ('lr', LinearRegression()),
                           ('metalr', NonNegativeLinearRegression())],
                          n_jobs=n_jobs)
    rf = RandomForestRegressor(random_state=7, n_estimators=10)
    lr = LinearRegression()
    sr_mse = fit_predict_measure_reg(sr, xtrain, ytrain, xtest, ytest)
    rf_mse = fit_predict_measure_reg(rf, xtrain, ytrain, xtest, ytest)
    lr_mse = fit_predict_measure_reg(lr, xtrain, ytrain, xtest, ytest)
    # The stacked regressor should outperform its base estimators on this
    # data.
    assert sr_mse < rf_mse
    assert sr_mse < lr_mse
    assert sr_mse < 1.5  # Sanity check
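fit_predict_measure_reg is not shown in this example. Judging from how it is called, a plausible minimal sketch (an assumption, not the actual test helper) is:

from sklearn.metrics import mean_squared_error

def fit_predict_measure_reg(model, xtrain, ytrain, xtest, ytest):
    # Fit on the training split and return the test-set MSE.
    model.fit(xtrain, ytrain)
    return mean_squared_error(ytest, model.predict(xtest))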
Example 18
def __init__(self, params):
    super(RandomForestRegressorAlgorithm, self).__init__(params)
    logger.debug("RandomForestRegressorAlgorithm.__init__")
    self.library_version = sklearn.__version__
    self.trees_in_step = regression_additional.get("trees_in_step", 5)
    self.max_steps = regression_additional.get("max_steps", 3)
    self.early_stopping_rounds = regression_additional.get(
        "early_stopping_rounds", 50
    )
    self.model = RandomForestRegressor(
        n_estimators=self.trees_in_step,
        # "mse" was renamed to "squared_error" in scikit-learn 1.0
        criterion=params.get("criterion", "squared_error"),
        max_features=params.get("max_features", 0.8),
        min_samples_split=params.get("min_samples_split", 4),
        warm_start=True,
        n_jobs=-1,
        random_state=params.get("seed", 1),
    )
Example 19
def __init__(self, text, scores):
    self.text = text
    self.scores = scores
    self.feature_generator = FeatureGenerator()
    self.classifier = RandomForestRegressor(
        n_estimators=100,
        min_samples_split=4,
        min_samples_leaf=3,
        random_state=1
    )
    unique_scores = set(scores)
    if len(unique_scores) <= self.classification_max:
        self.classifier = RandomForestClassifier(
            n_estimators=100,
            min_samples_split=4,
            min_samples_leaf=3,
            random_state=1
        )
    self.fit_feats()
    self.fit_done = False
Example 20
def train_model(self, train_file_path, model_path):
    print("==> Load the data ...")
    X_train, Y_train = self.load_file(train_file_path)
    print(train_file_path, shape(X_train))

    print("==> Train the model ...")
    min_max_scaler = preprocessing.MaxAbsScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    clf = RandomForestRegressor(n_estimators=self.n_estimators)
    clf.fit(X_train_minmax.toarray(), Y_train)

    print("==> Save the model ...")
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)
    scaler_path = model_path.replace('.pkl', '.scaler.pkl')
    with open(scaler_path, 'wb') as f:
        pickle.dump(min_max_scaler, f)
    return clf
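The matching load step is not part of the example; a minimal sketch of reloading the pickled model and scaler for prediction, assuming the path convention above (hypothetical helper):

import pickle

def load_and_predict(model_path, X):
    # Reload the regressor and its scaler, then predict on scaled input
    # (X is assumed to be in the same format as the training data).
    with open(model_path, 'rb') as f:
        clf = pickle.load(f)
    with open(model_path.replace('.pkl', '.scaler.pkl'), 'rb') as f:
        scaler = pickle.load(f)
    return clf.predict(scaler.transform(X))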
Example 21
def fit_ensemble(x, y):
    fit_type = jhkaggle.jhkaggle_config['FIT_TYPE']
    if 1:
        if fit_type == jhkaggle.const.FIT_TYPE_BINARY_CLASSIFICATION:
            # loss="log" was renamed to "log_loss" in scikit-learn 1.1
            blend = SGDClassifier(loss="log_loss", penalty="elasticnet")  # LogisticRegression()
        else:
            # blend = SGDRegressor()
            # blend = LinearRegression()
            # blend = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, criterion='mae')
            # normalize= was removed from scikit-learn; scale features
            # beforehand if normalization is needed.
            blend = LassoLarsCV()
            # blend = ElasticNetCV()
            # blend = LinearRegression()
        blend.fit(x, y)
    else:
        blend = LogisticRegression()
        blend.fit(x, y)
    return blend
Example 22
def run_sklearn():
    n_trees = 100
    n_folds = 3

    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    alg_list = [
        ['lreg', LinearRegression()],
        ['rforest', RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_depth=3)],
        ['extree', ExtraTreesClassifier(n_estimators=1000, max_depth=2)],
        ['adaboost', AdaBoostRegressor(n_estimators=600, learning_rate=1.0)],
        ['knn', sklearn.neighbors.KNeighborsRegressor(n_neighbors=5)]
    ]

    start_time = time.time()
    for name, alg in alg_list:
        train = jhkaggle.train_sklearn.TrainSKLearn("1", name, alg, False)
        train.run()
        train = None

    elapsed_time = time.time() - start_time
    print("Elapsed time: {}".format(jhkaggle.util.hms_string(elapsed_time)))
Example 23
def fit_and_trade(data, cols, split, threshold):
    '''
    Fits and backtests a theoretical trading strategy
    '''
    data = data[data.width > 0]
    X = data[cols]
    y = data.mid30
    X_train = X.iloc[:split]
    X_test = X.iloc[split:]
    y_train = y.iloc[:split]
    y_test = y.iloc[split:]
    regressor = RandomForestRegressor(n_estimators=100,
                                      min_samples_leaf=500,
                                      random_state=42,
                                      n_jobs=-1)
    regressor.fit(X_train.values, y_train.values)
    trade(X_test.values, y_test.values, regressor, threshold)
Example 24
def create_sklearn_random_forest_regressor(X, y):
    rfr = ensemble.RandomForestRegressor(max_depth=4, random_state=777)
    model = rfr.fit(X, y)
    return model
Example 25
def fit(self, X, y):
    """Load the data in and initialize the models."""
    self.X = X
    self.y = y
    self.opt_XGBoost_reg = xgb.XGBRegressor(**self.opt_xgb_params)
    self.opt_forest_reg = RandomForestRegressor(**self.opt_rf_params)
    self.opt_svm_reg = SVR(**self.opt_svm_params)
    # fit the models
    self.opt_XGBoost_reg.fit(self.X, self.y)
    self.opt_forest_reg.fit(self.X, self.y)
    self.opt_svm_reg.fit(self.X, self.y)
Example 26
def test_missforest_numerical_single():
    # Test imputation with default parameter values
    # Test with a single missing value
    df = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]
    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    # Single predicted value for the one missing entry
    pred_val = rf.predict(X[bad_rows])[0]
    df_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 2],
        [3, 2, 3, 2],
        [pred_val, 4, 5, 5],
        [6, 7, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)
Example 27
def model_builder(model_dir):
    sklearn_model = RandomForestRegressor(n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)
Example 28
def task_model_builder(model_dir):
    sklearn_model = RandomForestRegressor(
        n_estimators=100,
        max_features=int(num_features / 3),
        min_samples_split=5,
        n_jobs=-1)
    return dc.models.SklearnModel(sklearn_model, model_dir)