Python源码示例:sklearn.ensemble.RandomForestClassifier()
示例1
def mmb_evaluate_model(self):
    """
    Cross-validate the malicious / benign random forest classifier.

    The feature matrix and label vector are stored on ``self.clf_X`` and
    ``self.clf_y``. 20% of the rows are split off (``random_state=0``) and
    5-fold cross validation is run on the remaining rows, once per metric.

    Returns:
        dict: keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``,
        each holding the per-fold scores returned by ``cross_val_score``.
    """
    predictive_features = self.features['predictive_features']
    self.clf_X = self.modeldata[predictive_features].values
    self.clf_y = np.array(self.modeldata['label'])
    # Only the training portion is scored; the held-out part is not used here.
    X_train, _, y_train, _ = train_test_split(
        self.clf_X, self.clf_y, test_size=0.2, random_state=0)
    # Keep the first column of the binarized labels as the target vector.
    y_train = np.array(LabelBinarizer().fit_transform(y_train)[:, 0])
    eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
    # No explicit eval_cls.fit(): cross_val_score fits its own clone of the
    # estimator on every fold, so a prior fit would be discarded work.
    recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
    precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
    accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
    f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')
    return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall}
示例2
def __init__(self, model_type='classifier', feature_type='fingerprints',
             n_estimators=100, n_ensemble=5):
    """
    Create an ensemble of random forest models.

    Args:
        model_type: ``'classifier'`` builds ``RFC`` models, ``'regressor'``
            builds ``RFR`` models.
        feature_type: stored on the instance; ``'descriptors'`` additionally
            creates a ``Calculator`` in ``self.calc``.
        n_estimators: ``n_estimators`` passed to every forest.
        n_ensemble: number of forests stored in ``self.model``.

    Raises:
        ValueError: if ``model_type`` is neither ``'classifier'`` nor
            ``'regressor'``.
    """
    super(RandomForestQSAR, self).__init__()
    self.n_estimators = n_estimators
    self.n_ensemble = n_ensemble
    self.model_type = model_type
    if self.model_type == 'classifier':
        forest_cls = RFC
    elif self.model_type == 'regressor':
        forest_cls = RFR
    else:
        raise ValueError(
            "invalid value for argument model_type: %r "
            "(expected 'classifier' or 'regressor')" % (model_type,))
    self.model = [forest_cls(n_estimators=n_estimators)
                  for _ in range(n_ensemble)]
    self.feature_type = feature_type
    if self.feature_type == 'descriptors':
        self.calc = Calculator(descriptors, ignore_3D=True)
    # NOTE(review): initialised for every feature type; it is one slot per
    # ensemble member and is harmless when descriptors are not used.
    self.desc_mean = [0] * self.n_ensemble
示例3
def __init__(self, outputs, inputs, k=None, hypers=None, params=None,
             distargs=None, rng=None):
    """
    Initialise the model state with one output and at least one input.

    Args:
        outputs: sequence with exactly one output identifier.
        inputs: non-empty sequence of input identifiers; must not contain
            the output.
        k: number of output categories; when None it is read from
            ``distargs['k']``.
        hypers: accepted but not used in this constructor.
        params: optional dict; ``'alpha'`` (default ``.1``) and ``'forest'``
            (default: a new ``RandomForestClassifier``) are read from it.
        distargs: must provide ``['inputs']['stattypes']`` with one entry
            per input, even though the parameter defaults to None.
        rng: random state; ``gu.gen_rng()`` is used when None.
    """
    # Resolve the random state once (it was previously assigned twice).
    self.rng = gu.gen_rng() if rng is None else rng
    self.outputs = outputs
    self.inputs = inputs
    assert len(self.outputs) == 1
    assert len(self.inputs) >= 1
    assert self.outputs[0] not in self.inputs
    assert len(distargs['inputs']['stattypes']) == len(self.inputs)
    self.stattypes = distargs['inputs']['stattypes']
    # Number of output categories and input dimension.
    # XXX WHATTA HACK. BayesDB passes in top-level kwargs, not in distargs.
    self.k = k if k is not None else int(distargs['k'])
    self.p = len(self.stattypes)
    # Sufficient statistics.
    self.N = 0
    self.data = Data(x=OrderedDict(), Y=OrderedDict())
    self.counts = [0] * self.k
    # Outlier and random forest parameters.
    if params is None:
        params = {}
    self.alpha = params.get('alpha', .1)
    self.regressor = params.get('forest', None)
    if self.regressor is None:
        self.regressor = RandomForestClassifier(random_state=self.rng)
示例4
def trainFunctionTypeClassifier(self, scs):
    """Fit the function type classifier on the functions of all known code segments.

    Functions for which ``fptr_identifier.isPointedFunction`` is true are left
    out of the training set. The fitted classifier is stored in
    ``self._type_classifier``.

    Args:
        scs (list): list of all known (sark) code segments

    Note:
        Training must happen *after* the calibration phase
    """
    candidates = [
        func
        for sc in scs
        for func in sc.functions
        if not self._analyzer.fptr_identifier.isPointedFunction(func.start_ea)
    ]
    forest = RandomForestClassifier(n_estimators=100)
    addresses = [func.start_ea for func in candidates]
    # One sample and one label per function start address.
    samples = [self.extractFunctionTypeSample(ea) for ea in addresses]
    labels = [self._analyzer.codeType(ea) for ea in addresses]
    forest.fit(samples, labels)
    # Keep the trained classifier for later use.
    self._type_classifier = forest
示例5
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data.

    Every column of ``dataset`` except the last is used as a feature and the
    ``'UpDown'`` column is used as the target.

    Args:
        dataset: table exposing ``columns`` and column indexing.
        method: one of ``'RNN'``, ``'RF'``, ``'KNN'``, ``'SVM'``, ``'ADA'``.
        parameters: for ``'SVM'`` only, ``parameters[0]`` is ``C`` and
            ``parameters[1]`` is ``gamma``; ignored by the other methods.

    Returns:
        The result of ``performRNNlass`` for ``'RNN'``, otherwise the result
        of calling ``fit`` on the selected classifier.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    features = dataset.columns[0:-1]
    if method == 'RNN':
        # The RNN helper receives the data directly; its result is returned as-is.
        return performRNNlass(dataset[features], dataset['UpDown'])
    if method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()
    else:
        # Previously an unknown name fell through to an unbound ``clf``.
        raise ValueError(
            "unknown method %r; expected one of "
            "'RNN', 'RF', 'KNN', 'SVM', 'ADA'" % (method,))
    return clf.fit(dataset[features], dataset['UpDown'])
示例6
def Train(data, treecount, tezh, yanzhgdata):
    """
    Fit a class-balanced random forest and score it on two data sets.

    In both ``data`` and ``yanzhgdata`` the last column is used as the target
    and all preceding columns as features.

    Args:
        data: training array.
        treecount: passed to the forest as ``n_estimators``.
        tezh: passed to the forest as ``max_features``.
        yanzhgdata: validation array with the same column layout.

    Returns:
        tuple: ``(train_mse, add_mse)``, the first element of the ``fmse``
        result on the training data and on the validation data.
    """
    forest = RFC(n_estimators=treecount, max_features=tezh, class_weight='balanced')
    train_features, train_target = data[:, :-1], data[:, -1]
    forest.fit(train_features, train_target)
    # Predictions on the training data and their fmse score.
    train_predictions = forest.predict(train_features)
    train_mse = fmse(train_target, train_predictions)[0]
    # Predictions on the validation data and their fmse score.
    validation_predictions = forest.predict(yanzhgdata[:, :-1])
    add_mse = fmse(yanzhgdata[:, -1], validation_predictions)[0]
    print(train_mse, add_mse)
    return train_mse, add_mse
# 最终确定组合的函数
示例7
def make_pipeline(encoding_method):
    """
    Build the encode / scale / classify pipeline for one encoding method.

    The clean columns always use the ``'one-hot'`` entry of ``encoder_dict``;
    the dirty column uses the entry selected by ``encoding_method``.
    """
    column_encoders = [
        # static transformer for the other columns
        ('one-hot-clean', encoder_dict['one-hot'], clean_columns),
        # encoder under test, applied to the dirty column only
        (encoding_method + '-dirty', encoder_dict[encoding_method],
         [dirty_column]),
    ]
    # ColumnTransformer combines the encoded features; other columns are dropped.
    union = ColumnTransformer(transformers=column_encoders, remainder='drop')
    steps = [
        ('union', union),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5)),
    ]
    return Pipeline(steps)
###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross validation score:
示例8
def build_model(self, X_train, y_train):
    """
    Return a previously saved model, or an unfitted forest with searched settings.

    When ``self.paras.load`` equals True and ``load_training_model`` returns
    a model, that model is returned. Otherwise ``best_window`` is run over
    the configured ranges and a new ``RandomForestClassifier`` is created
    with the tree count and feature count it returns. The new classifier is
    not fitted here.
    """
    if self.paras.load == True:  # noqa: E712 - comparison kept as in the original
        model = self.load_training_model(self.paras.window_len)
        if model is not None:
            return model

    print('build Random Forrest model...')

    # NOTE(review): ``index`` is not defined in this function; it must come
    # from an enclosing/global scope or this raises NameError - confirm.
    # range of number of trees : 5*(1 -> 10) = 5,10,...,50 trees
    t_min = self.paras.tree_min[index]
    t_max = self.paras.tree_max[index]
    # range of max of features : 1 -> 10 features
    f_min = self.paras.feature_min[index]
    f_max = self.paras.feature_max[index]
    # range of window : 1 -> 70 days
    w_min = self.paras.window_min
    w_max = self.paras.window_max

    # The first returned value (the window) is not needed to build the model.
    _, n_opt, m_opt = self.best_window(
        X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max)
    model = RandomForestClassifier(
        n_estimators=n_opt, max_features=m_opt, n_jobs=8,
        verbose=self.paras.verbose)
    return model
示例9
def test_run(self):
    """Running the task with a RandomForestClassifier model name dumps such a model."""
    self.input_data['item2embedding'] = {'i0': [1, 2], 'i1': [3, 4]}
    similarity_rows = {
        'item1': ['i0', 'i0', 'i1'],
        'item2': ['i0', 'i1', 'i1'],
        'similarity': [1, 0, 1],
    }
    self.input_data['similarity_data'] = pd.DataFrame(similarity_rows)

    task = TrainPairwiseSimilarityModel(
        item2embedding_task=_DummyTask(),
        similarity_data_task=_DummyTask(),
        model_name='RandomForestClassifier',
        item0_column_name='item1',
        item1_column_name='item2',
        similarity_column_name='similarity',
    )
    # Route the task's load/dump through the test's own handlers.
    task.load = MagicMock(side_effect=self._load)
    task.dump = MagicMock(side_effect=self._dump)

    task.run()

    self.assertIsInstance(self.dump_data, RandomForestClassifier)
示例10
def create_random_forest_tfidf():
    """Return an unfitted pipeline: case-preserving TF-IDF vectorizer, then a 500-tree forest."""
    steps = [
        ("vectorizer", TfidfVectorizer(lowercase=False)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=777)),
    ]
    return Pipeline(steps)
示例11
def create_random_forest_vectorizer():
    """Return an unfitted pipeline: binary, case-preserving count vectorizer, then a 500-tree forest."""
    steps = [
        ("vectorizer", CountVectorizer(lowercase=False, min_df=0.0, binary=True)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=777)),
    ]
    return Pipeline(steps)
示例12
def create_sklearn_random_forest_classifier(X, y):
    """Fit a depth-4 random forest (``random_state=777``) on ``X``, ``y`` and return the value of ``fit``."""
    forest = ensemble.RandomForestClassifier(max_depth=4, random_state=777)
    return forest.fit(X, y)
示例13
def build_models(self):
    """
    Fit the classifier on the stored classifier matrix and labels.

    A new random forest is stored in ``self.cls`` and fitted on
    ``self.clf_X`` / ``self.clf_y``, which must already be set.

    :return: the fitted classifier stored in ``self.cls``
    """
    forest = RandomForestClassifier(n_estimators=100, max_features=.2)
    self.cls = forest
    # Train on the previously prepared matrix and labels.
    forest.fit(self.clf_X, self.clf_y)
    return self.cls
示例14
def define_clfs_params(self):
    '''
    Defines all relevant parameters and classes for classifier objects.
    Edit these if you wish to change parameters.

    Sets ``self.clfs`` (short name -> unfitted estimator) and ``self.params``
    (short name -> parameter grid, each value being a list of candidates).
    '''
    # These are the classifiers
    self.clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        # max_depth must be a single value here: a list is a grid, not a valid
        # constructor argument, and makes fitting the base tree fail.
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }
    # These are the parameters which will be run through
    self.params = {
        'RF': {'n_estimators': [1, 10, 100, 1000], 'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'random_state': [1]},
        'SGD': {'loss': ['log'], 'penalty': ['l2', 'l1', 'elasticnet'], 'random_state': [1]},
        'ET': {'n_estimators': [1, 10, 100, 1000], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 5, 10, 15], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000], 'random_state': [1]},
        'GB': {'n_estimators': [1, 10, 100, 1000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5], 'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100], 'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 15, 20, 30, 40, 50], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'random_state': [1]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear'], 'random_state': [1]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
示例15
def setUp(self):
    """Create the stacked ensemble fixture: three base learners, one meta estimator, one splitter."""
    base_learners = [
        RandomForestClassifier(random_state=8),
        LogisticRegression(),
        RandomForestClassifier(max_depth=10, random_state=10),
    ]
    # One method name per base learner, in the same order.
    method_names = ['predict', 'predict_proba', 'predict_proba']
    meta_est = LogisticRegression()
    splitter = StratifiedKFold(random_state=8).split
    self.stacked_ensemble = stacker.XcessivStackedEnsemble(
        base_learners, method_names, meta_est, splitter)
示例16
def test_is_valid_json(self):
    """is_valid_json is true for plain lists/strings/floats and false for an estimator value."""
    plain_payload = {'x': ['i am serializable', 0.1]}
    assert functions.is_valid_json(plain_payload)

    estimator_payload = {'x': RandomForestClassifier()}
    assert not functions.is_valid_json(estimator_payload)
示例17
def test_make_serializable(self):
    """make_serializable returns the dict without the entry holding an estimator."""
    plain_payload = {'x': ['i am serializable', 0.1]}
    assert functions.is_valid_json(plain_payload)

    estimator_payload = {'x': RandomForestClassifier()}
    assert not functions.is_valid_json(estimator_payload)

    mixed_payload = {
        'x': ['i am serializable', 0.1],
        'y': RandomForestClassifier(),
    }
    cleaned = functions.make_serializable(mixed_payload)
    assert cleaned == {'x': ['i am serializable', 0.1]}
示例18
def test_verify_estimator_class(self):
    """verify_estimator_class reports 0.8 accuracy (3 d.p.) and the listed hyperparameters."""
    expected_hyperparameters = {
        'warm_start': False,
        'oob_score': False,
        'n_jobs': 1,
        'verbose': 0,
        'max_leaf_nodes': None,
        'bootstrap': True,
        'min_samples_leaf': 1,
        'n_estimators': 10,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'criterion': 'gini',
        'random_state': None,
        'min_impurity_split': None,
        'min_impurity_decrease': 0.0,
        'max_features': 'auto',
        'max_depth': None,
        'class_weight': None,
    }
    # Seed the global NumPy generator before the call.
    np.random.seed(8)
    result = functions.verify_estimator_class(
        RandomForestClassifier(),
        'predict_proba',
        dict(Accuracy=self.source),
        self.dataset_properties,
    )
    performance_dict, hyperparameters = result
    assert round(performance_dict['Accuracy'], 3) == 0.8
    assert hyperparameters == expected_hyperparameters
示例19
def test_non_serializable_parameters(self):
    """Hyperparameters reported for a PCA + random forest pipeline pass is_valid_json."""
    steps = [('pca', PCA()), ('rf', RandomForestClassifier())]
    pipeline = Pipeline(steps)
    metrics = dict(Accuracy=self.source)
    _, hyperparameters = functions.verify_estimator_class(
        pipeline, 'predict_proba', metrics, self.dataset_properties)
    assert functions.is_valid_json(hyperparameters)
示例20
def test_assertion_meta_feature_generator(self):
    """verify_estimator_class raises UserError when asked for 'decision_function' on a random forest."""
    np.random.seed(8)
    expected_error = exceptions.UserError
    verify = functions.verify_estimator_class
    # Arguments are built outside the ``with`` so only the call itself is checked.
    estimator = RandomForestClassifier()
    metrics = dict(Accuracy=self.source)
    properties = self.dataset_properties
    with self.assertRaises(expected_error):
        verify(estimator, 'decision_function', metrics, properties)
示例21
def setUp(self):
    """Create a BaseLearnerOrigin whose source defines ``base_learner`` as a random forest."""
    # Adjacent literals are concatenated into the same two-line source text.
    learner_source = (
        "from sklearn.ensemble import RandomForestClassifier\n"
        "base_learner = RandomForestClassifier(random_state=8)"
    )
    self.base_learner_origin = models.BaseLearnerOrigin(source=learner_source)
示例22
def test_return_estimator_from_json(self):
    """return_estimator yields a RandomForestClassifier instance for the fixture's source."""
    estimator = self.base_learner_origin.return_estimator()
    is_forest = isinstance(estimator, RandomForestClassifier)
    assert is_forest
示例23
def make_example_classifier(filename):
    """
    Fit a dummy random forest on random data and dump it to ``filename``.

    The training set has 10 rows of 42 random features and two classes with
    five rows each. The fitted model is written with ``jl.dump``.
    """
    # Create a dummy RF model for train/classify testing
    rf = RandomForestClassifier()
    p, n_class = 42, 2
    n = n_class * 5
    X = np.random.rand(n, p)
    # Floor division: np.repeat needs an integer repeat count, and
    # ``n / n_class`` is a float on Python 3.
    y = np.repeat(range(n_class), n // n_class)
    rf.fit(X, y)
    jl.dump(rf, filename)
# EXAMPLE DATASETS
示例24
def test_missforest_categorical_single():
    """
    MissForest imputes a single missing categorical value like a random forest would.

    Column 0 holds the one missing value. The expected value is the
    prediction of a ``RandomForestClassifier`` (same ``n_estimators`` and
    ``random_state`` as the imputer) trained on the complete rows.
    """
    # Test imputation with default parameter values
    # Test with a single missing value
    df = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [np.nan, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    # Exactly one row is missing, so predict returns a length-1 array. Take
    # the scalar so the expected matrix below is a regular 2-D array rather
    # than a ragged nested sequence.
    pred_val = rf.predict(X[bad_rows])[0]

    df_imputed = np.array([
        [0, 0, 0, 1],
        [0, 1, 2, 2],
        [0, 2, 3, 2],
        [pred_val, 4, 5, 5],
        [1, 7, 6, 7],
        [1, 8, 8, 8],
        [1, 15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    # cat_vars is accepted both as a bare index and as a list of indices.
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)
示例25
def test_get_tree_num(self):
    """_get_tree_num gives 44 for 10 and 141 for 100 with a max_depth=10 forest."""
    bt = BorutaPy(RandomForestClassifier(max_depth=10))
    expectations = ((10, 44), (100, 141))
    for argument, expected in expectations:
        self.assertEqual(bt._get_tree_num(argument), expected, "Tree Est. Math Fail")
示例26
def test_if_boruta_extracts_relevant_features(self):
    """
    BorutaPy selects exactly the five target-derived columns of a synthetic data set.

    Columns 0-4 are built from the target ``y``; columns 5-9 are independent
    noise. The check is run for array input and for DataFrame/Series input.
    The order of the random draws below is part of the seeded fixture.
    """
    np.random.seed(42)
    n_samples = 1000
    y = np.random.binomial(1, 0.5, n_samples)
    X = np.zeros((n_samples, 10))

    # Target with some entries shifted by +/-1, folded back into {0, 1}.
    z = y - np.random.binomial(1, 0.1, n_samples) + np.random.binomial(1, 0.1, n_samples)
    z[z == -1] = 0
    z[z == 2] = 1

    # 5 relevant features
    X[:, 0] = z
    X[:, 1] = y * np.abs(np.random.normal(0, 1, n_samples)) + np.random.normal(0, 0.1, n_samples)
    X[:, 2] = y + np.random.normal(0, 1, n_samples)
    X[:, 3] = y ** 2 + np.random.normal(0, 1, n_samples)
    X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, n_samples)

    # 5 irrelevant features
    X[:, 5] = np.random.normal(0, 1, n_samples)
    X[:, 6] = np.random.poisson(1, n_samples)
    X[:, 7] = np.random.binomial(1, 0.3, n_samples)
    X[:, 8] = np.random.normal(0, 1, n_samples)
    X[:, 9] = np.random.poisson(1, n_samples)

    bt = BorutaPy(RandomForestClassifier())
    bt.fit(X, y)

    # make sure that only all the relevant features are returned
    selected = list(np.where(bt.support_)[0])
    self.assertListEqual(list(range(5)), selected)

    # test if this works as expected for dataframe input
    X_df, y_df = pd.DataFrame(X), pd.Series(y)
    bt.fit(X_df, y_df)
    selected_from_frame = list(np.where(bt.support_)[0])
    self.assertListEqual(list(range(5)), selected_from_frame)

    # check that a dataframe is returned when return_df=True
    transformed = bt.transform(X_df, return_df=True)
    self.assertIsInstance(transformed, pd.DataFrame)
示例27
def model_builder(model_dir):
    """Wrap a class-balanced, 500-tree random forest (``n_jobs=-1``) in a ``dc.models.SklearnModel``."""
    forest_options = {
        "class_weight": "balanced",
        "n_estimators": 500,
        "n_jobs": -1,
    }
    forest = RandomForestClassifier(**forest_options)
    return dc.models.SklearnModel(forest, model_dir)
示例28
def model_builder(model_dir):
    """Wrap a class-balanced, 500-tree random forest in a ``dc.models.SklearnModel``."""
    forest_options = {"class_weight": "balanced", "n_estimators": 500}
    forest = RandomForestClassifier(**forest_options)
    return dc.models.SklearnModel(forest, model_dir)
示例29
def model_builder(model_dir):
    """Wrap a class-balanced, 500-tree random forest in a ``SklearnModel``."""
    forest_options = {"class_weight": "balanced", "n_estimators": 500}
    forest = RandomForestClassifier(**forest_options)
    return SklearnModel(forest, model_dir)
示例30
def model_builder(model_dir):
    """Wrap a class-balanced, 500-tree random forest (``n_jobs=-1``) in a ``dc.models.SklearnModel``."""
    forest_options = {
        "class_weight": "balanced",
        "n_estimators": 500,
        "n_jobs": -1,
    }
    forest = RandomForestClassifier(**forest_options)
    return dc.models.SklearnModel(forest, model_dir)