Python源码示例:sklearn.ensemble.IsolationForest()
示例1
def __init__(self, hybrid=False, n_estimators=100, max_samples='auto',
             contamination=0.1, n_jobs=-1, seed=None, **kwargs):
    """Set up the Isolation Forest wrapper and its underlying model.

    The hyperparameters are stored on the instance and forwarded to
    ``IsolationForest``; ``seed`` is passed as ``random_state`` and any
    extra keyword arguments are forwarded unchanged.
    """
    # Keep a copy of the hyperparameters on the wrapper itself.
    self.n_estimators = n_estimators
    self.max_samples = max_samples
    self.contamination = contamination
    self.n_jobs = n_jobs
    self.seed = seed

    self.model = IsolationForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        n_jobs=n_jobs,
        random_state=seed,
        **kwargs
    )

    self.hybrid = hybrid
    # Autoencoder network, only relevant for the hybrid model.
    self.ae_net = None

    # Result slots start out empty (all None).
    self.results = dict.fromkeys(
        ('train_time', 'test_time', 'test_auc', 'test_scores'))
示例2
def test_iforest_sparse():
    """Sparse and dense inputs must give identical IsolationForest predictions."""
    rng = check_random_state(0)
    X_train, X_test, _y_train, _y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    settings_grid = ParameterGrid({"max_samples": [0.5, 1.0],
                                   "bootstrap": [True, False]})

    def fit_predict(train, test, settings):
        # Same estimator configuration for both the sparse and dense runs.
        forest = IsolationForest(n_estimators=10, random_state=1, **settings)
        return forest.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        sparse_train = to_sparse(X_train)
        sparse_test = to_sparse(X_test)

        for settings in settings_grid:
            from_sparse = fit_predict(sparse_train, sparse_test, settings)
            from_dense = fit_predict(X_train, X_test, settings)
            assert_array_equal(from_sparse, from_dense)
示例3
def test_iforest_performance():
    """Test Isolation Forest performs well.

    Fits on 100 points and checks that the ROC AUC on a test set of
    20 held-out points (label 0) plus 20 uniformly drawn outliers (label 1)
    exceeds 0.98.
    """
    # Generate train/test data: the first 100 points are used for training,
    # the remaining 20 are held out as the label-0 part of the test set.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that the ROC AUC of the negated scores is above 0.98
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
示例4
def test_deprecation():
    """Check the deprecation warnings emitted by fit and by ``threshold_``."""
    X = [[0.0], [1.0]]
    clf = IsolationForest()

    contamination_msg = ('default contamination parameter 0.1 will change '
                         'in version 0.22 to "auto"')
    behaviour_msg = ('behaviour="old" is deprecated and will be removed '
                     'in version 0.22')
    # Each FutureWarning is checked with its own call to fit.
    for expected in (contamination_msg, behaviour_msg):
        assert_warns_message(FutureWarning, expected, clf.fit, X)

    fitted = IsolationForest().fit(X)
    threshold_msg = ("threshold_ attribute is deprecated in 0.20 and will"
                     " be removed in 0.22.")
    assert_warns_message(DeprecationWarning, threshold_msg,
                         getattr, fitted, "threshold_")
示例5
def sample_hyps_iso_forest(nest, contam, boot):
    """Build an IsolationForest anomaly detector from the given hyperparameters.

    :param nest: value passed as ``n_estimators``.
    :param contam: value passed as ``contamination``.
    :param boot: value passed as ``bootstrap``.
    :return: An IsolationForest object with specified hyperparameters, used to detect anomaly.
    """
    return IsolationForest(
        n_estimators=nest,
        max_samples='auto',
        contamination=contam,
        max_features=1.0,  # use all features
        bootstrap=boot,
        n_jobs=-1,  # use all cores
        verbose=0,
    )
示例6
def run_isolation_forest(features, id_list, fraction_of_outliers=.3):
    """Performs anomaly detection based on Isolation Forest.

    Fits an IsolationForest on ``features`` (using every row as a sample),
    scores the same rows with ``decision_function`` and returns the ids whose
    score falls below the ``100 * fraction_of_outliers`` percentile.

    :param features: 2-D array; ``features.shape[0]`` is used as ``max_samples``.
    :param id_list: ids aligned with the rows of ``features``. A plain list or
        tuple is converted to a NumPy array so that it can be indexed with a
        boolean mask; any other type is used as given.
    :param fraction_of_outliers: passed as ``contamination`` and used to pick
        the score percentile that acts as the outlier threshold.
    :return: the entries of ``id_list`` whose score is below the threshold.
    """
    # Plain Python sequences do not support boolean-mask indexing.
    if isinstance(id_list, (list, tuple)):
        id_list = np.asarray(id_list)

    rng = np.random.RandomState(1984)

    num_samples = features.shape[0]
    iso_f = IsolationForest(max_samples=num_samples,
                            contamination=fraction_of_outliers,
                            random_state=rng)
    iso_f.fit(features)
    pred_scores = iso_f.decision_function(features)

    # Rows scoring strictly below this percentile are reported as outlying.
    threshold = stats.scoreatpercentile(pred_scores, 100 * fraction_of_outliers)
    outlying_ids = id_list[pred_scores < threshold]

    return outlying_ids
示例7
def test_iforest_sparse():
    """Sparse and dense inputs must give identical IsolationForest predictions."""
    rng = check_random_state(0)
    X_train, X_test, _y_train, _y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    settings_grid = ParameterGrid({"max_samples": [0.5, 1.0],
                                   "bootstrap": [True, False]})

    def fit_predict(train, test, settings):
        # Same estimator configuration for both the sparse and dense runs.
        forest = IsolationForest(n_estimators=10, random_state=1, **settings)
        return forest.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        sparse_train = to_sparse(X_train)
        sparse_test = to_sparse(X_test)

        for settings in settings_grid:
            from_sparse = fit_predict(sparse_train, sparse_test, settings)
            from_dense = fit_predict(X_train, X_test, settings)
            assert_array_equal(from_sparse, from_dense)
示例8
def test_iforest_error():
    """Check the errors and warnings raised for various ``max_samples`` values."""
    X = iris.data

    # Each of these max_samples values is expected to make fit raise.
    for rejected in (-1, 0.0, 2.0):
        assert_raises(ValueError,
                      IsolationForest(max_samples=rejected).fit, X)

    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)
    for silent in ('auto', np.int64(2)):
        assert_no_warnings(IsolationForest(max_samples=silent).fit, X)

    for rejected in ('foobar', 1.5):
        assert_raises(ValueError,
                      IsolationForest(max_samples=rejected).fit, X)
示例9
def test_iforest_performance():
    """Test Isolation Forest performs well.

    Fits on 100 points and checks that the ROC AUC on a test set of
    20 held-out points (label 0) plus 20 uniformly drawn outliers (label 1)
    exceeds 0.98.
    """
    # Generate train/test data: the first 100 points are used for training,
    # the remaining 20 are held out as the label-0 part of the test set.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that the ROC AUC of the negated scores is above 0.98
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
示例10
def __init__(self, _id, _config):
    """Initialise the parent class and read the sample count from the config.

    ``_config['nb_samples']`` is converted to ``int`` and stored as
    ``self._nb_samples``.
    """
    super(IsolationForest, self).__init__(_id, _config)
    sample_count = int(_config['nb_samples'])
    self._nb_samples = sample_count
示例11
def get_default_config():
    """Return the default configuration: the class name and ``N_SAMPLES``."""
    defaults = dict(module=IsolationForest.__name__, nb_samples=N_SAMPLES)
    return defaults
示例12
def _get_best_detector(self, train):
    """Fit a default-configured ``ensemble.IsolationForest`` on ``train`` and return it."""
    forest = ensemble.IsolationForest()
    forest.fit(train)
    return forest
示例13
def setUp(self):
    """Create the IsolationForest instance shared by the tests."""
    super(TestIsolationForest, self).setUp()
    fixture_config = {"module": "fake", "nb_samples": 1000}
    self.if_sml = isolation_forest.IsolationForest("fakeid", fixture_config)
示例14
def test_learn_structure(self):
    """``learn_structure`` should return an ``ensemble.IsolationForest``."""
    samples = self.get_testing_data()
    learned = self.if_sml.learn_structure(samples)
    self.assertIsInstance(learned, ensemble.IsolationForest)
示例15
def test_iforest():
    """Smoke-test fit and predict over a grid of Isolation Forest settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    settings_grid = ParameterGrid({"n_estimators": [3],
                                   "max_samples": [0.5, 1.0, 3],
                                   "bootstrap": [True, False]})

    with ignore_warnings():
        for settings in settings_grid:
            # `rng` is not defined in this function; presumably a
            # module-level random state -- verify against the full file.
            forest = IsolationForest(random_state=rng, **settings)
            forest.fit(X_train).predict(X_test)
示例16
def test_iforest_error():
    """Check the errors and warnings raised on deficient input or settings."""
    X = iris.data

    # Each of these max_samples values is expected to make fit raise.
    for rejected in (-1, 0.0, 2.0):
        assert_raises(ValueError,
                      IsolationForest(max_samples=rejected).fit, X)

    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)

    # note that assert_no_warnings does not apply since it enables a
    # PendingDeprecationWarning triggered by scipy.sparse's use of
    # np.matrix. See issue #11251.
    for silent in ('auto', np.int64(2)):
        with pytest.warns(None) as record:
            IsolationForest(max_samples=silent).fit(X)
        user_warnings = [caught for caught in record
                         if issubclass(caught.category, UserWarning)]
        assert not user_warnings

    for rejected in ('foobar', 1.5):
        assert_raises(ValueError,
                      IsolationForest(max_samples=rejected).fit, X)

    # test X_test n_features match X_train one:
    fitted = IsolationForest().fit(X)
    assert_raises(ValueError, fitted.predict, X[:, 1:])

    # test threshold_ attribute error when behaviour is not old:
    msg = "threshold_ attribute does not exist when behaviour != 'old'"
    new_behaviour_clf = IsolationForest(behaviour='new')
    assert_raises_regex(AttributeError, msg, getattr,
                        new_behaviour_clf, 'threshold_')
示例17
def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    fitted = IsolationForest().fit(X)
    # Every tree is expected to have depth ceil(log2(n_samples)).
    expected_depth = int(np.ceil(np.log2(X.shape[0])))
    for tree in fitted.estimators_:
        assert_equal(tree.max_depth, expected_depth)
示例18
def test_max_samples_attribute():
    """Check ``max_samples_`` for default, oversized and fractional max_samples."""
    X = iris.data
    n_samples = X.shape[0]

    # Default max_samples: the fitted attribute equals n_samples here.
    default_clf = IsolationForest().fit(X)
    assert_equal(default_clf.max_samples_, n_samples)

    # Oversized max_samples: a warning is raised and n_samples is used.
    oversized_clf = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         oversized_clf.fit, X)
    assert_equal(oversized_clf.max_samples_, n_samples)

    # Fractional max_samples: the attribute is that fraction of n_samples.
    fractional_clf = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(fractional_clf.max_samples_, 0.4 * n_samples)
示例19
def test_iforest_works(contamination):
    """Check that the two outliers of a toy sample are detected."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(behaviour="new",
                          random_state=rng,
                          contamination=contamination)
    clf.fit(X)
    anomaly_scores = -clf.decision_function(X)
    labels = clf.predict(X)

    # assert detect outliers: every outlier must score above every inlier
    inlier_scores = anomaly_scores[:-2]
    outlier_scores = anomaly_scores[-2:]
    assert_greater(np.min(outlier_scores), np.max(inlier_scores))
    assert_array_equal(labels, [1] * 6 + [-1] * 2)
示例20
def test_max_samples_consistency():
    """Make sure validated max_samples in iforest and BaseBagging are identical."""
    fitted = IsolationForest().fit(iris.data)
    assert_equal(fitted.max_samples_, fitted._max_samples)
示例21
def test_iforest_subsampled_features():
    """Non-regression test for #5732, which failed at predict."""
    rng = check_random_state(0)
    X_train, X_test, y_train, _y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)

    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    # Only checks that predict runs; the result is not inspected.
    forest.predict(X_test)
示例22
def test_score_samples():
    """Check ``score_samples`` against ``decision_function`` plus ``offset_``."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    probe = [[2., 2.]]

    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)

    # score_samples must equal decision_function shifted by offset_.
    for clf in (clf1, clf2):
        assert_array_equal(clf.score_samples(probe),
                           clf.decision_function(probe) + clf.offset_)

    # Both estimators must give the same score_samples output.
    assert_array_equal(clf1.score_samples(probe),
                       clf2.score_samples(probe))
示例23
def test_behaviour_param():
    """Old and new behaviour (with auto contamination) must agree on decision_function."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    probe = [[2., 2.]]

    old_style = IsolationForest(behaviour='old').fit(X_train)
    new_style = IsolationForest(behaviour='new',
                                contamination='auto').fit(X_train)

    assert_array_equal(old_style.decision_function(probe),
                       new_style.decision_function(probe))
# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk = 3 rows):
示例24
def build_model(self, outlier_ratio=0.1, n_estimators=100, max_samples='auto'):
    """Create the IsolationForest and store it as ``self.model``.

    ``outlier_ratio`` is passed as ``contamination``; the model is built
    with ``behaviour='new'``.
    """
    forest_params = {
        'contamination': outlier_ratio,
        'n_estimators': n_estimators,
        'max_samples': max_samples,
        'behaviour': 'new',
    }
    self.model = IsolationForest(**forest_params)
示例25
def _fit(self, X):
    """Fit an IsolationForest on ``X``, store it as ``estimator_`` and return self."""
    forest = IsolationForest(
        behaviour='new',
        bootstrap=self.bootstrap,
        contamination=self.contamination,
        max_features=self.max_features,
        max_samples=self.max_samples,
        n_estimators=self.n_estimators,
        n_jobs=self.n_jobs,
        random_state=self.random_state,
    )
    self.estimator_ = forest.fit(X)

    return self
示例26
def fit(self):
    """Fit an IsolationForest on ``self.X`` and store it as ``self.ift``."""
    forest = IsolationForest(n_estimators=self.nestimators,
                             contamination=self.contamination)
    forest.fit(self.X)
    self.ift = forest
示例27
def fit(self, X, y=None, sample_weight=None):
    """Fit the wrapped IsolationForest and mirror its fitted estimators.

    A new IsolationForest is built from the instance hyperparameters, fitted
    on ``X`` (``y`` and ``sample_weight`` are forwarded to its ``fit``), and
    its ``estimators_`` and ``estimators_features_`` are copied onto this
    object. ``self.updated`` is reset to False.
    """
    forest_params = dict(
        n_estimators=self.n_estimators,
        max_samples=self.max_samples,
        contamination=self.contamination,
        max_features=self.max_features,
        bootstrap=self.bootstrap,
        n_jobs=self.n_jobs,
        random_state=self.random_state,
        verbose=self.verbose,
    )
    self.ifor = IsolationForest(**forest_params)
    self.ifor.fit(X, y, sample_weight)

    self.estimators_ = self.ifor.estimators_
    self.estimators_features_ = self.ifor.estimators_features_
    self.updated = False
示例28
def _multiview_fit(self, X, y, feature_partitions, n_estimators_view):
    """Fit one isolation forest per feature view and collect the trees.

    ``feature_partitions`` gives the number of consecutive columns of ``X``
    in each view and ``n_estimators_view`` the number of trees to build for
    it. Returns a list with one entry per view; each entry is the list of
    ``IForestMultiviewTree`` objects built for that view (empty when the
    view's estimator count is not positive).
    """
    total_features = X.shape[1]

    logger.debug("IForestMultiview n_estimators_view: %s" % str(list(n_estimators_view)))

    views = []
    column_start = 0
    for view_width, view_n_estimators in zip(feature_partitions, n_estimators_view):
        view_trees = []
        X_view = X[:, column_start:(column_start + view_width)]

        if view_n_estimators > 0:
            # Construct an isolation forest for the view containing just
            # this feature subset.
            view_forest = IsolationForest(n_estimators=view_n_estimators,
                                          max_samples=self.max_samples,
                                          contamination=self.contamination,
                                          max_features=self.max_features,
                                          bootstrap=self.bootstrap,
                                          n_jobs=self.n_jobs,
                                          random_state=self.random_state,
                                          verbose=self.verbose)
            view_forest.fit(X_view, y, sample_weight=None)

            for fitted_tree in view_forest.estimators_:
                # The IsolationForest trees contain read-only properties.
                # Copy everything into our custom tree structure so that it
                # can be modified if needed.
                mv_tree = IForestMultiviewTree(n_features=total_features,
                                               ifor_tree=fitted_tree.tree_)
                # Shift the node feature indexes by the view's column offset.
                mv_tree.tree_.feature += column_start
                view_trees.append(mv_tree)

        # Every view contributes an entry and advances the column offset,
        # even when no trees were built for it.
        views.append(view_trees)
        column_start += view_width

    return views
示例29
def get_iso_model(x, y, opts):
    """Fit an IsolationForest on ``x`` and return it with its decision scores.

    ``opts.randseed`` is used as the random state. ``y`` is accepted but not
    used. Returns ``(model, scores)`` where ``scores`` is the output of
    ``decision_function(x)`` reshaped to a single column.
    """
    iso_model = IsolationForest(n_estimators=100,
                                max_samples=256,
                                contamination=0.1,
                                random_state=opts.randseed)
    iso_model.fit(x)

    scores = iso_model.decision_function(x)
    score_column = np.reshape(scores, (-1, 1))
    return iso_model, score_column
示例30
def __init__(self, options):
    """Validate the user-supplied parameters and build the estimator.

    ``options['params']`` (default: empty) is converted with
    ``convert_params`` and each known parameter is range-checked before being
    passed to ``_IsolationForest``. The ``anomaly_score`` entry, if present,
    is removed from the parameters and stored as ``self.return_scores``
    (default True).

    Raises:
        RuntimeError: if ``n_estimators`` is not positive, if ``max_samples``
            or ``max_features`` is outside (0, 1], or if ``contamination`` is
            outside (0.0, 0.5].
    """
    self.handle_options(options)

    out_params = convert_params(
        options.get('params', {}),
        ints=['n_estimators', 'n_jobs', 'random_state', 'verbose'],
        floats=['max_samples', 'contamination', 'max_features'],
        bools=['bootstrap']
    )
    self.return_scores = out_params.pop('anomaly_score', True)

    # whitelist n_estimators > 0
    if 'n_estimators' in out_params and out_params['n_estimators'] <= 0:
        msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
        raise RuntimeError(msg.format(out_params['n_estimators']))

    # whitelist max_samples in (0, 1]. The previous check combined "< 0" and
    # "> 1" with `and`, which can never be true, so nothing was rejected.
    if 'max_samples' in out_params and not (0.0 < out_params['max_samples'] <= 1.0):
        msg = 'Invalid value error: max_samples must be greater than 0 and a float, but found max_samples="{}".'
        raise RuntimeError(msg.format(out_params['max_samples']))

    # whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range
    if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
        msg = (
            'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
            'but found contamination="{}".'
        )
        raise RuntimeError(msg.format(out_params['contamination']))

    # whitelist max_features in (0, 1]; same always-false condition fixed here.
    if 'max_features' in out_params and not (0.0 < out_params['max_features'] <= 1.0):
        msg = 'Invalid value error: max_features must be greater than 0, but found max_features="{}".'
        raise RuntimeError(msg.format(out_params['max_features']))

    self.estimator = _IsolationForest(**out_params)