Python源码示例:sklearn.preprocessing.RobustScaler()
示例1
def _iwp_model(self, processes, cv_folds):
    """Build the default grid-searched model for the IWP regressor.

    Args:
        processes: Forwarded to ``GridSearchCV`` as ``n_jobs``.
        cv_folds: Forwarded to ``GridSearchCV`` as ``cv``.

    Returns:
        An unfitted ``GridSearchCV`` wrapping a scaler + MLP pipeline.
    """
    # Scaling and regression are chained in one Pipeline so that the
    # combination can be handed to the grid search like a single estimator.
    # The inputs are scaled with a RobustScaler on the 15-85 quantile range
    # before they reach the network.
    input_scaler = RobustScaler(quantile_range=(15, 85))
    regressor = MLPRegressor(max_iter=6000, early_stopping=True)
    pipeline = Pipeline([
        ("scaler", input_scaler),
        ("estimator", regressor),
    ])

    # Candidate network layouts; extend this list to search more shapes.
    layer_layouts = [(15, 10, 3)]

    # Hyper parameters explored for the lbfgs solver. The ``estimator__``
    # prefix routes each entry to the pipeline step named "estimator".
    lbfgs_grid = {
        'estimator__solver': ['lbfgs'],
        'estimator__activation': ['tanh'],
        'estimator__hidden_layer_sizes': layer_layouts,
        'estimator__random_state': [0, 42, 100, 3452],
        'estimator__alpha': [0.1, 0.001, 0.0001],
    }

    return GridSearchCV(
        pipeline,
        [lbfgs_grid],
        refit=True,
        n_jobs=processes,
        cv=cv_folds,
        verbose=self.verbose,
    )
示例2
def fit_transform(self, X):
    """Normalize ``X`` by its column totals, centre each row, then robust-scale.

    Args:
        X: Two-dimensional count data supporting ``sum``/``mean`` with an
            ``axis`` argument and ``.T`` (presumably a NumPy array or a
            pandas DataFrame; verify against the caller).

    Returns:
        The output of ``RobustScaler().fit_transform`` on the centred data.
    """
    # Each column is divided by its own total.
    column_share = X / X.sum(axis=0)
    # The per-row mean is subtracted via a transpose round-trip.
    transposed = column_share.T
    row_mean = column_share.mean(axis=1)
    row_centered = (transposed - row_mean).T
    return RobustScaler().fit_transform(row_centered)
示例3
def __init__(self, *args, scale=False, center=False, **kwargs):
    """
    Set up a machine learned model.

    In addition to what :class:`revscoring.Model` offers, "Learned" models
    implement :func:`~revscoring.scoring.models.Learned.fit` and
    :func:`~revscoring.scoring.models.Learned.cross_validate`.

    A ``RobustScaler`` is created only when ``scale`` or ``center`` is
    truthy; otherwise ``self.scaler`` is ``None``. Both flags are also
    recorded in ``self.params``.
    """
    super().__init__(*args, **kwargs)
    self.trained = None
    self.scaler = (
        RobustScaler(with_centering=center, with_scaling=scale)
        if scale or center
        else None
    )
    self.params.update({'scale': scale, 'center': center})
示例4
def __init__(self, options):
    """Configure the wrapped robust scaler from user-supplied options.

    The ``params`` entry of ``options`` is converted with ``convert_params``
    (``with_centering``/``with_scaling`` as booleans, ``quantile_range`` as a
    string of the form ``low-high``) and passed to ``_RobustScaler``.

    When the installed scikit-learn is older than
    ``quantile_range_required_version`` the ``quantile_range`` option is
    dropped and a warning is emitted instead.

    Raises:
        RuntimeError: If ``quantile_range`` cannot be parsed into exactly
            two integers separated by ``-``.
    """
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        bools=['with_centering', 'with_scaling'],
        strs=['quantile_range'],
    )

    if StrictVersion(sklearn_version) < StrictVersion(quantile_range_required_version) and 'quantile_range' in out_params:
        out_params.pop('quantile_range')
        msg = 'The quantile_range option is ignored in this version of scikit-learn ({}): version {} or higher required'
        msg = msg.format(sklearn_version, quantile_range_required_version)
        messages.warn(msg)

    if 'quantile_range' in out_params:
        syntax_error = 'Syntax Error: quantile_range requires a range, e.g., quantile_range=25-75'
        try:
            bounds = tuple(int(part) for part in out_params['quantile_range'].split('-'))
        except (AttributeError, TypeError, ValueError) as err:
            # Only parsing failures are translated; unrelated exceptions
            # (e.g. KeyboardInterrupt) are no longer swallowed.
            raise RuntimeError(syntax_error) from err
        # Explicit check instead of ``assert`` so that the validation still
        # runs when Python is started with -O.
        if len(bounds) != 2:
            raise RuntimeError(syntax_error)
        out_params['quantile_range'] = bounds

    self.estimator = _RobustScaler(**out_params)
示例5
def fit(self, X):
    """Fit detector.

    The input is robust-scaled, and the model returned by
    ``self._build_model()`` is trained to reproduce the scaled input.

    Parameters
    ----------
    X : dataframe of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    self
        The fitted detector.
    """
    scaler = preprocessing.RobustScaler().fit(X)
    X_train = scaler.transform(X)

    if self.hidden_neurons is None:
        # Default layer widths are derived from the number of features.
        n_features = X_train.shape[1]
        outer = n_features // 2 + 1
        inner = n_features // 4 + 1
        self.hidden_neurons = [outer, inner, inner, outer]

    # Roughly ten batches per epoch, but never an empty batch: with fewer
    # than 10 samples the integer division alone would yield 0.
    self.batch_size = max(1, X_train.shape[0] // 10)

    self.model = self._build_model()
    self.model.fit(X_train, X_train, epochs=self.epoch, batch_size=self.batch_size)
    return self
示例6
def test_df_values(self):
    """Check that array input and dataframe input give the same scaler results."""
    array_scaler = dpp.RobustScaler()
    frame_scaler = dpp.RobustScaler()

    def fit_both_and_compare():
        # Fit one scaler on the array, the other on the dataframe, and
        # compare the transformed outputs.
        from_array = array_scaler.fit_transform(X)
        from_frame = frame_scaler.fit_transform(df)
        # Unwrap to the underlying values when the result exposes them.
        from_frame = getattr(from_frame, "values", from_frame)
        assert_eq_ar(from_array, from_frame)

    fit_both_and_compare()

    for fitted_attr in ("scale_", "center_"):
        assert_eq_ar(
            getattr(array_scaler, fitted_attr),
            getattr(frame_scaler, fitted_attr),
        )

    assert_eq_ar(array_scaler.transform(X), frame_scaler.transform(X))
    assert_eq_ar(array_scaler.transform(df).values, frame_scaler.transform(X))
    assert_eq_ar(array_scaler.transform(X), frame_scaler.transform(df).values)

    # Repeat the comparison with a column of a different data type.
    df["0"] = df["0"].astype("float32")
    fit_both_and_compare()
示例7
def transform(self, X):
    """Scale the data.

    The scaler is fitted on the transposed input and the result is
    transposed back, so the output has the same layout as ``X``.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data to scale.

    Returns
    -------
    X_new : array-like, shape = (n_samples, n_timestamps)
        Scaled data.
    """
    X = check_array(X, dtype='float64')
    scaler_options = dict(
        with_centering=self.with_centering,
        with_scaling=self.with_scaling,
        quantile_range=self.quantile_range,
    )
    scaled_transposed = SklearnRobustScaler(**scaler_options).fit_transform(X.T)
    return scaled_transposed.T
示例8
def test_simple_feature_union(self):
    """Compare each intermediate ONNX step of a FeatureUnion with its recorded output."""
    data = numpy.array(
        [[0, 0], [0, 0], [2, 1], [2, 1]], dtype=numpy.float32)
    union = FeatureUnion([
        ("scaler1", StandardScaler()),
        ("scaler2", RobustScaler()),
    ])
    union.fit(data)

    all_models = list(enumerate_pipeline_models(union))
    input_spec = [("input", FloatTensorType([None, 2]))]
    steps = collect_intermediate_steps(union, "feature union", input_spec)

    assert len(steps) == 2
    assert len(all_models) == 3

    # Run the model once before reading the per-step debug outputs below.
    union.transform(data)

    for step in steps:
        serialized = step['onnx_step'].SerializeToString()
        session = onnxruntime.InferenceSession(serialized)
        onnx_output = session.run(None, {'input': data})[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output, skl_outputs)
示例9
def test_objectmapper(self):
    """Check that ``df.preprocessing`` exposes the very same classes as ``pp``."""
    df = pdml.ModelFrame([])
    mapped_names = (
        'Binarizer',
        'FunctionTransformer',
        'Imputer',
        'KernelCenterer',
        'LabelBinarizer',
        'LabelEncoder',
        'MultiLabelBinarizer',
        'MaxAbsScaler',
        'MinMaxScaler',
        'Normalizer',
        'OneHotEncoder',
        'PolynomialFeatures',
        'RobustScaler',
        'StandardScaler',
    )
    for name in mapped_names:
        self.assertIs(getattr(df.preprocessing, name), getattr(pp, name))
示例10
def __init__(
    self,
    base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
    scaler: TransformerMixin = RobustScaler(),
    require_thresholds: bool = True,
    window=None,
):
    """
    Anomaly detector that wraps a ``base_estimator`` and scores anomalies
    from the difference between model output and the expected target.

    The ``scaler`` is fitted to the target **after** training and is used
    purely for the error calculations; the ``base_estimator`` itself is
    trained on the original, unscaled ``y``.

    NOTE(review): the default ``base_estimator`` and ``scaler`` are created
    once, when this function is defined, so every instance built with the
    defaults receives the same two objects.

    Parameters
    ----------
    base_estimator: sklearn.base.BaseEstimator
        Model whose ``.fit`` and ``.predict`` methods are used. Defaults to
        :py:class:`gordo.machine.model.models.KerasAutoEncoder` with
        ``kind='feedforward_hourglass'``.
    scaler: sklearn.base.TransformerMixin
        Defaults to ``sklearn.preprocessing.RobustScaler``. Transforms the
        model output and the original ``y`` before the difference/error
        between them is calculated.
    require_thresholds: bool
        When set (default True), ``thresholds_`` must have been calculated
        by :func:`~DiffBasedAnomalyDetector.cross_validate` before
        :func:`~DiffBasedAnomalyDetector.anomaly` is called; otherwise an
        ``AttributeError`` is raised.
    window: int
        Window size for smoothed thresholds.
    """
    # Parameters are stored unmodified under their own names.
    self.base_estimator = base_estimator
    self.scaler = scaler
    self.require_thresholds = require_thresholds
    self.window = window
示例11
def __init__(self, params, dataset):
    """Initializes a UMAPTransformer object.

    The dataset features are imputed and robust-scaled, and a UMAP mapper
    is fitted on the result together with the flattened targets.

    Args:
        params (Namespace): Contains parameters used to instantiate the transformer.
        dataset (Dataset): Dataset used to "train" the projection mapping.
    """
    # TODO: decide whether to make n_epochs a parameter
    n_epochs = 500

    is_classification = params.prediction_type == 'classification'
    target_metric = 'categorical' if is_classification else 'l2'

    self.scaler = RobustScaler()
    # The imputer fills in missing values (NaNs) before scaling.
    self.imputer = Imputer()
    imputed_X = self.imputer.fit_transform(dataset.X)
    scaled_X = self.scaler.fit_transform(imputed_X)

    umap_options = dict(
        n_neighbors=params.umap_neighbors,
        n_components=params.umap_dim,
        metric=params.umap_metric,
        target_metric=target_metric,
        target_weight=params.umap_targ_wt,
        min_dist=params.umap_min_dist,
        n_epochs=n_epochs,
    )
    self.mapper = umap.UMAP(**umap_options)
    # TODO: How to deal with multitask data?
    self.mapper.fit(scaled_X, y=dataset.y.flatten())
# ****************************************************************************************
示例12
def ini_scaler(self, joint_transform=False):
    """Create the scaler named by ``self.scaler_id`` and optionally fit it.

    When ``self.train`` is truthy and ``self.scaler_level`` equals
    ``'DATASET'``, the scaler is fitted on ``self.df[self.feature_cols]``.
    With ``joint_transform`` those columns are additionally replaced by
    their scaled values.
    """
    assert self.scaler_id in SCALER_ID

    scaler_name = self.scaler_id
    if scaler_name == 'MinMaxScaler':
        self.scaler = MinMaxScaler()
    elif scaler_name == 'RobustScaler':
        self.scaler = RobustScaler()
    elif scaler_name == 'StandardScaler':
        self.scaler = StandardScaler()

    # Nothing is fitted unless training at dataset level.
    if not (self.train and 'DATASET' == self.scaler_level):
        return

    f_mat = self.df[self.feature_cols]
    self.scaler.fit(f_mat)
    if joint_transform:
        self.df[self.feature_cols] = self.scaler.transform(f_mat)
示例13
def fit(self, X, y=None):
    """Fit a ``RobustScaler`` on ``X`` and expose its statistics as Series.

    ``center_`` and ``scale_`` are copied from the fitted scaler into
    pandas Series indexed by ``X.columns``. ``y`` is not used.

    Returns:
        ``self``.
    """
    self.rs = RobustScaler()
    self.rs.fit(X)
    for fitted_attr in ("center_", "scale_"):
        labelled = pd.Series(getattr(self.rs, fitted_attr), index=X.columns)
        setattr(self, fitted_attr, labelled)
    return self
示例14
def test_generate_import_code():
    """Assert that generate_import_code() returns the correct set of dependencies for a given pipeline."""
    pipeline = creator.Individual.from_string(
        'GaussianNB(RobustScaler(input_matrix))',
        tpot_obj._pset,
    )
    # The literal must stay flush-left: its content is compared verbatim.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
"""
    generated_code = generate_import_code(pipeline, tpot_obj.operators)
    assert expected_code == generated_code
示例15
def __init__(self, feature_range, robust=0, normalize=False, truncate=False):
    """Store the scaling options and, if requested, create a robust scaler.

    All four arguments are kept unchanged as attributes of the same name.
    ``self.skprepro`` is only created when ``robust`` is truthy.
    """
    self.feature_range = feature_range
    self.robust = robust
    self.normalize = normalize
    self.truncate = truncate

    if not self.robust:
        return
    self.skprepro = skpreprocessing.RobustScaler()
示例16
def test_selective_scale_robust():
    """Check SelectiveRobustScaler output against a plain RobustScaler."""
    reference = RobustScaler().fit(X)
    selective = SelectiveRobustScaler().fit(X)

    expected = reference.fit_transform(X)
    actual = selective.transform(X).values
    assert_array_almost_equal(expected, actual)
示例17
def test_fit(self):
    """Fit both RobustScaler implementations and compare them within a tolerance."""
    chunked_scaler = dpp.RobustScaler()
    reference_scaler = spp.RobustScaler()

    # bigger data to make percentile more reliable
    # and not centered around 0 to make rtol work
    features, _ = make_classification(n_samples=1000, chunks=200, random_state=0)
    shifted = features + 3

    chunked_scaler.fit(shifted)
    reference_scaler.fit(shifted.compute())
    assert_estimator_equal(chunked_scaler, reference_scaler, rtol=0.2)
示例18
def test_transform(self):
    """Check that ``transform`` stays lazy and matches the reference scaler."""
    chunked_scaler = dpp.RobustScaler()
    reference_scaler = spp.RobustScaler()
    chunked_scaler.fit(X)
    reference_scaler.fit(X.compute())

    # Copy the reference's fitted attributes over so both scalers hold
    # exactly equal statistics (approximate equality is tested elsewhere).
    for fitted_attr in ("scale_", "center_"):
        setattr(chunked_scaler, fitted_attr, getattr(reference_scaler, fitted_attr))

    assert dask.is_dask_collection(chunked_scaler.transform(X))
    assert_eq_ar(
        chunked_scaler.transform(X),
        reference_scaler.transform(X.compute()),
    )
示例19
def test_inverse_transform(self):
    """Check that ``inverse_transform`` undoes ``fit_transform`` and stays lazy."""
    scaler = dpp.RobustScaler()
    scaled = scaler.fit_transform(X)
    restored = scaler.inverse_transform(scaled)
    assert dask.is_dask_collection(restored)
    assert_eq_ar(restored, X)
示例20
def main():
    """Run the IDS using GMM experiment.

    The training data is robust-scaled, a Gaussian mixture is loaded from
    ``data/gmm.pkl`` (or trained and saved there when the file cannot be
    opened), the test data is scored, and the scores are written next to
    the original test rows in ``data/gmm_results_max.csv``.
    """
    week3Data = _parseTrainingData()

    # Scale the training data (ignore the timestamp column)
    scaler = preprocessing.RobustScaler().fit(week3Data[:, 1:])
    X_train = scaler.transform(week3Data[:, 1:])
    del week3Data

    model_path = "data/gmm.pkl"
    try:
        # NOTE(review): unpickling can execute arbitrary code; only use a
        # model file that comes from a trusted source.
        # ``with`` closes the file handle even if unpickling fails.
        with open(model_path, "rb") as model_file:
            gmm = pickle.load(model_file)
    except IOError:
        print("Training the Gaussian Mixture...")
        gmm = GaussianMixture(n_components=16,
                              covariance_type='full',
                              verbose=1,
                              verbose_interval=2).fit(X_train)
        with open(model_path, "wb") as model_file:
            pickle.dump(gmm, model_file)
    else:
        print("Loading pre-trained GMM...")
    del X_train

    X_orig = _parseTestingData()
    print("Scaling the test data...")
    X_test = scaler.transform(X_orig[:, 1:])

    print("Calculating prosterior probabilies of test data...")
    probs = gmm.predict_proba(X_test)
    del X_test

    scores = _score(probs)
    del probs

    # Append the scores as one extra column to the original test rows.
    results = np.hstack((X_orig, scores.reshape((scores.shape[0], 1))))
    _outputToCSV(results, "data/gmm_results_max.csv")
示例21
def test_simple_column_transformer(self):
    """Compare each intermediate ONNX step of a ColumnTransformer with its recorded output."""
    # Nothing to test when ColumnTransformer is not available.
    if ColumnTransformer is None:
        return

    data = numpy.array(
        [[0, 0], [0, 0], [2, 1], [2, 1]], dtype=numpy.float32)
    transformer = ColumnTransformer([
        ("scaler1", StandardScaler(), [0]),
        ("scaler2", RobustScaler(), [1]),
    ])
    transformer.fit(data)

    all_models = list(enumerate_pipeline_models(transformer))
    input_spec = [("input", FloatTensorType([None, 2]))]
    steps = collect_intermediate_steps(
        transformer, "coulmn transformer", input_spec)

    assert len(steps) == 2
    assert len(all_models) == 3

    # Run the model once before reading the per-step debug outputs below.
    transformer.transform(data)

    for step in steps:
        serialized = step['onnx_step'].SerializeToString()
        session = onnxruntime.InferenceSession(serialized)
        onnx_output = session.run(None, {'input': data})[0]
        skl_outputs = step['model']._debug.outputs['transform']
        assert_almost_equal(onnx_output, skl_outputs)
        compare_objects(onnx_output.tolist(), skl_outputs.tolist())
示例22
def fit(self, X, y):
    """Stack the inputs, fit a robust scaler on them and train ``self.svc``.

    Args:
        X: Sequence of arrays, combined with ``vstack`` into ``self.XX``.
        y: Sequence of arrays, combined with ``hstack`` into ``self.yy``.
    """
    self.XX = vstack(X)
    self.yy = hstack(y)
    self.scaler = RobustScaler().fit(self.XX)
    scaled_features = self.scaler.transform(self.XX)
    self.svc.fit(scaled_features, self.yy)
示例23
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far supersede the train error.
    """
    # The KFold signature depends on the installed scikit-learn version.
    if SK18:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)

    # define the pipe
    pipe_steps = [
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42)),
    ]
    pipe = Pipeline(pipe_steps)

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15),
    }

    # define the grid
    grid = RandomizedSearchCV(
        pipe, hp,
        n_iter=2,
        scoring='accuracy',
        n_jobs=1,
        cv=custom_cv,
        random_state=42,
    )

    # scoring must fail before the search has been fitted
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred = grid.predict(X_train)
    te_pred = grid.predict(X_test)

    # evaluate score (SHOULD be better than random...); results are not checked
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    for bad_percentile in (0.0, 1.0):
        assert_fails(report_grid_score_detail, ValueError,
                     random_search=grid, percentile=bad_percentile)

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 random_search=grid, y_axis='bad_axis')

    # assert passes otherwise -- just ensure percentile works
    report_grid_score_detail(grid, charts=True, percentile=0.95)
示例24
def test_random_grid():
    """Run a small randomized search over a long preprocessing pipeline and report it."""
    # build a pipeline
    pipe_steps = [
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1)),
    ]
    pipe = Pipeline(pipe_steps)

    # the set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15),
    }

    # define the search; two iterations are enough to test that it works
    search_options = dict(
        n_iter=2,
        scoring='accuracy',
        cv=2,
        random_state=42,
    )
    search = RandomizedSearchCV(pipe, hp, **search_options)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
示例25
def test_pipeline_column_transformer_pipeline_imputer_scaler_lr(self):
    """Convert a pipeline with nested per-column pipelines to ONNX and dump it."""
    X = np.array([[1, 2], [3, np.nan], [3, 0]], dtype=np.float32)
    y = np.array([1, 0, 1])

    # Column 0: impute, then standard-scale.
    first_column = Pipeline([
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
    ])
    # Column 1: impute, then robust-scale.
    second_column = Pipeline([
        ("imputer", SimpleImputer()),
        ("scaler", RobustScaler()),
    ])
    per_column = ColumnTransformer([
        ("pipeline1", first_column, [0]),
        ("pipeline2", second_column, [1]),
    ])
    model = Pipeline([
        ("ct", per_column),
        ("lr", LogisticRegression(solver="liblinear")),
    ])
    model.fit(X, y)

    input_spec = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "pipelinewithinpipeline", input_spec)
    self.assertTrue(model_onnx is not None)

    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnPipelineCTPipelineImputerScalerLR",
        allow_failure="StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')",
    )