Python source code examples: sklearn.preprocessing.MinMaxScaler()
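MinMaxScaler rescales each feature independently to a target range (default [0, 1]) using the per-column minimum and maximum learned during fit: X_scaled = (X - X_min) / (X_max - X_min), shifted into feature_range. Before the project-specific examples below, here is a minimal, self-contained sketch of the basic fit/transform API; the array values are made up purely for illustration.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Two features on very different scales (illustrative values only).
X_train = np.array([[1.0, 100.0],
                    [2.0, 300.0],
                    [3.0, 500.0]])
X_test = np.array([[1.5, 200.0]])

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)                # learns data_min_ and data_max_ per column
print(scaler.transform(X_train))   # each column now spans [0, 1]
print(scaler.transform(X_test))    # test data is scaled with the training min/max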
Example 1
def make_mnist_data(path, isconv=False):
    X, Y = load_mnist(path, True)
    X = X.astype(np.float64)
    X2, Y2 = load_mnist(path, False)
    X2 = X2.astype(np.float64)
    X3 = np.concatenate((X, X2), axis=0)

    minmaxscale = MinMaxScaler().fit(X3)

    X = minmaxscale.transform(X)
    if isconv:
        X = X.reshape((-1, 1, 28, 28))
    sio.savemat(osp.join(path, 'traindata.mat'), {'X': X, 'Y': Y})

    X2 = minmaxscale.transform(X2)
    if isconv:
        X2 = X2.reshape((-1, 1, 28, 28))
    sio.savemat(osp.join(path, 'testdata.mat'), {'X': X2, 'Y': Y2})
Example 2
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in newer pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = BidirectionalLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Example 3
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in newer pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = CnnLstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Example 4
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in newer pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Example 5
def make_misc_data(path, filename, dim, isconv=False):
    import cPickle
    fo = open(osp.join(path, filename), 'r')
    data = cPickle.load(fo)
    fo.close()
    X = data['data'].astype(np.float64)
    Y = data['labels']

    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    p = np.random.permutation(X.shape[0])
    X = X[p]
    Y = Y[p]

    N = X.shape[0]
    if isconv:
        X = X.reshape((-1, dim[2], dim[0], dim[1]))

    save_misc_data(path, X, Y, N)
Example 6
def make_easy_visual_data(path, N=600):
    """Make 3 clusters of 2D data where the cluster centers lie along a line.

    The latent variable would be just their x or y value since that uniquely
    defines their projection onto the line.
    """
    line = (1.5, 1)
    centers = [(m, m * line[0] + line[1]) for m in (-4, 0, 6)]
    cluster_std = [1, 1, 1.5]
    X, labels = make_blobs(n_samples=N, cluster_std=cluster_std, centers=centers, n_features=len(centers[0]))

    # scale data
    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    save_misc_data(path, X, labels, N)
    return X, labels
Example 7
def applyFeatures(dataset, delta):
    """
    applies rolling mean and delayed returns to each dataframe in the list
    """
    columns = dataset.columns
    close = columns[-3]
    returns = columns[-1]
    for n in delta:
        addFeatures(dataset, close, returns, n)

    dataset = dataset.drop(dataset.index[0:max(delta)])  # drop NaN due to delta spanning

    # normalize columns
    scaler = preprocessing.MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(dataset),
                        columns=dataset.columns, index=dataset.index)
Example 8
def get_term_topic(self, X):
    n_features = X.shape[1]
    id2word = self.vocabulary_
    word2topic = {}

    with open('word_topic.txt', 'r') as f:
        for line in f:
            strs = line.decode('utf-8').strip('\n').split('\t')
            word2topic[strs[0]] = strs[2]

    topic = np.zeros((len(id2word),))

    for i, key in enumerate(id2word):
        if key in word2topic:
            topic[id2word[key]] = word2topic[key]
        else:
            print key

    topic = preprocessing.MinMaxScaler().fit_transform(topic)
    # topic = sp.spdiags(topic, diags=0, m=n_features,
    #                    n=n_features, format='csr')
    return topic
Example 9
def test_metrics_wrapper():
    # make the features in y be in different scales
    y = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) * [1, 100]

    # With no scaler provided it is relevant which of the two series gets an 80% error
    metric_func_noscaler = model_utils.metric_wrapper(mean_squared_error)

    mse_feature_one_wrong = metric_func_noscaler(y, y * [0.8, 1])
    mse_feature_two_wrong = metric_func_noscaler(y, y * [1, 0.8])

    assert not np.isclose(mse_feature_one_wrong, mse_feature_two_wrong)

    # With a scaler provided it is not relevant which of the two series gets an 80%
    # error
    scaler = MinMaxScaler().fit(y)
    metric_func_scaler = model_utils.metric_wrapper(mean_squared_error, scaler=scaler)

    mse_feature_one_wrong = metric_func_scaler(y, y * [0.8, 1])
    mse_feature_two_wrong = metric_func_scaler(y, y * [1, 0.8])

    assert np.isclose(mse_feature_one_wrong, mse_feature_two_wrong)
Example 10
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)

    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
Example 11
def test_build_meowa_factory():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = nfpc.FuzzyPatternClassifier(membership_factory=t_factory,
                                    aggregation_factory=nfpc.MEOWAFactory())

    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(l, X, y, cv=10)
    mean = np.mean(scores)

    assert 0.80 < mean
Example 12
def test_build_ps_owa_factory():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = nfpc.FuzzyPatternClassifier(
        membership_factory=t_factory,
        aggregation_factory=nfpc.GAOWAFactory(optimizer=nfpc.ps_owa_optimizer())
    )

    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(l, X, y, cv=10)
    mean = np.mean(scores)
    print("mean", mean)

    assert 0.92 < mean
Example 13
def test_classifier_iris():
    iris = load_iris()
    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = fpcga.FuzzyPatternClassifierGA(iterations=100, random_state=1)

    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(l, X, y, cv=10)

    assert len(scores) == 10
    assert np.mean(scores) > 0.6

    mean = np.mean(scores)
    print("mean", mean)
    assert 0.92 == pytest.approx(mean, 0.01)
Example 14
def scale_target_for_each_time_group(self, X, tgc_wo_time):
    # Go through groups and min-max scale each of them
    if len(tgc_wo_time) > 0:
        X_groups = X.groupby(tgc_wo_time)
    else:
        X_groups = [([None], X)]

    self.scalers = {}
    scaled_ys = []

    for key, X_grp in X_groups:
        # Create dict key to store the min max scaler
        grp_hash = self.get_hash(key)
        # Scale target for current group
        self.scalers[grp_hash] = MinMaxScaler()
        y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)
        # Put back in a DataFrame to keep track of original index
        y_skl_df = pd.DataFrame(y_skl, columns=['y'])
        y_skl_df.index = X_grp.index
        scaled_ys.append(y_skl_df)

    # Set target back in original frame but keep original
    X['y_orig'] = X['y']
    X['y'] = pd.concat(tuple(scaled_ys), axis=0)

    return X
Example 15
def _pp_min_max_scale(df):
    """
    Min-max scale the feature values.
    """
    print("  start minmax scaling...")
    # drop the id and price_date columns
    # df = df.drop(['id', 'price_date'], axis=1)

    # keep the index and column information
    index = df.index
    columns = df.columns

    # min-max scale the features
    feature_scaled = preprocessing.MinMaxScaler().fit_transform(df.iloc[:, :-1])

    target = np.array(df.iloc[:, -1])
    target.shape = (len(target), 1)

    # merge the scaled X with the unscaled y (after scaling, the pandas DataFrame
    # has been converted to a numpy ndarray)
    df_scaled = pd.DataFrame(np.hstack((feature_scaled, target)))

    # restore the index and column information
    df_scaled.index = index
    df_scaled.columns = columns
    print("  minmax scaling finished.")
    return df_scaled
Example 16
def test_03_xgb_classifier(self):
    print("\ntest 03 (xgb classifier with preprocessing) [binary-class]\n")
    model = XGBClassifier()
    pipeline_obj = Pipeline([
        ('scaler', MinMaxScaler()),
        ("model", model)
    ])
    pipeline_obj.fit(self.X, self.Y_bin)
    file_name = "test03xgboost.pmml"
    xgboost_to_pmml(pipeline_obj, self.features, 'Species', file_name)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, self.test_file)
    model_pred = pipeline_obj.predict(self.X)
    model_prob = pipeline_obj.predict_proba(self.X)
    self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
    self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True)
Example 17
def test_01_lgbm_classifier(self):
    print("\ntest 01 (lgbm classifier with preprocessing) [binary-class]\n")
    model = LGBMClassifier()
    pipeline_obj = Pipeline([
        ('scaler', MinMaxScaler()),
        ("model", model)
    ])
    pipeline_obj.fit(self.X, self.Y_bin)
    file_name = "test01lgbm.pmml"
    lgb_to_pmml(pipeline_obj, self.features, 'Species', file_name)
    model_name = self.adapa_utility.upload_to_zserver(file_name)
    predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, self.test_file)
    model_pred = pipeline_obj.predict(self.X)
    model_prob = pipeline_obj.predict_proba(self.X)
    self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
    self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True)
Example 18
def test_persistence():
    """Make sure we can pickle it."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(1,),
                     n_epochs=1000,
                     random_state=4556,
                     learning_rate=1e-2,
                     keep_prob=1.0)
    Xenc = ae.fit_transform(X)

    b = BytesIO()
    pickle.dump(ae, b)
    ae_pickled = pickle.loads(b.getvalue())

    Xenc_pickled = ae_pickled.transform(X)
    assert_array_almost_equal(Xenc, Xenc_pickled)
Example 19
def test_replicability():
    """Make sure it can be seeded properly."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae1 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc1 = ae1.fit_transform(X)

    ae2 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc2 = ae2.fit_transform(X)

    assert_array_almost_equal(Xenc1, Xenc2)
Example 20
def test_monitor_ae():
    """Test the monitor keyword."""
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(3, 2,),
                     n_epochs=7500,
                     random_state=4556,
                     learning_rate=DEFAULT_LEARNING_RATE,
                     keep_prob=1.0,
                     hidden_activation=tf.nn.sigmoid,
                     encoding_activation=tf.nn.sigmoid,
                     output_activation=tf.nn.sigmoid)

    def _monitor(epoch, est, stats):
        assert epoch <= 1000, "The autoencoder has been running too long!"
        if stats['loss'] < 0.2:
            assert epoch > 10, "The autoencoder returned too soon!"
            return True
        else:
            return False

    ae.fit(X, monitor=_monitor)
Example 21
def __init__(self, transportAlgorithm="MappingTransport",
             scaler=False, params=None, feedback=True):
    try:
        from sklearn.metrics import mean_squared_error
        from itertools import product
        from sklearn.metrics import (
            f1_score, cohen_kappa_score, accuracy_score)
    except BaseException:
        raise ImportError('Please install itertools and scikit-learn')

    self.transportAlgorithm = transportAlgorithm
    self.feedback = feedback
    self.params_ = params

    if scaler:
        from sklearn.preprocessing import MinMaxScaler
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scalerTarget = MinMaxScaler(feature_range=(-1, 1))
    else:
        self.scaler = scaler
Example 22
def test_fit_instance_default(self):
    scale_param = self.get_scale_param()
    scale_param.scale_col_indexes = -1
    scale_obj = MinMaxScale(scale_param)
    fit_instance = scale_obj.fit(self.table_instance)
    column_min_value = scale_obj.column_min_value
    column_max_value = scale_obj.column_max_value

    scaler = MMS()
    scaler.fit(self.test_data)
    self.assertListEqual(np.round(self.get_table_instance_feature(fit_instance), 6).tolist(),
                         np.around(scaler.transform(self.test_data), 6).tolist())

    data_min = list(scaler.data_min_)
    data_max = list(scaler.data_max_)
    self.assertListEqual(column_min_value, data_min)
    self.assertListEqual(column_max_value, data_max)

    transform_data = scale_obj.transform(self.table_instance)
    self.assertListEqual(self.get_table_instance_feature(fit_instance),
                         self.get_table_instance_feature(transform_data))

# test with (area="all", upper=2, lower=1):
Example 23
def transform(X, scaler=None, scaler_type=None):
    """
    Apply scaling to the input variables
    :param X: the data
    :param scaler: the scaler to use, None if a new scaler has to be fit
    :param scaler_type: 'minmax' to fit a MinMaxScaler, otherwise a StandardScaler is used
    :return:
        scaler used
        X transformed using scaler
    """
    if scaler is None:
        if scaler_type == 'minmax':
            scaler = MinMaxScaler()
        else:
            scaler = StandardScaler()
        scaler.fit(X)
    return scaler, scaler.transform(X)
Example 24
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.
    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
Example 25
def test_print_help_pipeline(self, print_help, pipe, capsys):
    print_help(pipe)
    out = capsys.readouterr()[0]

    expected_snippets = [
        '-- --help',
        '<MinMaxScaler> options',
        '--features__scale__feature_range',
        '<NeuralNetClassifier> options',
        '--net__module : torch module (class or instance)',
        '--net__batch_size : int (default=128)',
        '<MLPModule> options',
        '--net__module__hidden_units : int (default=10)'
    ]
    for snippet in expected_snippets:
        assert snippet in out
Example 26
def test_print_help_pipeline_custom_defaults(
        self, print_help, pipe, capsys):
    defaults = {'net__batch_size': 256, 'net__module__hidden_units': 55}
    print_help(pipe, defaults=defaults)
    out = capsys.readouterr()[0]

    expected_snippets = [
        '-- --help',
        '<MinMaxScaler> options',
        '--features__scale__feature_range',
        '<NeuralNetClassifier> options',
        '--net__module : torch module (class or instance)',
        '--net__batch_size : int (default=256)',
        '<MLPModule> options',
        '--net__module__hidden_units : int (default=55)'
    ]
    for snippet in expected_snippets:
        assert snippet in out
Example 27
def scale_data(self):
    """
    Scale the X and Y data with a min-max scaler.
    The scaler is fit on the train set only and then applied to the test set to avoid look-ahead bias.
    """
    self.XY = pd.concat([self.X, self.Y], axis=1).dropna()
    train_set = self.XY.loc[START_TRAIN:END_TRAIN]
    test_set = self.XY.loc[START_TEST:END_TEST]

    # MinMax scaling
    minmaxed_scaler = MinMaxScaler(feature_range=(0, 1))
    self.minmaxed = minmaxed_scaler.fit(train_set)
    train_set_matrix = minmaxed_scaler.transform(train_set)
    test_set_matrix = minmaxed_scaler.transform(test_set)

    self.train_set_matrix_df = pd.DataFrame(train_set_matrix, index=train_set.index, columns=train_set.columns)
    self.test_set_matrix_df = pd.DataFrame(test_set_matrix, index=test_set.index, columns=test_set.columns)

    self.XY = pd.concat([self.train_set_matrix_df, self.test_set_matrix_df], axis=0)
    # print("Train set shape: ", train_set_matrix.shape)
    # print("Test set shape: ", test_set_matrix.shape)
Example 28
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    # ECG data in which each row is a temporal sequence of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in newer pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = Conv1DAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Example 29
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    # ECG data in which each row is a temporal sequence of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in newer pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = FeedForwardAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
Example 30
def min_max_scale(X_train, X_test):
    # Fit the scaler on the training data only, then apply it to both splits
    preprocessor = prep.MinMaxScaler().fit(X_train)
    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)
    return X_train, X_test
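Several of the examples above fit the scaler on training data only and reuse it elsewhere (Example 30), or keep the fitted scaler around per group (Example 14) so that scaled outputs can later be mapped back to original units. As a complement, here is a minimal sketch of that round trip via inverse_transform; the target values and "predictions" are invented purely for illustration.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

y_train = np.array([[10.0], [20.0], [40.0]])        # made-up target values
scaler = MinMaxScaler().fit(y_train)                 # data_min_ = 10, data_max_ = 40

y_scaled = scaler.transform(y_train)                 # values in [0, 1]
# ... a model would be trained on y_scaled and emit scaled predictions ...
y_pred_scaled = np.array([[0.25], [0.9]])            # made-up model output
y_pred = scaler.inverse_transform(y_pred_scaled)     # back to the original units
print(y_pred)                                        # [[17.5] [37. ]]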