Python source code examples: sklearn.preprocessing.MinMaxScaler()
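
Before the examples, a minimal sketch of what MinMaxScaler does (the array values here are purely illustrative): fit() learns each feature column's minimum and maximum, and transform() rescales that column to the target range, by default [0, 1], via X_scaled = (X - X.min) / (X.max - X.min).

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 40.0]])
scaler = MinMaxScaler()           # default feature_range=(0, 1)
scaler.fit(X_train)               # learns data_min_ and data_max_ per column
print(scaler.transform(X_train))  # each column now spans [0, 1]
print(scaler.transform(np.array([[2.0, 25.0]])))  # new rows reuse the train statistics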

Example 1
def make_mnist_data(path, isconv=False):
    X, Y = load_mnist(path, True)
    X = X.astype(np.float64)
    X2, Y2 = load_mnist(path, False)
    X2 = X2.astype(np.float64)
    X3 = np.concatenate((X, X2), axis=0)

    minmaxscale = MinMaxScaler().fit(X3)  # fit on train and test jointly

    X = minmaxscale.transform(X)
    if isconv:
        X = X.reshape((-1, 1, 28, 28))

    sio.savemat(osp.join(path, 'traindata.mat'), {'X': X, 'Y': Y})

    X2 = minmaxscale.transform(X2)
    if isconv:
        X2 = X2.reshape((-1, 1, 28, 28))

    sio.savemat(osp.join(path, 'testdata.mat'), {'X': X2, 'Y': Y2}) 
Example 2
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in recent pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = BidirectionalLstmAutoEncoder()

    # fit the model on the data and save it into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 3
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in recent pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = CnnLstmAutoEncoder()

    # fit the model on the data and save it into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 4
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in recent pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the model on the data and save it into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 5
def make_misc_data(path, filename, dim, isconv=False):
    import pickle  # cPickle is Python 2 only
    with open(osp.join(path, filename), 'rb') as fo:  # pickle requires binary mode
        data = pickle.load(fo)
    X = data['data'].astype(np.float64)
    Y = data['labels']

    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    p = np.random.permutation(X.shape[0])
    X = X[p]
    Y = Y[p]

    N = X.shape[0]

    if isconv:
        X = X.reshape((-1, dim[2], dim[0], dim[1]))
    save_misc_data(path, X, Y, N) 
Example 6
def make_easy_visual_data(path, N=600):
    """Make 3 clusters of 2D data where the cluster centers lie along a line.
    The latent variable would be just their x or y value since that uniquely defines their projection onto the line.
    """

    line = (1.5, 1)
    centers = [(m, m * line[0] + line[1]) for m in (-4, 0, 6)]
    cluster_std = [1, 1, 1.5]
    X, labels = make_blobs(n_samples=N, cluster_std=cluster_std, centers=centers, n_features=len(centers[0]))

    # scale data
    minmaxscale = MinMaxScaler().fit(X)
    X = minmaxscale.transform(X)

    save_misc_data(path, X, labels, N)
    return X, labels 
Example 7
def applyFeatures(dataset, delta):
    """
    applies rolling mean and delayed returns to each dataframe in the list
    """
    columns = dataset.columns
    close = columns[-3]
    returns = columns[-1]
    for n in delta:
        addFeatures(dataset, close, returns, n)

    dataset = dataset.drop(dataset.index[0:max(delta)])  # drop rows made NaN by the largest delta window

    # normalize columns
    scaler = preprocessing.MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(dataset),
                        columns=dataset.columns, index=dataset.index)
Example 8
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:  # decode at read time (Python 3)
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        # MinMaxScaler expects a 2-D array, so reshape the vector and flatten back
        topic = preprocessing.MinMaxScaler().fit_transform(topic.reshape(-1, 1)).ravel()
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic 
Example 9
def test_metrics_wrapper():
    # make the features in y be in different scales
    y = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) * [1, 100]

    # Without a scaler it matters which of the two series gets the 80% error
    metric_func_noscaler = model_utils.metric_wrapper(mean_squared_error)

    mse_feature_one_wrong = metric_func_noscaler(y, y * [0.8, 1])
    mse_feature_two_wrong = metric_func_noscaler(y, y * [1, 0.8])

    assert not np.isclose(mse_feature_one_wrong, mse_feature_two_wrong)

    # With a scaler it does not matter which of the two series gets the 80%
    # error
    scaler = MinMaxScaler().fit(y)
    metric_func_scaler = model_utils.metric_wrapper(mean_squared_error, scaler=scaler)

    mse_feature_one_wrong = metric_func_scaler(y, y * [0.8, 1])
    mse_feature_two_wrong = metric_func_scaler(y, y * [1, 0.8])

    assert np.isclose(mse_feature_one_wrong, mse_feature_two_wrong) 
Example 10
def build_ensemble(**kwargs):
    """Generate ensemble."""

    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)

    ens.add(GradientBoostingRegressor(), meta=True)

    return ens 
Example 11
def test_build_meowa_factory():

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = nfpc.FuzzyPatternClassifier(membership_factory=t_factory,
                                    aggregation_factory=nfpc.MEOWAFactory())

    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(l, X, y, cv=10)
    mean = np.mean(scores)

    assert 0.80 < mean 
Example 12
def test_build_ps_owa_factory():

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = nfpc.FuzzyPatternClassifier(
        membership_factory=t_factory,
        aggregation_factory=nfpc.GAOWAFactory(optimizer=nfpc.ps_owa_optimizer())
    )

    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(l, X, y, cv=10)
    mean = np.mean(scores)

    print("mean", mean)

    assert 0.92 < mean 
Example 13
def test_classifier_iris():

    iris = load_iris()

    X = iris.data
    y = iris.target

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    l = fpcga.FuzzyPatternClassifierGA(iterations=100, random_state=1)

    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(l, X, y, cv=10)

    assert len(scores) == 10
    assert np.mean(scores) > 0.6
    mean = np.mean(scores)

    print("mean", mean)

    assert 0.92 == pytest.approx(mean, 0.01) 
Example 14
def scale_target_for_each_time_group(self, X, tgc_wo_time):
        # Go through the groups and min-max scale each one
        if len(tgc_wo_time) > 0:
            X_groups = X.groupby(tgc_wo_time)
        else:
            X_groups = [([None], X)]

        self.scalers = {}
        scaled_ys = []
        for key, X_grp in X_groups:
            # Create dict key to store the min max scaler
            grp_hash = self.get_hash(key)
            # Scale target for current group
            self.scalers[grp_hash] = MinMaxScaler()
            y_skl = self.scalers[grp_hash].fit_transform(X_grp[['y']].values)
            # Put back in a DataFrame to keep track of original index
            y_skl_df = pd.DataFrame(y_skl, columns=['y'])
            y_skl_df.index = X_grp.index
            scaled_ys.append(y_skl_df)
        # Set target back in original frame but keep original
        X['y_orig'] = X['y']
        X['y'] = pd.concat(tuple(scaled_ys), axis=0)
        return X 
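
Example 14 stores each group's fitted scaler in self.scalers, presumably so scaled predictions can later be mapped back to the original target scale. A minimal standalone sketch of that round trip (the data here is illustrative):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
y = np.array([[10.0], [15.0], [30.0]])           # one group's target column
y_scaled = scaler.fit_transform(y)               # as in Example 14
y_restored = scaler.inverse_transform(y_scaled)  # back to the original scale
assert np.allclose(y, y_restored)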
Example 15
def _pp_min_max_scale(df):
    """
    Min-max normalize the feature values
    """
    print("  start minmax scaling...")
    # drop the id and price_date columns
    # df = df.drop(['id', 'price_date'], axis=1)
    # save the index and column information
    index = df.index
    columns = df.columns
    # min-max scale the features
    feature_scaled = preprocessing.MinMaxScaler().fit_transform(df.iloc[:, :-1])

    target = np.array(df.iloc[:, -1])
    target = target.reshape(-1, 1)

    # concatenate the scaled X with the unscaled y (scaling turns the pandas
    # DataFrame into a numpy ndarray)
    df_scaled = pd.DataFrame(np.hstack((feature_scaled, target)))
    # restore the index and column information
    df_scaled.index = index
    df_scaled.columns = columns

    print("  minmax scaling finished.")
    return df_scaled
Example 16
def test_03_xgb_classifier(self):
        print("\ntest 03 (xgb classifier with preprocessing) [binary-class]\n")
        model = XGBClassifier()
        pipeline_obj = Pipeline([
            ('scaler', MinMaxScaler()),
            ("model", model)
        ])
        pipeline_obj.fit(self.X, self.Y_bin)
        file_name = "test03xgboost.pmml"
        xgboost_to_pmml(pipeline_obj, self.features, 'Species', file_name)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, self.test_file)
        model_pred = pipeline_obj.predict(self.X)
        model_prob = pipeline_obj.predict_proba(self.X)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
        self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True) 
Example 17
def test_01_lgbm_classifier(self):
        print("\ntest 01 (lgbm classifier with preprocessing) [binary-class]\n")
        model = LGBMClassifier()
        pipeline_obj = Pipeline([
            ('scaler', MinMaxScaler()),
            ("model", model)
        ])
        pipeline_obj.fit(self.X, self.Y_bin)
        file_name = "test01lgbm.pmml"
        lgb_to_pmml(pipeline_obj, self.features, 'Species', file_name)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, self.test_file)
        model_pred = pipeline_obj.predict(self.X)
        model_prob = pipeline_obj.predict_proba(self.X)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
        self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True) 
Example 18
def test_persistence():
    """Make sure we can pickle it."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(1,),
                     n_epochs=1000,
                     random_state=4556,
                     learning_rate=1e-2,
                     keep_prob=1.0)
    Xenc = ae.fit_transform(X)

    b = BytesIO()
    pickle.dump(ae, b)
    ae_pickled = pickle.loads(b.getvalue())
    Xenc_pickled = ae_pickled.transform(X)
    assert_array_almost_equal(Xenc, Xenc_pickled) 
Example 19
def test_replicability():
    """Make sure it can be seeded properly."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae1 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc1 = ae1.fit_transform(X)

    ae2 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc2 = ae2.fit_transform(X)

    assert_array_almost_equal(Xenc1, Xenc2) 
Example 20
def test_monitor_ae():
    """Test the monitor keyword."""
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(3, 2,),
                     n_epochs=7500,
                     random_state=4556,
                     learning_rate=DEFAULT_LEARNING_RATE,
                     keep_prob=1.0,
                     hidden_activation=tf.nn.sigmoid,
                     encoding_activation=tf.nn.sigmoid,
                     output_activation=tf.nn.sigmoid)

    def _monitor(epoch, est, stats):
        assert epoch <= 1000, "The autoencoder has been running too long!"
        if stats['loss'] < 0.2:
            assert epoch > 10, "The autoencoder returned too soon!"
            return True
        else:
            return False
    ae.fit(X, monitor=_monitor) 
Example 21
def __init__(self, transportAlgorithm="MappingTransport",
                 scaler=False, params=None, feedback=True):
        try:
            from itertools import product
            from sklearn.metrics import (
                mean_squared_error, f1_score, cohen_kappa_score,
                accuracy_score)
        except ImportError:
            raise ImportError('Please install scikit-learn')

        self.transportAlgorithm = transportAlgorithm
        self.feedback = feedback

        self.params_ = params

        if scaler:
            from sklearn.preprocessing import MinMaxScaler
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scalerTarget = MinMaxScaler(feature_range=(-1, 1))
        else:
            self.scaler = scaler 
Example 22
def test_fit_instance_default(self):
        scale_param = self.get_scale_param()
        scale_param.scale_col_indexes = -1
        scale_obj = MinMaxScale(scale_param)
        fit_instance = scale_obj.fit(self.table_instance)
        column_min_value = scale_obj.column_min_value
        column_max_value = scale_obj.column_max_value

        scaler = MMS()
        scaler.fit(self.test_data)
        self.assertListEqual(np.round(self.get_table_instance_feature(fit_instance), 6).tolist(),
                             np.around(scaler.transform(self.test_data), 6).tolist())
        data_min = list(scaler.data_min_)
        data_max = list(scaler.data_max_)
        self.assertListEqual(column_min_value, data_min)
        self.assertListEqual(column_max_value, data_max)

        transform_data = scale_obj.transform(self.table_instance)
        self.assertListEqual(self.get_table_instance_feature(fit_instance),
                             self.get_table_instance_feature(transform_data))

Example 23
def transform(X, scaler=None, scaler_type=None):
    """
    Apply standard scaling to the input variables
    :param X: the data
    :param scaler: the scaler to use, None if StandardScaler has to be used
    :return:
        scaler used
        X transformed using scaler
    """
    if scaler is None:
        if scaler_type == 'minmax':
            scaler = MinMaxScaler()
        else:
            scaler = StandardScaler()
        scaler.fit(X)
    return scaler, scaler.transform(X) 
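
Example 23's helper returns the fitted scaler together with the transformed data, so the same scaler can be reused on later splits. A usage sketch under that assumption (X_train and X_test are illustrative names):

# fit a MinMaxScaler on the training split, then reuse it on the test split
scaler, X_train_scaled = transform(X_train, scaler_type='minmax')
_, X_test_scaled = transform(X_test, scaler=scaler)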
Example 24
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.

    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model 
Example 25
def test_print_help_pipeline(self, print_help, pipe, capsys):
        print_help(pipe)
        out = capsys.readouterr()[0]

        expected_snippets = [
            '-- --help',
            '<MinMaxScaler> options',
            '--features__scale__feature_range',
            '<NeuralNetClassifier> options',
            '--net__module : torch module (class or instance)',
            '--net__batch_size : int (default=128)',
            '<MLPModule> options',
            '--net__module__hidden_units : int (default=10)'
        ]
        for snippet in expected_snippets:
            assert snippet in out 
Example 26
def test_print_help_pipeline_custom_defaults(
            self, print_help, pipe, capsys):
        defaults = {'net__batch_size': 256, 'net__module__hidden_units': 55}
        print_help(pipe, defaults=defaults)
        out = capsys.readouterr()[0]

        expected_snippets = [
            '-- --help',
            '<MinMaxScaler> options',
            '--features__scale__feature_range',
            '<NeuralNetClassifier> options',
            '--net__module : torch module (class or instance)',
            '--net__batch_size : int (default=256)',
            '<MLPModule> options',
            '--net__module__hidden_units : int (default=55)'
        ]
        for snippet in expected_snippets:
            assert snippet in out 
Example 27
def scale_data(self):
        """
        Scale the X and Y data with minimax scaller.
        The scaling is done separately for the train and test set to avoid look ahead bias.
        """
        self.XY = pd.concat([self.X, self.Y], axis=1).dropna()
        train_set = self.XY.loc[START_TRAIN:END_TRAIN]
        test_set = self.XY.loc[START_TEST:END_TEST]
        # MinMax scaling
        minmaxed_scaler = MinMaxScaler(feature_range=(0, 1))
        self.minmaxed = minmaxed_scaler.fit(train_set)
        train_set_matrix = minmaxed_scaler.transform(train_set)
        test_set_matrix = minmaxed_scaler.transform(test_set)
        self.train_set_matrix_df = pd.DataFrame(train_set_matrix, index=train_set.index, columns=train_set.columns)
        self.test_set_matrix_df = pd.DataFrame(test_set_matrix, index=test_set.index, columns=test_set.columns)
        self.XY = pd.concat([self.train_set_matrix_df, self.test_set_matrix_df], axis=0)

        # print ("Train set shape: ", train_set_matrix.shape)
        # print ("Test set shape: ", test_set_matrix.shape) 
Example 28
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    # ECG data in which each row is a temporal sequence of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in recent pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    print(ecg_np_data.shape)

    ae = Conv1DAutoEncoder()

    # fit the model on the data and save it into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 29
def main():
    data_dir_path = './data'
    model_dir_path = './models'

    # ECG data in which each row is a temporal sequence of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values  # as_matrix() was removed in recent pandas
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    print(ecg_np_data.shape)

    ae = FeedForwardAutoEncoder()

    # fit the model on the data and save it into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold) 
Example 30
def min_max_scale(X_train, X_test):
    preprocessor = prep.MinMaxScaler().fit(X_train)
    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)
    return X_train, X_test
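
Example 30 shows the leakage-safe pattern: the scaler is fit on the training split only and then applied to both splits, so test statistics never influence the transform. The same guarantee inside cross-validation comes from wrapping the scaler in a Pipeline, as Examples 16, 17, and 24 do; a minimal sketch with illustrative data:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.random.RandomState(0).rand(40, 3)
y = (X[:, 0] > 0.5).astype(int)
pipe = Pipeline([('scaler', MinMaxScaler()), ('model', LogisticRegression())])
# the scaler is re-fit on each training fold, never on the held-out fold
print(cross_val_score(pipe, X, y, cv=5))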