Python源码示例:sklearn.preprocessing.Normalizer()
示例1
def test_make_column_transformer_kwargs():
    """Check keyword handling of ``make_column_transformer``.

    The supported keywords (``n_jobs``, ``remainder``, ``sparse_threshold``)
    must end up as attributes on the resulting transformer without altering
    its ``transformers`` list, and an unsupported keyword must raise a
    ``TypeError`` with the expected message.
    """
    scaler = StandardScaler()
    norm = Normalizer()
    specs = ((scaler, 'first'), (norm, ['second']))

    ct = make_column_transformer(*specs, n_jobs=3, remainder='drop',
                                 sparse_threshold=0.5)
    baseline = make_column_transformer(*specs)
    assert_equal(ct.transformers, baseline.transformers)

    expected_attributes = (
        ('n_jobs', 3),
        ('remainder', 'drop'),
        ('sparse_threshold', 0.5),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(ct, attribute), expected)

    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, *specs,
        transformer_weights={'pca': 10, 'Transf': 1}
    )
示例2
def get_model(with_pipeline=False):
    """Return a multi-layer perceptron classifier.

    When ``with_pipeline`` is true, the classifier is the last step of a
    pipeline that first unions min-max scaled and normalized features and
    then selects ``N_FEATURES`` of them; otherwise the bare classifier is
    returned.
    """
    net = NeuralNetClassifier(MLPClassifier)
    if not with_pipeline:
        return net

    scaling = FeatureUnion([
        ('minmax', MinMaxScaler()),
        ('normalize', Normalizer()),
    ])
    # keep input size constant
    selection = SelectKBest(k=N_FEATURES)
    return Pipeline([
        ('scale', scaling),
        ('select', selection),
        ('net', net),
    ])
示例3
def test_boston_OHE_pipeline(self):
    """Convert a OneHotEncoder -> Normalizer pipeline and compare outputs.

    For several choices of categorical columns the pipeline is fitted on the
    Boston data, converted with ``sklearn.convert``, and the converted spec
    is evaluated against the pipeline's own ``transform`` output; no
    mismatches are allowed.
    """
    data = load_boston()

    for categorical_features in ([3], [8], [3, 8], [8, 3]):
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        encoder = OneHotEncoder(categorical_features=categorical_features)
        model = Pipeline([("OHE", encoder), ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, "out").get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        expected_rows = model.transform(data.data.copy())
        output_data = [{"out": row} for row in expected_rows]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
示例4
def test_random(self):
    """Convert a Normalizer for each norm and evaluate it on random data.

    The return value of ``evaluate_transformer`` is not inspected here.
    """
    names = ("a", "b", "c")
    # Generate some random data
    X = _np.random.random(size=(50, 3))

    for param in ("l1", "l2", "max"):
        cur_model = Normalizer(norm=param)
        output = cur_model.fit_transform(X)
        spec = converter.convert(cur_model, list(names), "out")

        inputs = [dict(zip(list(names), row)) for row in X]
        expected = [{"out": row} for row in output]
        evaluate_transformer(spec, inputs, expected)
示例5
def test_within_pipeline():
    """Run a PliersTransformer followed by a Normalizer in a Pipeline.

    Skipped when ``cv2`` or ``sklearn`` cannot be imported. Checks the shape
    and values of the transformed output for the apple test image, and that
    the transformer exposes the expected metadata.
    """
    pytest.importorskip('cv2')
    pytest.importorskip('sklearn')
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Normalizer

    stim = join(get_test_data_path(), 'image', 'apple.jpg')
    extractors = [BrightnessExtractor(), SharpnessExtractor()]
    trans = PliersTransformer(Graph(extractors))
    steps = [('pliers', trans), ('normalizer', Normalizer())]
    res = Pipeline(steps).fit_transform(stim)

    assert res.shape == (1, 2)
    # Third positional argument of np.isclose is the relative tolerance.
    for column, expected in enumerate((0.66393, 0.74780)):
        assert np.isclose(res[0][column], expected, 1e-5)

    meta = trans.metadata_
    assert 'onset' in meta.columns
    assert meta['class'][0] == 'ImageStim'
示例6
def __init__(
        self,
        source_model: mx.mod.Module,
        feature_layer_names,
        context_function=mx.context.cpu,
        num_devices=1,
        max_function_evaluations=100,
        apply_l2_norm=False):
    """Store the training configuration on top of the base-class setup.

    :param source_model: Forwarded to the base class constructor.
    :param feature_layer_names: Forwarded to the base class constructor.
    :param context_function: Forwarded to the base class constructor.
    :param num_devices: Forwarded to the base class constructor.
    :param max_function_evaluations: Stored as-is on the instance.
    :param apply_l2_norm: Flag stored on the instance; ``l2_normalizer`` is
        the normalizer to use when it is set.
    """
    # The base class handles the parameters required for meta-models.
    super().__init__(source_model, feature_layer_names, context_function, num_devices)

    # Caller-supplied settings.
    self.apply_l2_norm = apply_l2_norm
    self.max_function_evaluations = max_function_evaluations

    # Fixed settings: optimizer for training the GP model and the number of
    # inducing points for the sparse GP.
    self.optimizer = 'lbfgs'
    self.NUM_INDUCING_SPARSE_GP = 100

    # Feature mean used for normalization in training and prediction; it is
    # computed in the training phase, so it starts out unset.
    self.feature_mean = None

    # Normalizer to use when the apply_l2_norm flag is set.
    self.l2_normalizer = Normalizer(norm='l2')
示例7
def test_kneighbors_with_or_without_self_hit(LSH: callable, metric, n_jobs, verbose):
    """Compare ``kneighbors()`` with ``kneighbors(X)`` on the fitted data.

    Both query forms are run with and without distances. The index-only
    results must equal the indices returned alongside distances, and the two
    query forms are compared through the mean-based tolerances below.
    """
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)

    # Keep the query order fixed: no argument first, then the training data.
    neigh_dist, neigh_ind = lsh.kneighbors(return_distance=True)
    neigh_dist_self, neigh_ind_self = lsh.kneighbors(X, return_distance=True)
    ind_only = lsh.kneighbors(return_distance=False)
    ind_only_self = lsh.kneighbors(X, return_distance=False)

    assert_array_equal(neigh_ind, ind_only)
    assert_array_equal(neigh_ind_self, ind_only_self)

    mean_index_difference = (neigh_ind - neigh_ind_self).mean()
    assert mean_index_difference <= .01, 'More than 1% of neighbors mismatch'

    small_difference_share = ((neigh_dist - neigh_dist_self) < 0.0001).mean()
    assert small_difference_share <= 0.01, \
        'Not almost equal to 4 decimals in more than 1% of neighbor slots'
示例8
def test_radius_neighbors_with_or_without_self_hit(LSH, metric, n_jobs, verbose):
    """Compare ``radius_neighbors()`` with ``radius_neighbors(X)``.

    The radius is the largest value in the third distance column of a
    ``kneighbors(n_candidates=3)`` query. For every sample, the first three
    results of the argument-free query must match results 1..3 of the query
    that passes the training data.
    """
    X, y = make_classification()
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)

    radius = lsh.kneighbors(n_candidates=3)[0][:, 2].max()
    query = dict(radius=radius)

    neigh_dist, neigh_ind = lsh.radius_neighbors(return_distance=True, **query)
    neigh_dist_self, neigh_ind_self = lsh.radius_neighbors(X, return_distance=True, **query)
    ind_only = lsh.radius_neighbors(return_distance=False, **query)
    ind_only_self = lsh.radius_neighbors(X, return_distance=False, **query)

    result_lengths = {len(neigh_ind), len(neigh_ind_self),
                      len(neigh_dist), len(neigh_dist_self)}
    assert len(result_lengths) == 1

    for row, indices in enumerate(neigh_ind):
        indices_self = neigh_ind_self[row]
        assert_array_equal(indices, ind_only[row])
        assert_array_equal(indices_self, ind_only_self[row])
        # Entries 1..3 of the X-query are compared with the first three
        # entries of the argument-free query.
        assert_array_equal(indices[:3], indices_self[1:4])
        assert_array_almost_equal(neigh_dist[row][:3],
                                  neigh_dist_self[row][1:4])
示例9
def test_squared_euclidean_same_neighbors_as_euclidean(LSH):
    """Check 'sqeuclidean' against 'minkowski' for the same estimator class.

    Neighbor indices must be identical and the squared 'minkowski' distances
    must match the 'sqeuclidean' distances. For classes listed in
    ``LSH_WITH_RADIUS`` the same is checked for radius queries, using the
    squared radius for the 'sqeuclidean' estimator.
    """
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)

    lsh = LSH(metric='minkowski')
    lsh.fit(X, y)
    neigh_dist_eucl, neigh_ind_eucl = lsh.kneighbors()

    lsh_sq = LSH(metric='sqeuclidean')
    lsh_sq.fit(X, y)
    neigh_dist_sqeucl, neigh_ind_sqeucl = lsh_sq.kneighbors()

    assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl)
    assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl)

    if LSH not in LSH_WITH_RADIUS:
        return

    radius = neigh_dist_eucl[:, 2].max()
    rad_dist_eucl, rad_ind_eucl = lsh.radius_neighbors(radius=radius)
    rad_dist_sqeucl, rad_ind_sqeucl = lsh_sq.radius_neighbors(radius=radius**2)
    for row, indices in enumerate(rad_ind_eucl):
        assert_array_equal(indices, rad_ind_sqeucl[row])
        assert_array_almost_equal(rad_dist_eucl[row] ** 2, rad_dist_sqeucl[row])
示例10
def nbow_model(task, embeddings, word2idx):
    """Build an embeddings -> L2 normalizer -> estimator pipeline.

    :param task: ``"clf"`` selects a class-weighted LogisticRegression,
        ``"reg"`` selects a linear-kernel SVR.
    :param embeddings: Passed through to ``NBOWVectorizer``.
    :param word2idx: Passed through to ``NBOWVectorizer``.
    :return: The assembled ``Pipeline``.
    :raises ValueError: If ``task`` is neither ``"clf"`` nor ``"reg"``.
    """
    if task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    elif task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0,
                                  class_weight='balanced')
    else:
        raise ValueError("invalid task!")

    vectorizer_options = dict(aggregation=["mean"],
                              embeddings=embeddings,
                              word2idx=word2idx,
                              stopwords=False)
    steps = [
        ('embeddings-feats', NBOWVectorizer(**vectorizer_options)),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo),
    ]
    return Pipeline(steps)
示例11
def test_objectmapper(self):
    """Check that ``df.preprocessing`` exposes the same objects as ``pp``.

    Each listed name must resolve, via the ``preprocessing`` accessor of an
    empty ``ModelFrame``, to the identical object found on ``pp``.
    """
    df = pdml.ModelFrame([])
    mapped_names = (
        'Binarizer',
        'FunctionTransformer',
        'Imputer',
        'KernelCenterer',
        'LabelBinarizer',
        'LabelEncoder',
        'MultiLabelBinarizer',
        'MaxAbsScaler',
        'MinMaxScaler',
        'Normalizer',
        'OneHotEncoder',
        'PolynomialFeatures',
        'RobustScaler',
        'StandardScaler',
    )
    for name in mapped_names:
        self.assertIs(getattr(df.preprocessing, name), getattr(pp, name))
示例12
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """Assemble the vectorizer -> tf-idf -> normalizer pipeline.

    `min_df` filters out extremely rare words so that they do not dominate
    the distance metric; `max_df` filters out extremely common words, which
    convey little information. Both are only passed to the count-based
    vectorizer, i.e. when `hash` is false.

    `tokenizer` is instantiated and wrapped in ``Tokenizer``. `hash` picks a
    ``HashingVectorizer`` instead of a ``CountVectorizer``.
    """
    # Wrap the specified tokenizer
    wrapped_tokenizer = Tokenizer(tokenizer())

    # Options common to both vectorizer flavours.
    shared_options = dict(input='content', stop_words='english',
                          lowercase=True, tokenizer=wrapped_tokenizer)
    if hash:
        vectr = HashingVectorizer(**shared_options)
    else:
        vectr = CountVectorizer(min_df=min_df, max_df=max_df, **shared_options)

    self.pipeline = Pipeline([
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False)),
    ])
    self.trained = False
示例13
def __init__(self):
    """Run the base-class initializer, then attach a default Normalizer."""
    super().__init__()
    # Normalizer() is created with its default arguments.
    self.normalizer = Normalizer()
示例14
def train(self, training_data_X, training_data_Y):
    """Fit a fresh Normalizer and SVC on the given training data.

    Both estimators are stored on the instance (``self.normalizer`` and
    ``self.svc``); the SVC is fitted on the normalized features.
    """
    normalizer = Normalizer()
    classifier = svm.SVC(gamma=0.001, C=100.)
    self.normalizer = normalizer
    self.svc = classifier

    normalised_features = normalizer.fit_transform(training_data_X)
    classifier.fit(normalised_features, training_data_Y)
示例15
def data_cleaning_formatting(X):
    """Fill missing values, encode the data and normalize it.

    :param X: Object with a ``fillna`` method (presumably a pandas
        DataFrame — TODO confirm against callers).
    :return: Result of ``Normalizer().fit_transform`` on the encoded data.
    """
    # Basic cleaning: replace every missing value with 0. No forward fill is
    # applied afterwards: once all gaps are zero-filled nothing is left to
    # fill, and ``fillna('ffill')`` would treat 'ffill' as a literal fill
    # value rather than as a fill method.
    X = X.fillna(0)
    # Encode data
    X = encode_data(X)
    return Normalizer().fit_transform(X)
示例16
def test_make_column_transformer():
    """Check naming and the deprecated argument order of the factory.

    Step names are derived from the transformer classes, and the legacy
    ``(columns, transformer)`` tuple order must still work while emitting a
    ``DeprecationWarning``.
    """
    deprecation_match = '`make_column_transformer` now expects'
    scaler, norm = StandardScaler(), Normalizer()

    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))

    # XXX remove in v0.22
    # Only the legacy-order call is wrapped in the warning check.
    with pytest.warns(DeprecationWarning, match=deprecation_match):
        ct1 = make_column_transformer(([0], norm))
    ct2 = make_column_transformer((norm, [0]))
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    assert_almost_equal(ct1.fit_transform(X_array),
                        ct2.fit_transform(X_array))

    legacy_argument_sets = (
        (('first', 'drop'),),
        (('passthrough', 'passthrough'), ('first', 'drop')),
    )
    for legacy_arguments in legacy_argument_sets:
        with pytest.warns(DeprecationWarning, match=deprecation_match):
            make_column_transformer(*legacy_arguments)
示例17
def test_make_column_transformer_pandas():
    """Check the deprecated argument order with pandas column labels.

    Skipped when pandas cannot be imported. The legacy
    ``(columns, transformer)`` order must warn and give the same transformed
    output as the current ``(transformer, columns)`` order.
    """
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(np.array([[0, 1, 2], [2, 4, 6]]).T,
                        columns=['first', 'second'])
    norm = Normalizer()

    # XXX remove in v0.22
    deprecation_match = '`make_column_transformer` now expects'
    with pytest.warns(DeprecationWarning, match=deprecation_match):
        ct1 = make_column_transformer((X_df.columns, norm))
    ct2 = make_column_transformer((norm, X_df.columns))

    legacy_output = ct1.fit_transform(X_df)
    current_output = ct2.fit_transform(X_df)
    assert_almost_equal(legacy_output, current_output)
示例18
def test_make_column_transformer_remainder_transformer():
    """Check that an estimator passed as ``remainder`` is stored unchanged."""
    remainder = StandardScaler()
    column_specs = ((StandardScaler(), 'first'), (Normalizer(), ['second']))
    ct = make_column_transformer(*column_specs, remainder=remainder)
    assert ct.remainder == remainder
示例19
def test():
    """Plot silhouette scores for KMeans with 2..19 clusters.

    Reads the positional command-line argument ``File``, extracts and trims
    its function information, turns it into a normalized feature matrix
    reduced to 5 SVD components, then fits KMeans for each cluster count,
    printing and finally plotting the cosine silhouette score.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("File")
    args = parser.parse_args()

    info = fh.get_function_information(args.File)
    info = trim_funcs(info, args.File)
    _vect, func_sparse = funcs_to_sparse(info)

    func_sparse = Normalizer().fit(func_sparse).transform(func_sparse)
    svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    func_sparse = svd.fit_transform(func_sparse)

    scores = []
    clust_count = []
    for x in range(2, 20):
        result = KMeans(n_clusters=x, random_state=2).fit(func_sparse)
        score = silhouette_score(func_sparse, result.labels_, metric="cosine")
        scores.append(score)
        clust_count.append(x)
        print("Clusters {:<3} | Silhoette Score : {}".format(x, score))

    plt.plot(clust_count, scores)
    plt.xlabel("Cluster Centroid Count")
    plt.ylabel("Silhoette Score")
    # grid is a function: it has to be called. Assigning to ``plt.grid``
    # would replace it on the module and leave the plot without grid lines.
    plt.grid(True)
    plt.show()
示例20
def single_cluster(all_functions, centroid_count=2):
    """Cluster the functions with KMeans and print the silhouette score.

    :param all_functions: Input handed to ``funcs_to_sparse``.
    :param centroid_count: Number of KMeans clusters.
    :return: The ``labels_`` of the fitted KMeans model.
    """
    _vect, func_sparse = funcs_to_sparse(all_functions)
    func_sparse = Normalizer().fit(func_sparse).transform(func_sparse)

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)
    # The score is computed on a seeded sample of up to 5000 points.
    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    print("Clusters {:<3} | Silhoette Score : {}".format(
        centroid_count, score))
    return result.labels_
示例21
def get_cosine_dist(all_functions):
    """Return pairwise cosine distances between the normalized functions.

    :param all_functions: Input handed to ``funcs_to_sparse``.
    :return: Result of ``cosine_distances`` of the normalized feature matrix
        with itself.
    """
    _vect, func_sparse = funcs_to_sparse(all_functions)
    func_sparse = Normalizer().fit(func_sparse).transform(func_sparse)
    return cosine_distances(func_sparse, func_sparse)
示例22
def get_single_cluster(all_functions, centroid_count=2):
    """Cluster the functions with KMeans and report the result as a dict.

    :param all_functions: Input handed to ``funcs_to_sparse``.
    :param centroid_count: Number of KMeans clusters.
    :return: Dict with keys ``'count'`` (the cluster count), ``'score'``
        (cosine silhouette score) and ``'labels'`` (KMeans ``labels_``).
    """
    _vect, func_sparse = funcs_to_sparse(all_functions)
    func_sparse = Normalizer().fit(func_sparse).transform(func_sparse)

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)
    # The score is computed on a seeded sample of up to 5000 points.
    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    return {
        'count': centroid_count,
        'score': score,
        'labels': result.labels_,
    }
示例23
def get_normalizer(self, x_train):
    """Return a Normalizer fitted on the given training data.

    :param x_train: Data to fit on. When ``None``, ``self.x_train`` is used
        instead, so callers that relied on the instance attribute keep
        working.
    :return: The fitted ``Normalizer``.
    """
    from sklearn.preprocessing import Normalizer

    # Honour the argument; it used to be ignored in favour of self.x_train.
    training_data = x_train if x_train is not None else self.x_train
    return Normalizer().fit(training_data)
示例24
def test_boston(self):
    """Convert an L2 Normalizer fitted on the Boston data and evaluate it.

    The return value of ``evaluate_transformer`` is not inspected here.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Normalizer(norm="l2").fit(scikit_data.data)
    spec = converter.convert(scikit_model, scikit_data.feature_names, "out")

    input_data = []
    for row in scikit_data.data:
        input_data.append(dict(zip(scikit_data.feature_names, row)))

    output_data = []
    for row in scikit_model.transform(scikit_data.data):
        output_data.append({"out": row})

    evaluate_transformer(spec, input_data, output_data)
示例25
def get_normalized_vectors(vectors, norm='l1', progress=progress, pad_zeros=True):
    """Return `vectors` normalized with the requested norm.

    The input is copied into a float64 array first. With `pad_zeros` set, a
    small constant (1e-7) is added to every entry before normalizing. A
    status message is sent to `progress.update`.
    """
    progress.update('Normalizing vectors using "%s" norm' % norm)

    matrix = np.array(vectors, dtype=np.float64)
    if pad_zeros:
        matrix += 1e-7

    return preprocessing.Normalizer(norm=norm).fit_transform(matrix)
示例26
def __init__(self, norm='l2'):
    """Initialize the base class with a ``Normalizer`` using `norm`."""
    wrapped = Normalizer(norm=norm)
    super(BagNormalizer, self).__init__(wrapped)
示例27
def __init__(self, **kwargs):
    """Run the base-class initializer and create the wrapped Normalizer.

    NOTE(review): ``**kwargs`` is accepted but neither forwarded to the base
    class nor to ``prep.Normalizer`` — confirm whether that is intended.
    """
    super().__init__()
    # The estimator is created with default arguments.
    self.estimator = prep.Normalizer()
示例28
def train_lsa(corpus,n_topics, max_df=0.95, min_df=2,cleaning=clearstring,stop_words='english'):
    """Fit a truncated-SVD (LSA) model on the tf-idf matrix of `corpus`.

    :param corpus: Iterable of documents.
    :param n_topics: Number of SVD components.
    :param max_df: Passed to ``TfidfVectorizer``.
    :param min_df: Passed to ``TfidfVectorizer``.
    :param cleaning: Callable applied to every document first; ``None``
        skips the cleaning step.
    :param stop_words: Passed to ``TfidfVectorizer``.
    :return: ``TOPIC`` built from the feature names and the fitted SVD.
    """
    if cleaning is not None:
        # Build a cleaned copy instead of overwriting the caller's list.
        corpus = [cleaning(document) for document in corpus]
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out() — confirm the supported version.
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa)
示例29
def train_lsa(corpus, n_topics, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    """Fit a truncated-SVD (LSA) model on the tf-idf matrix of `corpus`.

    :param corpus: Iterable of documents.
    :param n_topics: Number of SVD components.
    :param max_df: Passed to ``TfidfVectorizer``.
    :param min_df: Passed to ``TfidfVectorizer``.
    :param cleaning: Callable applied to every document first; ``None``
        skips the cleaning step.
    :param stop_words: Passed to ``TfidfVectorizer``.
    :return: ``TOPIC`` built from the feature names and the fitted SVD.
    """
    if cleaning is not None:
        # Build a cleaned copy instead of overwriting the caller's list.
        corpus = [cleaning(document) for document in corpus]
    tfidf_vectorizer = TfidfVectorizer(
        max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out() — confirm the supported version.
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa)
示例30
def normalize(train_inputs, non_train_inputs):
    """Normalize both inputs in place with one Normalizer.

    The normalizer is fitted on ``train_inputs`` and then applied to
    ``non_train_inputs``; in both objects the columns named by
    ``train_inputs.columns`` are overwritten with the normalized values.

    :return: The two (modified) inputs as a tuple.
    """
    normalizer = Normalizer()
    target_columns = train_inputs.columns

    normalized_train = normalizer.fit_transform(train_inputs.values)
    train_inputs[target_columns] = normalized_train

    normalized_other = normalizer.transform(non_train_inputs.values)
    non_train_inputs[target_columns] = normalized_other

    return train_inputs, non_train_inputs