Python source code examples: sklearn.preprocessing.Binarizer()
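Binarizer thresholds numeric features: every value strictly greater than threshold (default 0.0) becomes 1, everything else becomes 0. Before the examples, a minimal standalone sketch of that behavior:

import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[1.0, -1.0, 2.0],
              [2.0, 0.0, 0.0]])
# Values > 0.0 (the default threshold) map to 1, the rest to 0.
print(Binarizer().fit_transform(X))
# [[1. 0. 1.]
#  [1. 0. 0.]]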
Example 1
import numpy as np
from sklearn.preprocessing import Binarizer
# NOTE: pass_to_ranks is assumed to come from the surrounding package
# (e.g. graspy.utils in the original source).


def _transform(arr, method):
    if method is not None:
        if method in ["log", "log10"]:
            # arr = np.log(arr, where=(arr > 0))
            # hacky, but np.log(arr, where=arr > 0) is really buggy
            arr = arr.copy()
            if method == "log":
                arr[arr > 0] = np.log(arr[arr > 0])
            else:
                arr[arr > 0] = np.log10(arr[arr > 0])
        elif method in ["zero-boost", "simple-all", "simple-nonzero"]:
            arr = pass_to_ranks(arr, method=method)
        elif method == "binarize":
            transformer = Binarizer().fit(arr)
            arr = transformer.transform(arr)
        else:
            # Literal braces must be doubled, or .format() raises on them.
            msg = (
                "Transform must be one of {{log, log10, binarize, "
                "zero-boost, simple-all, simple-nonzero}}, not {}.".format(method)
            )
            raise ValueError(msg)
    return arr
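A quick sketch of calling _transform with the "binarize" option (hypothetical input array; assumes the imports above):

adj = np.array([[0.0, 0.7],
                [0.3, 0.0]])
# "binarize" maps every entry > 0 to 1 via Binarizer's default threshold.
print(_transform(adj, "binarize"))
# [[0. 1.]
#  [1. 0.]]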
Example 2
def get_model_alias(model_type):
    """
    Get the alias of a model. Raise an exception if it is not found.

    :param model_type: A scikit-learn object (e.g., SGDClassifier
        and Binarizer)
    :return: A string which stands for the type of the input model in
        our conversion framework
    """
    res = _get_sklearn_operator_name(model_type)
    if res is None:
        raise RuntimeError("Unable to find alias for model '{}'. "
                           "The converter is likely missing."
                           "".format(type(model_type)))
    return res


# registered converters
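A hedged usage sketch of get_model_alias: in skl2onnx-style converters the alias for Binarizer is typically "SklearnBinarizer", but the exact string depends on how sklearn_operator_name_map is populated (see Example 9):

from sklearn.preprocessing import Binarizer

alias = get_model_alias(Binarizer)
print(alias)  # e.g. "SklearnBinarizer" (assumed mapping)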
Example 3
def test_model_binarizer(self):
    data = np.array([[1., -1., 2.],
                     [2., 0., 0.],
                     [0., 1., -1.]], dtype=np.float32)
    model = Binarizer(threshold=0.5)
    model_onnx = convert_sklearn(
        model,
        "scikit-learn binarizer",
        [("input", FloatTensorType(data.shape))],
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnBinarizer-SkipDim1",
    )
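To check the conversion end to end, the ONNX graph can be executed with onnxruntime and compared with scikit-learn's own transform; a minimal sketch, assuming onnxruntime is installed and model_onnx/data come from the test above:

import onnxruntime as rt

sess = rt.InferenceSession(model_onnx.SerializeToString(),
                           providers=["CPUExecutionProvider"])
(onnx_out,) = sess.run(None, {"input": data})
# Binarizer is stateless, so fit_transform only applies the threshold.
assert np.allclose(onnx_out, model.fit_transform(data))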
Example 4
def test_onnx_helper_load_save(self):
    model = make_pipeline(StandardScaler(), Binarizer(threshold=0.5))
    X = numpy.array([[0.1, 1.1], [0.2, 2.2]])
    model.fit(X)
    model_onnx = convert_sklearn(model, "binarizer",
                                 [("input", FloatTensorType([None, 2]))])
    filename = "temp_onnx_helper_load_save.onnx"
    save_onnx_model(model_onnx, filename)

    model = load_onnx_model(filename)
    new_model = select_model_inputs_outputs(model, "variable")
    assert new_model.graph is not None

    tr1 = self.get_model(model)
    tr2 = self.get_model(new_model)
    X = X.astype(numpy.float32)
    X1 = tr1(X)
    X2 = tr2(X)
    assert X1.shape == (2, 2)
    assert X2.shape == (2, 2)
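self.get_model is a helper defined elsewhere in the test class. A plausible minimal implementation (an assumption, not the library's actual code) wraps each ONNX graph in onnxruntime so it can be called like a function:

import onnxruntime as rt

def get_model(onnx_model):
    # Hypothetical stand-in for the test helper: returns a callable
    # that runs the graph and yields its first output.
    sess = rt.InferenceSession(onnx_model.SerializeToString(),
                               providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    return lambda X: sess.run(None, {input_name: X})[0]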
Example 5
def test_onnx_helper_load_save_init(self):
    model = make_pipeline(
        Binarizer(),
        OneHotEncoder(sparse=False, handle_unknown='ignore'),
        StandardScaler())
    X = numpy.array([[0.1, 1.1], [0.2, 2.2], [0.4, 2.2], [0.2, 2.4]])
    model.fit(X)
    model_onnx = convert_sklearn(model, "pipe3",
                                 [("input", FloatTensorType([None, 2]))])
    filename = "temp_onnx_helper_load_save.onnx"
    save_onnx_model(model_onnx, filename)

    model = load_onnx_model(filename)
    new_model = select_model_inputs_outputs(model, "variable")
    assert new_model.graph is not None

    tr1 = self.get_model(model)
    tr2 = self.get_model(new_model)
    X = X.astype(numpy.float32)
    X1 = tr1(X)
    X2 = tr2(X)
    assert X1.shape == (4, 2)
    assert X2.shape == (4, 2)
Example 6
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer,
                  pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer,
                  pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures,
                  pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example 7
def test_transform_1d_frame_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    idx = pd.Index('a b c d e f g h i'.split(' '))
    df = pdml.ModelFrame(arr, index=idx, columns=['X'])
    self.assertEqual(len(df.columns), 1)
    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        models = ['Binarizer', 'Imputer', 'StandardScaler']
        # MinMaxScaler raises TypeError in ufunc under Python 3
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_transform(df, arr, mod1, mod2)

        mod1 = getattr(df.preprocessing, model)()
        mod2 = getattr(pp, model)()
        self._assert_fit_transform(df, arr, mod1, mod2)
Example 8
def sklearn_one_hot_vectorize(corpus):
    # The Sklearn one hot vectorize method
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    freq = CountVectorizer()
    vectors = freq.fit_transform(corpus)
    print(len(vectors.toarray()[0]))

    onehot = Binarizer()
    vectors = onehot.fit_transform(vectors.toarray())
    print(len(vectors[0]))
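A self-contained run on a toy corpus (hypothetical data): CountVectorizer produces term counts, and Binarizer clips every positive count to 1, turning the counts into a presence/absence encoding:

corpus = [
    "the cat sat on the mat",
    "the dog sat",
]
sklearn_one_hot_vectorize(corpus)
# Both prints show the vocabulary size (6 here); repeated words such
# as "the" have their counts reduced to 1 by the binarization.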
Example 9
def _get_sklearn_operator_name(model_type):
    """
    Get the operator name of the input argument.

    :param model_type: A scikit-learn object (e.g., SGDClassifier
        and Binarizer)
    :return: A string which stands for the type of the input model in
        our conversion framework
    """
    if model_type not in sklearn_operator_name_map:
        # No proper operator name found; this means a local operator.
        return None
    return sklearn_operator_name_map[model_type]
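sklearn_operator_name_map is built elsewhere in the converter; a schematic, abbreviated sketch of its assumed shape:

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import Binarizer

# Assumed structure: scikit-learn classes keyed to converter aliases.
sklearn_operator_name_map = {
    Binarizer: "SklearnBinarizer",
    SGDClassifier: "SklearnSGDClassifier",
    # ... one entry per supported model type
}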
Example 10
def test_transform_series_int(self):
    arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
    s = pdml.ModelSeries(arr, index='a b c d e f g h i'.split(' '))
    # reshape arr to 2d
    arr = arr.reshape(-1, 1)

    if pd.compat.PY3:
        models = ['Binarizer', 'Imputer', 'StandardScaler']
        # MinMaxScaler raises TypeError in ufunc under Python 3
    else:
        models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']
    for model in models:
        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        s.fit(mod1)
        mod2.fit(arr)

        result = s.transform(mod1)
        expected = mod2.transform(arr).flatten()
        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)

        mod1 = getattr(s.preprocessing, model)()
        mod2 = getattr(pp, model)()

        result = s.fit_transform(mod1)
        expected = mod2.fit_transform(arr).flatten()
        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)
Example 11
def main():
    x, fc6 = initModel()
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    img_names = load_image_names(args.input_data_dir)
    with open(args.output_image_name_file, 'w') as img_names_file:
        for img_name in img_names:
            img_names_file.write(img_name + '\n')

    t = time.time()
    # Too many images to load at once, so process them in batches.
    batch_size = 100
    features = []
    with open(args.output_feature_file, 'w') as output_file:
        for i in range(0, int(math.ceil(len(img_names) / (batch_size * 1.0)))):
            print('batch: %d' % i)
            if (i + 1) * batch_size < len(img_names):
                img_names_batch = img_names[i * batch_size:(i + 1) * batch_size]
            else:
                img_names_batch = img_names[i * batch_size:len(img_names)]
            img_batch = load_images(img_names_batch)
            output = sess.run(fc6, feed_dict={x: img_batch})
            features.append(output)
        features = np.vstack(features)
        # binarizer = preprocessing.Binarizer().fit(features)
        # features = binarizer.transform(features)
        np.save(output_file, features)

    # with open('fc6.npy', 'w') as output_file:
    #     for i in range(0, int(math.ceil(len(imgs) / (batch_size * 1.0)))):
    #         print('batch: %d' % i)
    #         if (i + 1) * batch_size < len(imgs):
    #             img_batch = imgs[i * batch_size:(i + 1) * batch_size]
    #         else:
    #             img_batch = imgs[i * batch_size: len(imgs)]
    #         output = sess.run(fc6, feed_dict={x: img_batch})
    #         features.append(output)
    #     features = np.vstack(features)
    #     np.save(output_file, features)
    print(time.time() - t)
Example 12
def main():
    t = time.time()
    img = imread(args.img_file_path)
    imgs = [img, watermark(img), rotate(img), crop(img), mirror(img)]
    imgs_norm = image_normalize(imgs)
    dataset_features = np.load('fc6.npy')

    query_start = time.time()
    query_features = extract_feature(imgs_norm)
    binarizer = preprocessing.Binarizer().fit(query_features)
    query_features = binarizer.transform(query_features)
    print(dataset_features)
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    cosine = distance.cdist(dataset_features, query_features, 'cosine')
    print(cosine.shape)
    dis = cosine
    # Sort each column (one column per query image); see
    # https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
    inds_all = argsort(dis, axis=0)
    print('query cost: %f, dataset: %d, query: %d'
          % (time.time() - query_start, len(dataset_features), len(imgs)))

    img_names = load_image_names()
    fig, axes = plt.subplots(5, 11, figsize=(22, 10),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.15, wspace=0.01,
                        left=.02, right=.98, top=.92, bottom=.08)
    titles = ['original', 'watermark', 'rotate', 'crop', 'mirror']
    for i in range(len(imgs)):
        topK = []
        inds = inds_all[:, i]
        # print(inds)
        for k in range(10):
            topK.append(img_names[inds[k]])
            print(inds[k], dis[inds[k], i], img_names[inds[k]])

        original = axes[i, 0]
        original.set_title(titles[i])
        img = imgs[i]
        original.imshow(img)

        for j in range(10):
            ax = axes[i, j + 1]
            img = imread(topK[j])
            ax.imshow(img)
            title = "%d : %f" % (j + 1, dis[inds[j], i])
            ax.set_title(title)

    savePath = args.img_file_path + '_search_result.jpg'
    plt.savefig(savePath)
    print(time.time() - t)
    # os.system('open -a Preview.app -F ' + savePath)
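The core retrieval step of Examples 11 and 12, isolated as a minimal sketch with hypothetical small arrays: binarize the features, then rank dataset rows by cosine distance to each query column:

import numpy as np
from scipy.spatial import distance
from sklearn import preprocessing

dataset = np.array([[0.9, 0.5, 0.0],
                    [0.0, 0.8, 0.0]])
query = np.array([[1.0, 0.1, 0.0]])

binarizer = preprocessing.Binarizer().fit(query)   # fit is a no-op; Binarizer is stateless
query_bin = binarizer.transform(query)             # [[1., 1., 0.]]
dataset_bin = binarizer.transform(dataset)         # [[1., 1., 0.], [0., 1., 0.]]

dis = distance.cdist(dataset_bin, query_bin, 'cosine')
inds = np.argsort(dis, axis=0)   # row 0 ranks first (distance 0.0)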