Python源码示例:sklearn.preprocessing.normalize()
示例1
def load_names(data_names, norm=True, log1p=False, verbose=True):
    """Load every named dataset and return them as CSR matrices.

    Each name is handed to ``load_data``, which yields a matrix and its
    gene list.  The matrix is optionally passed through
    ``normalize(..., axis=1)`` (``norm``) and then through ``np.log1p``
    (``log1p``) before being converted to a ``csr_matrix``.

    Returns:
        ``(datasets, genes_list, n_cells)`` where ``n_cells`` is the total
        number of rows over all loaded matrices.
    """
    datasets, genes_list = [], []
    n_cells = 0

    for name in data_names:
        matrix, genes = load_data(name)
        if norm:
            matrix = normalize(matrix, axis=1)
        if log1p:
            matrix = np.log1p(matrix)
        matrix = csr_matrix(matrix)

        # Rows are reported as cells and columns as genes.
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        datasets.append(matrix)
        genes_list.append(genes)
        n_cells += n_rows

        if verbose:
            print('Loaded {} with {} genes and {} cells'
                  .format(name, n_cols, n_rows))

    if verbose:
        print('Found {} cells among all datasets'.format(n_cells))

    return datasets, genes_list, n_cells
示例2
def main():
    """Fit MLELM and ELM on the OpenML 'diabetes' data and print test accuracy."""
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import train_test_split

    db_name = 'diabetes'
    data_set = fetch_mldata(db_name)

    # Normalize the samples, then recode the string labels as +1 / -1.
    data_set.data = preprocessing.normalize(data_set.data)
    signed_labels = []
    for label in data_set.target:
        signed_labels.append(1 if label == "tested_positive" else -1)
    data_set.target = signed_labels

    # 60 % of the samples are used for fitting, 40 % for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        data_set.data, data_set.target, test_size=0.4)

    mlelm = MLELM(hidden_units=(10, 30, 200)).fit(X_train, y_train)
    elm = ELM(200).fit(X_train, y_train)

    print("MLELM Accuracy %0.3f " % mlelm.score(X_test, y_test))
    print("ELM Accuracy %0.3f " % elm.score(X_test, y_test))
示例3
def train(self):
    """Build the per-step representation matrix and the node-to-vector map.

    For each of ``self.Kstep`` steps the running matrix ``self.Ak`` is
    multiplied by the adjacency matrix, turned into a transition matrix by
    ``GetProbTranMat``, factorized by ``GetRepUseSVD`` and L2-normalized
    row-wise.  Each step fills its own block of ``self.dim`` columns in
    ``self.RepMat``.  Finally every row of ``self.RepMat`` is stored in
    ``self.vectors`` under the key ``self.g.look_back_list[row_index]``.
    """
    self.adj = self.getAdjMat()
    self.node_size = self.adj.shape[0]
    self.Ak = np.matrix(np.identity(self.node_size))
    self.RepMat = np.zeros((self.node_size, int(self.dim*self.Kstep)))

    for step in range(self.Kstep):
        print('Kstep =', step)
        self.Ak = np.dot(self.Ak, self.adj)
        transition = self.GetProbTranMat(self.Ak)
        step_rep = self.GetRepUseSVD(transition, 0.5)
        step_rep = normalize(step_rep, axis=1, norm='l2')

        # Columns reserved for this step inside the concatenated matrix.
        col_start = self.dim*step
        col_stop = self.dim*(step+1)
        self.RepMat[:, col_start:col_stop] = step_rep[:, :]

    # get embeddings
    self.vectors = {}
    look_back = self.g.look_back_list
    for row, embedding in enumerate(self.RepMat):
        node = look_back[row]
        self.vectors[node] = embedding
示例4
def pre_factorization(G, n_components, exponent):
    """
    Network Embedding as Sparse Matrix Factorization

    Builds ``log(normalize(G, 'l1')) - log(G . diag(neg))`` on the stored
    entries, where ``neg`` is the column sum of the L1-normalized matrix
    raised to ``exponent`` and rescaled to sum to one, and passes the result
    to ``ProNE.tsvd_rand`` with ``n_components`` components.
    """
    row_normalized = preprocessing.normalize(G, norm="l1")

    # Prepare negative samples
    col_weight = np.array(row_normalized.sum(axis=0))[0] ** exponent
    col_weight = col_weight / col_weight.sum()
    neg = G.dot(sparse.diags(col_weight, format="csr"))

    # Replace non-positive stored entries by 1 so that their log is 0.
    row_normalized.data[row_normalized.data <= 0] = 1
    neg.data[neg.data <= 0] = 1

    row_normalized.data = np.log(row_normalized.data)
    neg.data = np.log(neg.data)

    row_normalized -= neg
    return ProNE.tsvd_rand(row_normalized, n_components=n_components)
示例5
def load_names(data_names, norm=True, log1p=False, verbose=True):
    """Read all datasets in ``data_names`` and collect them as CSR matrices.

    ``load_data(name)`` supplies a matrix and the matching gene list.  When
    ``norm`` is set the matrix goes through ``normalize(..., axis=1)``; when
    ``log1p`` is set it then goes through ``np.log1p``.  The result is
    always stored as a ``csr_matrix``.

    Returns:
        A 3-tuple: the list of matrices, the list of gene lists, and the
        summed row count of all matrices.
    """
    loaded = []
    gene_lists = []
    total_cells = 0

    for name in data_names:
        X_i, genes_i = load_data(name)
        if norm:
            X_i = normalize(X_i, axis=1)
        if log1p:
            X_i = np.log1p(X_i)
        X_i = csr_matrix(X_i)

        loaded.append(X_i)
        gene_lists.append(genes_i)
        total_cells += X_i.shape[0]

        if not verbose:
            continue
        # shape[1] is reported as genes, shape[0] as cells.
        message = 'Loaded {} with {} genes and {} cells'
        print(message.format(name, X_i.shape[1], X_i.shape[0]))

    if verbose:
        summary = 'Found {} cells among all datasets'
        print(summary.format(total_cells))

    return loaded, gene_lists, total_cells
示例6
def parse():
    """Define the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser()

    # Options that carry a value.
    parser.add_argument('dataset', type=str, help='pol or main')
    parser.add_argument('-n', '--n', type=int, default=1,
                        help='Number of grams')
    parser.add_argument('--min_count', type=int, default=1,
                        help='Min count')
    parser.add_argument('--embedding', type=str, default=CCGLOVE,
                        help='embedding file')
    parser.add_argument('--weights', type=str, default=None,
                        help='weights to use for ngrams (e.g. sif, None)')

    # Boolean switches; each is False unless given on the command line.
    switches = (
        (('-norm', '--normalize'), 'Normalize vectors'),
        (('-l', '--lower'), 'Whether or not to lowercase text'),
        (('-e', '--embed'), 'Use embeddings instead of bong'),
    )
    for flags, help_text in switches:
        parser.add_argument(*flags, action='store_true', help=help_text)

    return parser.parse_args()
示例7
def strip_accents_unicode(s):
    """Replace accented unicode characters by their unaccented base form.

    The string is decomposed with NFKD and every combining mark is dropped.
    If the decomposition leaves the string unchanged, the input object
    itself is returned.

    Warning: the filtering and joining are done in a Python-level loop,
    which makes this much slower than a basic ASCII-only normalization such
    as strip_accents_ascii.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    if decomposed != s:
        base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
        return ''.join(base_chars)
    # Nothing to strip: hand back the original object.
    return s
示例8
def _char_wb_ngrams(self, text_document):
    """Whitespace sensitive char-n-gram tokenization.

    Tokenize text_document into a sequence of character n-grams
    excluding any whitespace (operating only inside word boundaries).

    Reads ``self._white_spaces`` (a compiled pattern whose matches are
    replaced by a single space) and ``self.ngram_range`` (``(min_n, max_n)``,
    both inclusive).  Returns the n-grams as a list of strings, grouped by
    word and, within a word, by increasing ``n``.
    """
    # normalize white spaces
    text_document = self._white_spaces.sub(" ", text_document)

    min_n, max_n = self.ngram_range
    ngrams = []
    # Bound once because it is called in the innermost loop.
    ngrams_append = ngrams.append

    for w in text_document.split():
        # Pad with one space per side so n-grams can carry word boundaries.
        w = ' ' + w + ' '
        w_len = len(w)
        # ``range`` instead of the Python-2-only ``xrange``.
        for n in range(min_n, max_n + 1):
            offset = 0
            ngrams_append(w[offset:offset + n])
            while offset + n < w_len:
                offset += 1
                ngrams_append(w[offset:offset + n])
            if offset == 0:
                # The padded word is not longer than n: it was emitted once
                # as a whole, and larger n would only repeat it.
                break
    return ngrams
示例9
def __init__(self, word_vec_list, args, input_dimension=1500, hidden_dimensions=None):
    """Store the configuration, prepare the input vectors and build the graph.

    Args:
        word_vec_list: vectors to encode; reshaped to
            ``(len(word_vec_list), input_dimension)``.
        args: settings object; ``args.dim`` and ``args.encoder_normalize``
            are read here.
        input_dimension: width of each input vector.
        hidden_dimensions: layer sizes; defaults to
            ``[1024, 512, args.dim]`` when None.
    """
    self.session = load_session()
    self.args = args
    self.weights = {}
    self.biases = {}
    self.input_dimension = input_dimension

    if hidden_dimensions is not None:
        self.hidden_dimensions = hidden_dimensions
    else:
        self.hidden_dimensions = [1024, 512, self.args.dim]
    self.layer_num = len(self.hidden_dimensions)

    # Filled in later; initialised here so the attributes always exist.
    self.encoder_output = None
    self.decoder_output = None
    self.decoder_op = None

    n_vectors = len(word_vec_list)
    vectors = np.reshape(word_vec_list, [n_vectors, input_dimension])
    self.word_vec_list = vectors
    if self.args.encoder_normalize:
        self.word_vec_list = preprocessing.normalize(self.word_vec_list)

    self._init_graph()
    self._loss_optimizer()
    initializer = tf.global_variables_initializer()
    initializer.run(session=self.session)
示例10
def _generate_name_vectors_mat(self):
    """Assemble one literal vector per entity id into ``self.local_name_vectors``.

    The uri -> id dictionaries of both knowledge graphs are inverted and
    merged.  For every id ``0 .. len(self.entities) - 1`` the entity's local
    name is looked up and mapped to a row index of
    ``self.literal_vectors_mat``.  The selected rows, optionally normalized
    when ``self.args.literal_normalize`` is set, are stored on the instance.
    """
    num = len(self.entities)
    print("total entities:", num)

    # Invert uri -> id for both graphs into a single id -> uri mapping.
    kg1_ids = self.kgs.kg1.entities_id_dict
    kg2_ids = self.kgs.kg2.entities_id_dict
    entity_id_uris_dic = dict(zip(kg1_ids.values(), kg1_ids.keys()))
    entity_id_uris_dic.update(dict(zip(kg2_ids.values(), kg2_ids.keys())))
    print('total entities ids:', len(entity_id_uris_dic))
    assert len(entity_id_uris_dic) == num

    name_ordered_list = []
    for entity_id in range(num):
        assert entity_id in entity_id_uris_dic
        entity_uri = entity_id_uris_dic.get(entity_id)
        assert entity_uri in self.entity_local_name_dict
        entity_name = self.entity_local_name_dict.get(entity_uri)
        name_ordered_list.append(self.literal_id_dic.get(entity_name))
    print('name_ordered_list', len(name_ordered_list))

    # Row selection in entity-id order.
    name_mat = self.literal_vectors_mat[name_ordered_list, ]
    print("entity name embeddings mat:", type(name_mat), name_mat.shape)
    if self.args.literal_normalize:
        name_mat = preprocessing.normalize(name_mat)
    self.local_name_vectors = name_mat
示例11
def valid(model, embed_choice='avg', w=(1, 1, 1)):
    """Evaluate the chosen entity embeddings on the validation links.

    Args:
        model: object exposing the embedding tensors, ``session``, ``kgs``
            and ``args``.
        embed_choice: ``'nv'``, ``'rv'``, ``'av'`` or ``'final'`` select a
            single evaluated embedding; ``'avg'`` combines the name,
            ``rv`` and ``av`` embeddings weighted by ``w``; any other value
            uses ``model.ent_embeds`` without evaluating it.
        w: three weights used by the ``'avg'`` choice.

    Returns:
        The second value returned by ``eva.valid`` (named ``mrr_12`` here).
    """
    def _evaluated(tensor):
        return tensor.eval(session=model.session)

    if embed_choice == 'nv':
        ent_embeds = _evaluated(model.name_embeds)
    elif embed_choice == 'rv':
        ent_embeds = _evaluated(model.rv_ent_embeds)
    elif embed_choice == 'av':
        ent_embeds = _evaluated(model.av_ent_embeds)
    elif embed_choice == 'final':
        ent_embeds = _evaluated(model.ent_embeds)
    elif embed_choice == 'avg':
        ent_embeds = (w[0] * _evaluated(model.name_embeds)
                      + w[1] * _evaluated(model.rv_ent_embeds)
                      + w[2] * _evaluated(model.av_ent_embeds))
    else:
        # Unrecognised choice: fall back to the un-evaluated attribute.
        ent_embeds = model.ent_embeds

    print(embed_choice, 'valid results:')
    kgs = model.kgs
    embeds1 = ent_embeds[kgs.valid_entities1,]
    # Candidates are the validation targets followed by the test targets.
    embeds2 = ent_embeds[kgs.valid_entities2 + kgs.test_entities2,]
    _, mrr_12 = eva.valid(embeds1, embeds2, None, model.args.top_k,
                          model.args.test_threads_num, normalize=True)

    # Drop the slices before returning to release their memory early.
    del embeds1, embeds2
    gc.collect()
    return mrr_12
示例12
def test(model, embed_choice='avg', w=(1, 1, 1)):
    """Evaluate the chosen entity embeddings on the test links.

    ``embed_choice`` picks the embedding: ``'nv'``, ``'rv'``, ``'av'`` and
    ``'final'`` evaluate a single tensor in ``model.session``; ``'avg'``
    sums the name, ``rv`` and ``av`` embeddings scaled by ``w[0]``, ``w[1]``
    and ``w[2]``; anything else uses ``model.ent_embeds`` as is.

    Returns the second value returned by ``eva.valid`` (``mrr_12``).
    """
    if embed_choice == 'nv':
        ent_embeds = model.name_embeds.eval(session=model.session)
    elif embed_choice == 'rv':
        ent_embeds = model.rv_ent_embeds.eval(session=model.session)
    elif embed_choice == 'av':
        ent_embeds = model.av_ent_embeds.eval(session=model.session)
    elif embed_choice == 'final':
        ent_embeds = model.ent_embeds.eval(session=model.session)
    elif embed_choice == 'avg':
        name_part = w[0] * model.name_embeds.eval(session=model.session)
        rv_part = w[1] * model.rv_ent_embeds.eval(session=model.session)
        av_part = w[2] * model.av_ent_embeds.eval(session=model.session)
        ent_embeds = name_part + rv_part + av_part
    else:
        # Any other value: use the attribute without evaluating it.
        ent_embeds = model.ent_embeds

    print(embed_choice, 'test results:')
    source_embeds = ent_embeds[model.kgs.test_entities1,]
    target_embeds = ent_embeds[model.kgs.test_entities2,]
    result = eva.valid(source_embeds, target_embeds, None, model.args.top_k,
                       model.args.test_threads_num, normalize=True)
    hits1_12, mrr_12 = result

    # Release the slices before returning.
    del source_embeds, target_embeds
    gc.collect()
    return mrr_12
示例13
def _compute_weight(embeds1, embeds2, embeds3):
    """Return the mean row-wise cosine similarity of ``embeds1`` to the view average.

    The three embedding matrices are averaged element-wise; both the average
    and ``embeds1`` are then L2-normalized per row, so the dot product of
    matching rows is their cosine similarity.

    Args:
        embeds1: matrix whose agreement with the average is measured.
        embeds2, embeds3: the other two matrices; all three must have the
            same shape because they are added element-wise.

    Returns:
        The mean of the per-row similarities (a scalar).
    """
    other_embeds = preprocessing.normalize((embeds1 + embeds2 + embeds3) / 3)
    embeds1 = preprocessing.normalize(embeds1)

    # Only matching rows are needed.  Multiplying row by row gives the same
    # values as the diagonal of embeds1 @ other_embeds.T without
    # materialising the full n x n similarity matrix.
    weights = np.sum(embeds1 * other_embeds, axis=1)

    print(weights.shape, np.mean(weights))
    return np.mean(weights)
示例14
def _predict_proba(self, X):
    """Sum the normalized (optionally accuracy-weighted) votes of the ensemble.

    Each member's ``predict_proba(X)`` is skipped unless its sum is
    positive.  Otherwise the first row of its L1-normalized output is taken,
    multiplied by the member's accuracy when weighted voting is enabled and
    the accuracy is positive, and added to the running total.  The shorter
    of the two arrays is resized in place to the longer length first.

    Returns:
        A 1-D array of accumulated votes; ``[0.]`` when no member voted.
    """
    y_proba = np.asarray([0.])
    for learner in self.ensemble:
        votes = learner.predict_proba(X)
        if not np.sum(votes) > 0.0:
            continue

        votes = normalize(votes, norm='l1')[0].copy()
        acc = learner.performance_evaluator.accuracy_score()
        if not self.disable_weighted_vote and acc > 0.0:
            votes *= acc

        # Check array length consistency
        n_votes, n_total = len(votes), len(y_proba)
        if n_votes > n_total:
            y_proba.resize((n_votes, ), refcheck=False)
        elif n_votes < n_total:
            votes.resize((n_total, ), refcheck=False)

        # Add values
        y_proba += votes
    return y_proba
示例15
def _update_embedding(self, graph, original_embedding):
    r"""Performs the Network Embedding Update on the original embedding.

    The embedding is first passed through ``self._normalize_embedding``.
    Then, ``self.iterations`` times, the one-hop and two-hop propagations
    through the L1 row-normalized adjacency matrix are added to it, scaled
    by ``self.L1`` and ``self.L2`` respectively.

    Args:
        graph (NetworkX graph): The embedded graph; nodes are expected to be
            ``0 .. number_of_nodes - 1`` since that range is used as the
            node order.
        original_embedding (Numpy array): An array containing an embedding.

    Return types:
        embedding (Numpy array): An array containing the updated embedding.
    """
    embedding = self._normalize_embedding(original_embedding)

    node_order = range(graph.number_of_nodes())
    adjacency = nx.adjacency_matrix(graph, nodelist=node_order)
    walk = normalize(adjacency, norm='l1', axis=1)

    for _ in range(self.iterations):
        one_hop = self.L1*(walk @ embedding)
        two_hop = self.L2*(walk @ (walk @ embedding))
        embedding = (embedding + one_hop + two_hop)
    return embedding
示例16
def transform(self, X_si, high=None, low=None, limit=None):
    """Hash the documents in ``X_si`` into a binary feature matrix.

    Each document is passed through ``self._deal_with_input`` and the
    analyzer, then hashed.  Every stored value of the result is set to 1,
    the matrix is normalized with ``self.norm`` when that is not None, and
    ``self._limit_features`` is applied when ``low`` is truthy.

    ``high`` and ``limit`` are accepted but not used in this method.
    """
    analyzer = self.build_analyzer()
    hasher = self._get_hasher()
    token_streams = (analyzer(self._deal_with_input(doc)) for doc in X_si)
    X = hasher.transform(token_streams)

    # Binary occurrence indicators instead of hashed counts.
    X.data.fill(1)

    if self.norm is not None:
        X = normalize(X, norm=self.norm, copy=False)
    if low:
        X = self._limit_features(X, low=low)
    return X
示例17
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    dense_x = rng.random_sample((5, 4))
    dense_y = rng.random_sample((3, 4))
    sparse_x = csr_matrix(dense_x)
    sparse_y = csr_matrix(dense_y)

    # Dense and sparse inputs, each with and without an explicit Y.
    cases = (
        (dense_x, None),
        (dense_x, dense_y),
        (sparse_x, None),
        (sparse_x, sparse_y),
    )
    for left, right in cases:
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        cosine = pairwise_kernels(left, Y=right, metric="cosine")

        left_unit = normalize(left)
        right_unit = right
        if right is not None:
            right_unit = normalize(right)
        linear = pairwise_kernels(left_unit, Y=right_unit, metric="linear")

        assert_array_almost_equal(cosine, linear)
示例18
def vectorize(features, vocab):
    """ Transform a features list into a normalized numeric row vector

    :type features: iterable
    :param features: features to count; those missing from ``vocab``
        are ignored

    :type vocab: dict
    :param vocab: maps a feature to its column index; its length sets
        the width of the vector

    :return: the result of ``normalize`` applied to the 1 x len(vocab)
        count matrix
    """
    counts = lil_matrix((1, len(vocab)))
    for feat in features:
        try:
            column = vocab[feat]
        except KeyError:
            # Out-of-vocabulary feature: skip it.
            continue
        counts[0, column] += 1.0
    # Normalization
    return normalize(counts)
示例19
def process_data(datasets, genes, hvg=HVG, dimred=DIMRED, verbose=False):
    """Filter to highly variable genes, normalize, and optionally reduce.

    Args:
        datasets: list of matrices sharing the same columns; the list is
            modified in place (its entries are replaced).
        genes: NumPy array of column labels, one per column.
        hvg: number of highest-dispersion genes to keep; the filter is
            skipped when this is None, non-positive, or not smaller than
            ``len(genes)``.
        dimred: target dimension passed to ``dimensionality_reduce``; the
            reduction is skipped when this is not positive.
        verbose: print progress messages.

    Returns:
        ``(datasets_dimred, genes)`` when ``dimred > 0``, otherwise
        ``(datasets, genes)``.  ``genes`` lists the retained labels in the
        same order as the retained columns.
    """
    # Only keep highly variable genes
    if hvg is not None and 0 < hvg < len(genes):
        if verbose:
            print('Highly variable filter...')
        X = vstack(datasets)
        disp = dispersion(X)
        highest_disp_idx = np.argsort(disp[0])[::-1]
        top_genes = set(genes[highest_disp_idx[range(hvg)]])

        # The selection depends only on ``genes``, so compute it once
        # instead of once per dataset.
        gene_idx = [idx for idx, g_i in enumerate(genes) if g_i in top_genes]
        for i in range(len(datasets)):
            datasets[i] = datasets[i][:, gene_idx]

        # Take the labels with the same index list as the columns so the
        # two stay aligned even when ``genes`` is not sorted.  For a sorted
        # array of unique names this equals sorted(top_genes).
        genes = genes[gene_idx]

    # Normalize.
    if verbose:
        print('Normalizing...')
    for i, ds in enumerate(datasets):
        datasets[i] = normalize(ds, axis=1)

    # Compute compressed embedding.
    if dimred > 0:
        if verbose:
            print('Reducing dimension...')
        datasets_dimred = dimensionality_reduce(datasets, dimred=dimred)
        if verbose:
            print('Done processing.')
        return datasets_dimred, genes

    if verbose:
        print('Done processing.')

    return datasets, genes
# Plot t-SNE visualization.
示例20
def batch_bias(curr_ds, match_ds, bias, batch_size=None, sigma=SIGMA):
    """Average ``bias`` rows with RBF-kernel weights between the two datasets.

    For every row of ``curr_ds`` the rows of ``bias`` are combined with
    weights ``rbf_kernel(curr_ds, match_ds, gamma=0.5*sigma)`` scaled so the
    weights of each row sum to one.

    With ``batch_size`` set, ``match_ds`` is processed in consecutive chunks
    of that many rows, accumulating the weighted sums and the weight totals
    separately and dividing at the end, so the full kernel matrix is never
    held at once.  Weight totals are passed through
    ``handle_zeros_in_scale`` before the division.

    Returns:
        An array with one averaged bias row per row of ``curr_ds``.
    """
    gamma = 0.5*sigma

    if batch_size is None:
        weights = rbf_kernel(curr_ds, match_ds, gamma=gamma)
        weights = normalize(weights, axis=1, norm='l1')
        return np.dot(weights, bias)

    n_match = match_ds.shape[0]
    weighted_sum = np.zeros(curr_ds.shape)
    weight_total = np.zeros(curr_ds.shape[0])

    start = 0
    while start < n_match:
        stop = min(start + batch_size, n_match)
        batch_idx = range(start, stop)
        weights = rbf_kernel(curr_ds, match_ds[batch_idx, :], gamma=gamma)
        weighted_sum += np.dot(weights, bias[batch_idx, :])
        weight_total += np.sum(weights, axis=1)
        start += batch_size

    weight_total = handle_zeros_in_scale(weight_total, copy=False)
    weighted_sum /= weight_total[:, np.newaxis]
    return weighted_sum
# Compute nonlinear translation vectors between dataset
# and a reference.
示例21
def __init__(self, hps, example_list, dqn_batch_size, use_state_prime = False, max_art_oovs = 0):
    """
    Args:
      hps: seq2seq model parameters
      example_list: list of experiences
      dqn_batch_size: DDQN batch size
      use_state_prime: whether to use the next decoder state to make the batch or the current one
      max_art_oovs: number of OOV tokens in current batch

    Properties:
      _x: The input to DDQN model for training, this is basically the decoder output (dqn_batch_size, dqn_input_feature_len)
      _y: The Q-estimation (dqn_batch_size, vocab_size)
      _y_extended: The Q-estimation (dqn_batch_size, vocab_size + max_art_oovs)
    """
    vocab_size = hps.vocab_size
    self._x = np.zeros((dqn_batch_size, hps.dqn_input_feature_len))
    self._y = np.zeros((dqn_batch_size, vocab_size))
    self._y_extended = np.zeros((dqn_batch_size, vocab_size + max_art_oovs))

    for row, experience in enumerate(example_list):
        state = experience.state_prime if use_state_prime else experience.state
        self._x[row, :] = state

        # L1-normalized Q-values restricted to the fixed vocabulary.
        q_norm = normalize(experience.q_value[0:vocab_size], axis=1, norm='l1')
        self._y[row, :] = q_norm

        if max_art_oovs == 0:
            # No OOV columns: the extended target equals the plain one.
            self._y_extended[row, :] = q_norm
        else:
            self._y_extended[row, :] = experience.q_value
示例22
def avg_log_prob(self):
    """Return ``self.log_prob`` divided by the number of tokens.

    Dividing by the length keeps longer sequences from always scoring a
    lower probability than shorter ones.
    """
    token_count = len(self.tokens)
    return self.log_prob / token_count
示例23
def main():
    """Print the mean cross-validated ELM accuracy on 'australian' per hidden size.

    For each hidden-layer size, a shuffled 5-fold cross-validation is
    repeated 10 times and the mean of the fold averages is printed.
    """
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]
    n_repeats = 10

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    # Recode the integer labels as +1 (label 1) / -1 (everything else).
    signed_labels = []
    for label in data_set.target.astype(int):
        signed_labels.append(1 if label == 1 else -1)
    data_set.target = signed_labels

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)

        ave = 0
        for _ in range(n_repeats):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()
        ave /= n_repeats

        print("Accuracy: %0.3f " % (ave))
示例24
def __init__(self, params, normalize=False, whiten=True):
    """Store the run configuration taken from ``params``.

    Args:
        params: nested dict with the sections ``'dataset'``, ``'training'``,
            ``'cnn'`` and ``'predicting'``.
        normalize: stored as ``self.norm``.
        whiten: stored as ``self.whiten``.
    """
    self.model_id = common.get_next_model_id()
    self.norm = normalize
    self.whiten = whiten

    dataset = params['dataset']
    # Path stems are composed from the dataset section.
    self.x_path = '%s_%sx%s' % (dataset['dataset'], dataset['npatches'], dataset['window'])
    self.y_path = '%s_%s_%s' % (dataset['fact'], dataset['dim'], dataset['dataset'])

    self.dataset_settings = params['dataset']
    self.training_params = params['training']
    self.model_arch = params['cnn']
    self.predicting_params = params['predicting']
示例25
def batch_block_generator(params, y_path, N_train, id2gt, X_meta=None,
                          val_from_file=False):
    """Endlessly yield ``(x_batch, y_batch)`` training batches from an HDF5 file.

    The training patches file is read in blocks of 50000 rows.  Rows whose
    index entry is empty are dropped, targets are looked up in ``id2gt``,
    the block is shuffled, and full batches of
    ``params['training']['n_minibatch']`` rows are yielded.  A trailing
    partial batch of a block is skipped.  After the last block the file is
    read again from the start.

    Args:
        params: configuration dict; ``params['dataset']`` (``dataset``,
            ``npatches``, ``window``) builds the file name and
            ``params['training']`` supplies ``n_minibatch`` and
            ``normalize_y``.
        y_path: unused in this function.
        N_train: number of leading rows of the file to use.
        id2gt: mapping from an index entry to its target vector.
        X_meta: optional extra input; when given, each ``x_batch`` is the
            list ``[features, X_meta[rows]]``.
        val_from_file: unused in this function.

    Yields:
        ``(x_batch, y_batch)`` tuples.
    """
    hdf5_file = common.PATCHES_DIR+"/patches_train_%s_%sx%s.hdf5" % (params['dataset']['dataset'],params['dataset']['npatches'],params['dataset']['window'])
    block_step = 50000
    batch_size = params['training']['n_minibatch']
    randomize = True
    # Identity test: ``!= None`` on an array compares element-wise and
    # cannot be used as a condition.
    with_meta = X_meta is not None

    # The context manager closes the file when the generator is closed or
    # garbage-collected.
    with h5py.File(hdf5_file, "r") as f:
        while True:
            for i in range(0, N_train, block_step):
                block_end = min(N_train, i+block_step)
                x_block = f['features'][i:block_end]
                index_block = f['index'][i:block_end]

                # Drop rows that have no index entry.
                empty_rows = np.where(index_block == "")
                x_block = np.delete(x_block, empty_rows, axis=0)
                index_block = np.delete(index_block, empty_rows)

                y_block = np.asarray([id2gt[item_id] for item_id in index_block])
                if params['training']['normalize_y']:
                    # Keep the returned array: the in-place update only
                    # reaches y_block when no dtype conversion is needed.
                    y_block = normalize(y_block, copy=False)

                # A real list is required: shuffle() needs item assignment
                # and the slices below are used for fancy indexing.
                items_list = list(range(x_block.shape[0]))
                if randomize:
                    random.shuffle(items_list)

                for j in range(0, len(items_list), batch_size):
                    if j+batch_size > x_block.shape[0]:
                        # Not enough rows left for a full batch.
                        continue
                    items_in_batch = items_list[j:j+batch_size]
                    x_batch = x_block[items_in_batch]
                    y_batch = y_block[items_in_batch]
                    if with_meta:
                        # NOTE(review): items_in_batch holds positions inside
                        # the current (filtered) block, so X_meta is indexed
                        # block-locally - confirm this is intended when
                        # N_train exceeds one block.
                        x_batch = [x_batch, X_meta[items_in_batch]]
                    yield (x_batch, y_batch)
示例26
def combine_vectors(order1_input_file, order2_input_file, output_file):
    """Concatenate two embedding files line by line, normalize, and write them.

    Both inputs are whitespace-separated text files whose lines start with a
    key followed by the vector components.  The files are read in lockstep
    until either one ends.  A pair of lines that both have exactly two
    fields is treated as a header and skipped.  For every other pair the
    keys must match and the two vectors are concatenated.

    The combined vectors are passed through ``preprocessing.normalize`` and
    written to ``output_file`` as a header line ``"<count> <length>"``
    followed by one ``"<key> <v1> <v2> ..."`` line per vector.

    Raises:
        AssertionError: if the keys of a pair of lines differ.
        IndexError: if a line is blank or no vectors were read.
    """
    vectors = []
    keys = []

    # ``with`` closes both inputs even when a line fails validation.
    with open(order1_input_file, 'r') as o1_in_file, \
            open(order2_input_file, 'r') as o2_in_file:
        for o1_raw, o2_raw in zip(o1_in_file, o2_in_file):
            o1_line = o1_raw.split()
            o2_line = o2_raw.split()
            assert(o1_line[0] == o2_line[0]), "%s and %s are not the same." % (o1_line[0], o2_line[0])

            if len(o1_line) == len(o2_line) and len(o1_line) == 2:
                print("WARNING: Skipping a line because it appears to be header line.")
                continue

            # Convert explicitly so normalization receives numbers rather
            # than relying on implicit string-to-float coercion.
            vectors.append([float(val) for val in o1_line[1:] + o2_line[1:]])
            keys.append(o1_line[0])

    vector_length = len(vectors[0])
    vector_cnt = len(vectors)
    vectors = preprocessing.normalize(vectors)

    # Collect the lines and join once instead of growing a string.
    lines = ["%s %s\n" % (vector_cnt, vector_length)]
    for key, vector in zip(keys, vectors):
        lines.append("%s %s\n" % (key, ' '.join([str(num) for num in vector])))

    # ``with`` guarantees the output is flushed and closed.
    with open(output_file, 'w') as out_file:
        out_file.write(''.join(lines))
示例27
def pca_components(X, dim):
    """Return the leading PCA components of ``X`` as normalized columns.

    ``X`` is reshaped to ``(len(X), dim)`` and a ``dim``-component PCA is
    fitted.  The transposed component matrix is normalized along axis 0 and
    its first ``args.num_comp`` columns are returned (``args`` is read from
    the enclosing module).
    """
    samples = X.reshape((len(X), dim))
    pca = PCA(n_components=dim)
    pca.fit(samples)

    # One component per column after the transpose.
    basis = normalize(pca.components_.T, axis=0)
    return basis[:, :args.num_comp]
示例28
def pca_components(X, dim):
    """Fit a PCA on ``X`` and return its first components, column-normalized.

    The input is viewed as ``len(X)`` rows of ``dim`` values.  All ``dim``
    components are fitted, transposed so each is a column, normalized along
    axis 0, and truncated to the first ``args.num_comp`` columns (``args``
    comes from module scope).
    """
    n_rows = len(X)
    X = X.reshape((n_rows, dim))

    model = PCA(n_components=dim)
    model.fit(X)

    components_as_columns = model.components_.T
    unit_columns = normalize(components_as_columns, axis=0)
    return unit_columns[:, :args.num_comp]
示例29
def normalize_l2(x):
    """Return ``x`` normalized with the L2 norm via ``preprocessing.normalize``."""
    # 'l2' is the documented default; spelled out to mirror normalize_l1.
    return preprocessing.normalize(x, norm='l2')
示例30
def normalize_l1(x):
    """Return ``x`` normalized with the L1 norm via ``preprocessing.normalize``."""
    l1_normalized = preprocessing.normalize(x, norm='l1')
    return l1_normalized