Python source code examples: sklearn.decomposition.TruncatedSVD()
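The snippets below are collected from open-source projects. As a baseline for reading them, here is a minimal, self-contained sketch of the TruncatedSVD API they all rely on (the data here is synthetic):

from scipy.sparse import random as sparse_random
from sklearn.decomposition import TruncatedSVD

X = sparse_random(100, 50, density=0.1, random_state=42)  # works on sparse input, no centering needed
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_reduced = svd.fit_transform(X)             # shape (100, 5)
print(svd.components_.shape)                 # (5, 50): the right singular vectors
print(svd.explained_variance_ratio_.sum())   # fraction of variance retained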
Example 1
def fit_transform(self, documents):
    # Vectorizer will be False if the pipeline hasn't been fit yet;
    # trigger fit_transform and save the vectorizer and lexicon.
    if self.vectorizer is False:
        self.lexicon = self.pipeline.fit_transform(documents)
        self.vect = self.pipeline.named_steps['tfidf']
        self.knn = self.pipeline.named_steps['knn']
        self.save()
    # If there's a stored vectorizer and prefitted lexicon,
    # use them instead.
    else:
        self.vect = self.vectorizer
        self.knn = Pipeline([
            ('svd', TruncatedSVD(n_components=100)),
            ('knn', KNNTransformer(k=self.k, algorithm='ball_tree'))
        ])
        self.knn.fit_transform(self.lexicon)
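KNNTransformer and the surrounding pipeline are project-specific classes, not scikit-learn built-ins. For reference, a minimal tf-idf + TruncatedSVD (LSA) pipeline using only scikit-learn might look like this sketch (the toy documents are made up):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ["the cat sat", "the dog sat", "a bird flew", "the cat flew"]
lsa = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=2)),  # n_components must be < number of tf-idf features
])
X = lsa.fit_transform(docs)  # dense array of shape (4, 2)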
Example 2
def optimize(self):
    """
    Learning an embedding.
    """
    print("\nOptimization started.\n")
    self.embeddings = []
    for step in tqdm(range(self.args.order)):
        target_matrix = self._create_target_matrix()
        svd = TruncatedSVD(n_components=self.args.dimensions,
                           n_iter=self.args.iterations,
                           random_state=self.args.seed)
        svd.fit(target_matrix)
        embedding = svd.transform(target_matrix)
        self.embeddings.append(embedding)
Example 3
def test_resolve_embeddings(self):
    tdm = self.corpus.get_unigram_corpus().select(ClassPercentageCompactor(term_count=1))
    embeddings_resolver = EmbeddingsResolver(tdm)
    # embeddings = TruncatedSVD(n_components=20).fit_transform(tdm.get_term_doc_mat().T).T
    # embeddings_resolver.set_embeddings(embeddings)
    embeddings_resolver = embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
    if self.assertRaisesRegex:
        with self.assertRaisesRegex(Exception,
                                    "You have already set embeddings by running set_embeddings or set_embeddings_model."):
            embeddings_resolver.set_embeddings_model(None)
    embeddings_resolver = EmbeddingsResolver(tdm)
    embeddings_resolver = embeddings_resolver.set_embeddings_model(MockWord2Vec(tdm.get_terms()))
    if self.assertRaisesRegex:
        with self.assertRaisesRegex(Exception,
                                    "You have already set embeddings by running set_embeddings or set_embeddings_model."):
            embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
    c, axes = embeddings_resolver.project_embeddings(projection_model=TruncatedSVD(3))
    self.assertIsInstance(c, ParsedCorpus)
    self.assertEqual(axes.to_dict(),
                     pd.DataFrame(index=['speak'], data={'x': [0.], 'y': [0.]}).to_dict())
Example 4
def test_selective_tsvd():
    original = X
    cols = [original.columns[0], original.columns[1]]  # only perform on the first two columns...
    # Note: DataFrame.as_matrix() was removed in pandas 1.0; use .to_numpy() on modern pandas.
    compare_cols = np.array(
        original[['petal length (cm)', 'petal width (cm)']].as_matrix())  # should equal the untransformed cols
    transformer = SelectiveTruncatedSVD(cols=cols, n_components=1).fit(original)
    transformed = transformer.transform(original)
    untouched_cols = np.array(transformed[['petal length (cm)', 'petal width (cm)']].as_matrix())
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'Concept1' in transformed.columns
    assert transformed.shape[1] == 3
    assert isinstance(transformer.get_decomposition(), TruncatedSVD)
    assert SelectiveTruncatedSVD().get_decomposition() is None  # default None
    # test the selective mixin
    assert isinstance(transformer.cols, list)
Example 5
def transform(self):
    # ngrams
    obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
    target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
    # cooccurrence ngrams
    cooc_terms = list(map(lambda lst1, lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
    ## tfidf
    tfidf = self._init_word_ngram_tfidf(ngram=1)
    X = tfidf.fit_transform(cooc_terms)
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)

# 2nd in CrowdFlower (preprocessing_mikhail.py)
Example 6
def transform(self):
    ## get common vocabulary
    tfidf = self._init_word_ngram_tfidf(self.ngram)
    tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
    vocabulary = tfidf.vocabulary_
    ## obs tfidf
    tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
    X_obs = tfidf.fit_transform(self.obs_corpus)
    ## target tfidf
    tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
    X_target = tfidf.fit_transform(self.target_corpus)
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    svd.fit(scipy.sparse.vstack((X_obs, X_target)))
    X_obs = svd.transform(X_obs)
    X_target = svd.transform(X_target)
    ## cosine similarity
    sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
    sim = np.asarray(sim).squeeze()
    return sim
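dist_utils._cosine_sim is a project-internal helper; an equivalent row-wise cosine similarity, as a sketch under that assumption:

import numpy as np

def _cosine_sim(u, v, eps=1e-12):
    # cosine similarity between two 1-D dense vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + eps))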
Example 7
def _reduce_dimensions(self, X):
    """
    Reduce dimensionality using truncated SVD.

    Arg types:
        * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

    Return types:
        * **X** *(Numpy array)* - The reduced feature matrix of nodes.
    """
    svd = TruncatedSVD(n_components=self.reduction_dimensions,
                       n_iter=self.svd_iterations,
                       random_state=self.seed)
    svd.fit(X)
    X = svd.transform(X)
    return X
Example 8
def _create_reduced_features(self, X):
    """
    Creating a dense reduced node feature matrix.

    Arg types:
        * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

    Return types:
        * **T** *(Numpy array)* - The reduced feature matrix of nodes.
    """
    svd = TruncatedSVD(n_components=self.reduction_dimensions,
                       n_iter=self.svd_iterations,
                       random_state=self.seed)
    svd.fit(X)
    T = svd.transform(X)
    return T.T
Example 9
def fit_truncatedSVD(data):
    '''
    Fit the model with truncated SVD principal components
    '''
    # keyword parameters for the truncated SVD
    kwrd_params = {
        'algorithm': 'randomized',
        'n_components': 5,
        'n_iter': 5,
        'random_state': 42,
        'tol': 0.0
    }
    # reduce the data
    reduced = reduceDimensions(cd.TruncatedSVD,
                               data, **kwrd_params)
    # prepare the data for the classifier
    data_l = prepare_data(data, reduced,
                          kwrd_params['n_components'])
    # fit the model
    class_fit_predict_print(data_l)

# the file name of the dataset
Example 10
def test_random_hasher():
    # Test random forest hashing on the circles dataset: make sure that
    # it is linearly separable even after projection to two SVD dimensions.
    # Note: not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)
    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())
    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.)
Example 11
def test_truncated_svd_eq_pca():
    # TruncatedSVD should be equal to PCA on centered data
    X_c = X - X.mean(axis=0)
    params = dict(n_components=10, random_state=42)
    svd = TruncatedSVD(algorithm='arpack', **params)
    pca = PCA(svd_solver='arpack', **params)
    Xt_svd = svd.fit_transform(X_c)
    Xt_pca = pca.fit_transform(X_c)
    assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_)
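This test works because PCA is exactly the SVD of the column-centered data matrix, so once X is centered the two estimators compute the same factorization. The identity can be checked directly in NumPy (synthetic data, a sketch):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(40, 8))
X_c = X - X.mean(axis=0)
U, S, Vt = np.linalg.svd(X_c, full_matrices=False)
# The principal components are the right singular vectors of the
# centered matrix, and the PCA scores are U * S.
scores = U * S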
Example 12
def dim_reduction_method(self):
    """
    Select the dimensionality reduction method.
    """
    if self.dim_reduction == 'pca':
        return PCA()
    elif self.dim_reduction == 'factor-analysis':
        return FactorAnalysis()
    elif self.dim_reduction == 'fast-ica':
        return FastICA()
    elif self.dim_reduction == 'kernel-pca':
        return KernelPCA()
    elif self.dim_reduction == 'sparse-pca':
        return SparsePCA()
    elif self.dim_reduction == 'truncated-svd':
        return TruncatedSVD()
    elif self.dim_reduction is not None:
        raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: '
                         '"pca", "factor-analysis", "fast-ica", "kernel-pca", "sparse-pca", "truncated-svd".'
                         % self.dim_reduction)
Example 13
def plot_z_run(z_run, label):
    f1, ax1 = plt.subplots(2, 1)
    # First fit a PCA-style projection (TruncatedSVD)
    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')
    # Then fit a t-SNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')
    plt.show()
    return
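Note the variable name: TruncatedSVD stands in for PCA here, but it does not center the data, so on uncentered input the two projections differ. A quick check on synthetic data (a sketch):

import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

rng = np.random.default_rng(1)
Z = rng.normal(loc=5.0, size=(200, 10))    # deliberately uncentered
p = PCA(n_components=2).fit_transform(Z)
t = TruncatedSVD(n_components=2).fit_transform(Z)
print(np.allclose(np.abs(p), np.abs(t)))   # False: TruncatedSVD skips the centering step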
Example 14
def __init__(self, k=3, **kwargs):
    self.k = k
    self.pipeline = Pipeline([
        ('norm', TextNormalizer(minimum=10, maximum=100)),
        ('tfidf', TfidfVectorizer()),
        ('knn', Pipeline([
            ('svd', TruncatedSVD(n_components=100)),
            ('model', KNNTransformer(k=self.k, algorithm='ball_tree'))
        ]))
    ])
    self.lex_path = "lexicon.pkl"
    self.vect_path = "vect.pkl"
    self.vectorizer = False
    self.lexicon = None
    self.load()
Example 15
def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_topics = n_topics
    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        # Note: newer scikit-learn versions renamed n_topics to n_components.
        self.estimator = LatentDirichletAllocation(n_topics=self.n_topics)
    self.model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
Example 16
def create_pipeline(estimator, reduction=False):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]
    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))
    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
Example 17
def reduce_dimensionality(X, n_features):
    """
    Apply PCA or SVD to reduce dimension to n_features.
    :param X: the feature matrix
    :param n_features: the target number of features
    :return: the reduced matrix
    """
    # Initialize reduction method: PCA or SVD
    # reducer = PCA(n_components=n_features)
    reducer = TruncatedSVD(n_components=n_features)
    # Fit and transform data to n_features-dimensional space
    reducer.fit(X)
    X = reducer.transform(X)
    logging.debug("Reduced number of features to {0}".format(n_features))
    logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
    return X
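A common companion pattern, not taken from the source: choose the smallest n_components whose cumulative explained variance reaches a target. A sketch, with the cap and threshold as made-up defaults and assuming the target is reachable under the cap:

import numpy as np
from sklearn.decomposition import TruncatedSVD

def n_components_for(X, target=0.95, cap=200):
    # Fit once with a generous cap, then keep the smallest rank whose
    # cumulative explained variance reaches the target.
    svd = TruncatedSVD(n_components=min(cap, X.shape[1] - 1)).fit(X)
    cumulative = np.cumsum(svd.explained_variance_ratio_)
    return int(np.searchsorted(cumulative, target) + 1)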
Example 18
def do_tfidf_feature(df, tfidf):
    n_components = 30
    svd = TruncatedSVD(
        n_components=n_components, algorithm="arpack", random_state=2019
    )
    col_tfidf = tfidf.transform(df["col"])
    feature_names = tfidf.get_feature_names()
    # Early return: hand back the raw tf-idf frame, which leaves the SVD
    # branch below unreachable. Drop this return to emit the SVD features.
    ret_df = pd.DataFrame(col_tfidf.toarray(), columns=feature_names)
    return ret_df
    col_svd = svd.fit_transform(col_tfidf)
    best_features = [
        feature_names[i] + "i" for i in svd.components_[0].argsort()[::-1]
    ]
    ret_df = pd.DataFrame(col_svd, columns=best_features[:n_components])
    return ret_df
Example 19
def __init__(self, **kwargs):
    super().__init__()
    self.truncated_svd = decomposition.TruncatedSVD(**kwargs)
Example 20
def create_spectral_features(args, positive_edges, negative_edges, node_count):
    """
    Creating spectral node features using the train dataset edges.
    :param args: Arguments object.
    :param positive_edges: Positive edges list.
    :param negative_edges: Negative edges list.
    :param node_count: Number of nodes.
    :return X: Node features.
    """
    p_edges = positive_edges + [[edge[1], edge[0]] for edge in positive_edges]
    n_edges = negative_edges + [[edge[1], edge[0]] for edge in negative_edges]
    train_edges = p_edges + n_edges
    index_1 = [edge[0] for edge in train_edges]
    index_2 = [edge[1] for edge in train_edges]
    values = [1] * len(p_edges) + [-1] * len(n_edges)
    shaping = (node_count, node_count)
    signed_A = sparse.csr_matrix(sparse.coo_matrix((values, (index_1, index_2)),
                                                   shape=shaping,
                                                   dtype=np.float32))
    svd = TruncatedSVD(n_components=args.reduction_dimensions,
                       n_iter=args.reduction_iterations,
                       random_state=args.seed)
    svd.fit(signed_A)
    X = svd.components_.T
    return X
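The same recipe without the repo's Arguments object, as a self-contained sketch (the edge lists, node count, and dimensions here are made up):

import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

pos = [[0, 1], [1, 2]]   # hypothetical positive edges
neg = [[2, 3]]           # hypothetical negative edges
n = 4
edges = pos + neg
rows = [e[0] for e in edges] + [e[1] for e in edges]
cols = [e[1] for e in edges] + [e[0] for e in edges]
vals = ([1.0] * len(pos) + [-1.0] * len(neg)) * 2
A = sparse.coo_matrix((vals, (rows, cols)), shape=(n, n), dtype=np.float32).tocsr()
svd = TruncatedSVD(n_components=2, n_iter=20, random_state=42).fit(A)
X = svd.components_.T    # node features, shape (n, 2)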
Example 21
def create_spectral_features(self, pos_edge_index, neg_edge_index,
                             num_nodes=None):
    r"""Creates :obj:`in_channels` spectral node features based on
    positive and negative edges.

    Args:
        pos_edge_index (LongTensor): The positive edge indices.
        neg_edge_index (LongTensor): The negative edge indices.
        num_nodes (int, optional): The number of nodes, *i.e.*
            :obj:`max_val + 1` of :attr:`pos_edge_index` and
            :attr:`neg_edge_index`. (default: :obj:`None`)
    """
    edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
    N = edge_index.max().item() + 1 if num_nodes is None else num_nodes
    edge_index = edge_index.to(torch.device('cpu'))

    pos_val = torch.full((pos_edge_index.size(1), ), 2, dtype=torch.float)
    neg_val = torch.full((neg_edge_index.size(1), ), 0, dtype=torch.float)
    val = torch.cat([pos_val, neg_val], dim=0)

    row, col = edge_index
    edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)
    val = torch.cat([val, val], dim=0)

    edge_index, val = coalesce(edge_index, val, N, N)
    val = val - 1

    # Borrowed from:
    # https://github.com/benedekrozemberczki/SGCN/blob/master/src/utils.py
    edge_index = edge_index.detach().numpy()
    val = val.detach().numpy()
    A = scipy.sparse.coo_matrix((val, edge_index), shape=(N, N))
    svd = TruncatedSVD(n_components=self.in_channels, n_iter=128)
    svd.fit(A)
    x = svd.components_.T
    return torch.from_numpy(x).to(torch.float).to(pos_edge_index.device)
Example 22
def train(self, x, y, textual_evidence=None):
    if self.TEXT:
        x = textual_evidence
        x = [str(i) for i in x]  # make sure every word is a string
    self.train_x = x
    self.train_y = y
    # VECTORIZATION
    print("vectorization..")
    # X = np.concatenate([train['triples_prep'], test['triples_prep']])
    self.count_vect = CountVectorizer().fit(x)
    x = self.count_vect.transform(x)
    self.tf_transformer = TfidfTransformer().fit(x)
    x = self.tf_transformer.transform(x)
    self.svd = TruncatedSVD(n_components=self.N_COMPONENTS).fit(x)
    x = self.svd.transform(x)
    # CLUSTERING
    print("clustering..")
    # Note: recent scikit-learn requires keyword arguments here,
    # i.e. NearestNeighbors(n_neighbors=self.K, radius=self.RADIUS).
    self.neigh = NearestNeighbors(self.K, self.RADIUS)
    self.neigh.fit(x)  # clustering only on the training set
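The CountVectorizer plus TfidfTransformer pair above is equivalent to a single TfidfVectorizer; the same vectorize-then-reduce chain can be condensed as follows (a sketch with toy texts):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

texts = ["some training text", "more training text", "and a third sample"]
reducer = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
X = reducer.fit_transform(texts)  # shape (3, 2)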
Example 23
def fit_base_SVD_model(self):
    """
    Reducing the dimensionality with SVD in the 1st step.
    """
    self.P = self.P.dot(self.X)
    self.model = TruncatedSVD(n_components=self.args.dimensions,
                              n_iter=70,
                              random_state=42)
    # The explicit fit is redundant: fit_transform below refits the model.
    self.model.fit(self.P)
    self.P = self.model.fit_transform(self.P)
Example 24
def fit(self, X, y=None):
    """Fit the transformer.

    Parameters
    ----------
    X : Pandas ``DataFrame``, shape=(n_samples, n_features)
        The Pandas frame to fit. The frame will only
        be fit on the prescribed ``cols`` (see ``__init__``) or
        all of them if ``cols`` is None. Furthermore, ``X`` will
        not be altered in the process of the fit.

    y : None
        Passthrough for ``sklearn.pipeline.Pipeline``. Even
        if explicitly set, will not change behavior of ``fit``.

    Returns
    -------
    self
    """
    # check on state of X and cols
    X, self.cols = validate_is_pd(X, self.cols)
    cols = _cols_if_none(X, self.cols)

    # fails through if names don't exist:
    self.svd_ = TruncatedSVD(
        n_components=self.n_components,
        algorithm=self.algorithm,
        n_iter=self.n_iter).fit(X[cols].as_matrix())

    return self
Example 25
def get_decomposition(self):
    """Overridden from the :class:`skutil.decomposition.decompose._BaseSelectiveDecomposer` class,
    this method returns the internal decomposition class:
    ``sklearn.decomposition.TruncatedSVD``

    Returns
    -------
    self.svd_ : ``sklearn.decomposition.TruncatedSVD``
        The fit internal decomposition class
    """
    return self.svd_ if hasattr(self, 'svd_') else None
Example 26
def transform(self):
    tfidf = self._init_word_ngram_tfidf(self.ngram)
    X = tfidf.fit_transform(self.obs_corpus)
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)
Example 27
def transform(self):
    tfidf = self._init_char_ngram_tfidf(self.ngram)
    X = tfidf.fit_transform(self.obs_corpus)
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)

# ------------------------ Cooccurrence LSA -------------------------------
# 1st in CrowdFlower
Example 28
def transform(self):
    ## tfidf
    tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
    X_obs = tfidf.fit_transform(self.obs_corpus)
    X_target = tfidf.fit_transform(self.target_corpus)
    X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    X_svd = svd.fit_transform(X_tfidf)
    return X_svd

# -------------------------------- TSNE ------------------------------------------
# 2nd in CrowdFlower (preprocessing_mikhail.py)
Example 29
def _create_single_embedding(self, target_matrix):
    """
    Fitting a single SVD embedding of a PMI matrix.
    """
    svd = TruncatedSVD(n_components=self.dimensions,
                       n_iter=self.iterations,
                       random_state=self.seed)
    svd.fit(target_matrix)
    embedding = svd.transform(target_matrix)
    self._embeddings.append(embedding)
Example 30
def _create_embedding(self, target_matrix):
    """
    Fitting a truncated SVD embedding of a PMI matrix.
    """
    svd = TruncatedSVD(n_components=self.dimensions,
                       n_iter=self.iterations,
                       random_state=self.seed)
    svd.fit(target_matrix)
    embedding = svd.transform(target_matrix)
    return embedding