Python源码示例:sklearn.decomposition.NMF
示例1
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    """Fit an NMF topic model on ``poem_words`` and write the results to disk.

    The documents are vectorized with the module-level ``count_vect``,
    re-weighted with TF-IDF and factorized into ``n_topic`` components.

    Args:
        ftopics: Path of the UTF-8 text file that receives one line per
            topic: that topic's highest-weighted words, space separated.
        fwords: Path of the pickle file that receives the vocabulary list.
        ftopics_words: Path of the pickle file that receives
            ``nmf.components_``.
        poem_words: Documents handed to ``count_vect.fit_transform``.
        n_topic: Number of NMF components.
        n_topic_words: Number of top words written per topic.
    """
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    # Recent scikit-learn releases only offer get_feature_names_out();
    # fall back to the older accessor otherwise. ``tolist()`` keeps the
    # pickled vocabulary a plain list of str in both cases.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out().tolist()
    else:
        feature_names = count_vect.get_feature_names()
    # ``with`` guarantees each file is closed even if a write fails.
    with codecs.open(ftopics, 'w', 'utf-8') as fw:
        for topic in nmf.components_:
            # argsort is ascending; the reversed tail slice yields the
            # n_topic_words largest weights, best first.
            top_word_ids = topic.argsort()[:-n_topic_words - 1:-1]
            fw.write(' '.join([feature_names[i] for i in top_word_ids]) + '\n')
    print('Write topics done.')
    with codecs.open(fwords, 'wb') as fw:
        pickle.dump(feature_names, fw)
    print('Write words done.')
    with codecs.open(ftopics_words, 'wb') as fw:
        pickle.dump(nmf.components_, fw)
    print('Write topic_words done.')
示例2
def _fit_and_score_NMF(self, new_residuals):
    """
    Factorize a residual matrix and score the stored residual entries.

    Arg types:
        * **new_residuals** *(Scipy sparse matrix)* - The residual matrix passed to ``NMF.fit_transform``.

    Return types:
        * **scores** *(CSR Scipy matrix)* - Stored residual values minus their NMF approximation, clipped at zero.
        * **W** *(Numpy array)* - The embedding matrix returned by ``fit_transform``.
    """
    factorizer = NMF(n_components=self.dimensions,
                     init="random",
                     verbose=False,
                     alpha=self.alpha)
    embedding = factorizer.fit_transform(new_residuals)
    basis = factorizer.components_
    # Approximate only the entries addressed by the stored (row, column)
    # index pairs instead of materializing the full product.
    row_factors = embedding[self._index_1, :]
    column_factors = basis[:, self._index_2].T
    approximation = np.multiply(row_factors, column_factors).sum(axis=1)
    # Negative differences are clipped to zero.
    remaining = np.maximum(self._residuals.data - approximation, 0)
    score_matrix = sparse.csr_matrix(
        (remaining, (self._index_1, self._index_2)),
        shape=self._shape,
        dtype=np.float32)
    return score_matrix, embedding
示例3
def __init__(self, n_topics=50, estimator='LDA'):
    """
    Build a topic-modelling pipeline.

    n_topics is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA',
    To use Non-Negative Matrix Factorization, set estimator to 'NMF',
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').

    The chosen estimator is stored on ``self.estimator`` and used as the
    last step of ``self.model``, after text normalization and count
    vectorization of pre-tokenized input.
    """
    self.n_topics = n_topics
    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        # LatentDirichletAllocation takes ``n_components`` like the other
        # two estimators; the former ``n_topics`` keyword is rejected by
        # current scikit-learn releases.
        self.estimator = LatentDirichletAllocation(n_components=self.n_topics)
    self.model = Pipeline([
        ('norm', TextNormalizer()),
        # The step name is kept as 'tfidf' for compatibility even though
        # the step is a CountVectorizer.
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
示例4
def fit_and_score_NMF(self, new_residuals):
    """
    Factorize a residual matrix and score the stored residual entries.

    :param new_residuals: Input target matrix passed to ``NMF.fit_transform``.
    :return scores: CSR matrix of stored residuals minus their NMF approximation, clipped at zero.
    :return W: Embedding matrix returned by ``fit_transform``.
    """
    factorizer = NMF(n_components=self.args.dimensions,
                     init="random",
                     verbose=False,
                     alpha=self.args.alpha)
    embedding = factorizer.fit_transform(new_residuals)
    basis = factorizer.components_
    print("Scoring started.\n")
    # Approximate only the entries addressed by the stored (row, column)
    # index pairs.
    row_factors = embedding[self.index_1, :]
    column_factors = basis[:, self.index_2].T
    approximation = np.multiply(row_factors, column_factors).sum(axis=1)
    # Negative differences are clipped to zero.
    remaining = np.maximum(self.residuals.data - approximation, 0)
    score_matrix = sparse.csr_matrix(
        (remaining, (self.index_1, self.index_2)),
        shape=self.shape,
        dtype=np.float32)
    return score_matrix, embedding
示例5
def factorize_nmf():
    """Read the newsgroups document-term matrix and factorize it with NMF.

    The matrix is read from a hard-coded Matrix Market file. The resulting
    document-topic and topic-term matrices are only bound to local names;
    nothing is returned or written back to disk.
    """
    print('factorizing matrix')
    newsgroups_mmf_file = '/Users/fpena/tmp/nmf_graphlab/newsgroups/newsgroups_matrix.mmf'
    term_counts = mmread(newsgroups_mmf_file)
    nmf_model = decomposition.NMF(
        init="nndsvd",
        n_components=Constants.TOPIC_MODEL_NUM_TOPICS,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
        alpha=Constants.NMF_REGULARIZATION,
        l1_ratio=Constants.NMF_REGULARIZATION_RATIO,
    )
    document_topic_matrix = nmf_model.fit_transform(term_counts)
    topic_term_matrix = nmf_model.components_
示例6
def train_nmf(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    """Fit an NMF topic model on TF-IDF features of ``corpus``.

    Args:
        corpus: Iterable of documents. It is not modified.
        n_topics: Number of NMF components.
        max_df: Passed to ``TfidfVectorizer``.
        min_df: Passed to ``TfidfVectorizer``.
        cleaning: Callable applied to every document before vectorizing,
            or ``None`` to use the documents as given.
        stop_words: Passed to ``TfidfVectorizer``.

    Returns:
        ``TOPIC(tfidf_features, nmf)`` built from the vectorizer's feature
        names and the fitted NMF model.
    """
    # Clean into a new list rather than overwriting the caller's corpus
    # element by element.
    if cleaning is not None:
        documents = [cleaning(document) for document in corpus]
    else:
        documents = corpus
    tfidf_vectorizer = TfidfVectorizer(
        max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(documents)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    nmf = NMF(
        n_components=n_topics,
        random_state=1,
        alpha=.1,
        l1_ratio=.5,
        init='nndsvd').fit(tfidf)
    return TOPIC(tfidf_features, nmf)
示例7
def nmf_to_onnx(W, H, op_version=12):
    """
    Convert an NMF given by matrices *W*, *H* (*WH* approximates the
    training data *M*) into an ONNX graph taking two indices *(i, j)*
    and returning the prediction for them. The indices are assumed to
    refer to the training data.
    """
    # Sample value used to declare both graph inputs as int64 tensors.
    index_sample = np.array([0], dtype=np.int64)
    column_picker = OnnxArrayFeatureExtractor(H, 'col')
    row_picker = OnnxArrayFeatureExtractor(W.T, 'row')
    product = OnnxMul(column_picker, row_picker, op_version=op_version)
    reduced = OnnxReduceSum(product, output_names="rec", op_version=op_version)
    return reduced.to_onnx(
        inputs={'col': index_sample, 'row': index_sample},
        outputs=[('rec', FloatTensorType((None, 1)))],
        target_opset=op_version)
示例8
def get_nmf_decomposition(
    X: np.ndarray,
    n_roles: int,
) -> FactorTuple:
    """
    Compute NMF decomposition

    :param X: matrix to factor
    :param n_roles: rank of decomposition
    :return: tuple ``(G, F)`` where ``G`` is the output of
        ``fit_transform`` and ``F`` is the fitted ``components_``
    """
    from sklearn.exceptions import ConvergenceWarning

    nmf = NMF(n_components=n_roles, solver='mu', init='nndsvda')
    with warnings.catch_warnings():
        # ignore convergence warning from NMF since
        # this will result in a large cost anyways; every other warning
        # category still reaches the caller
        warnings.simplefilter('ignore', category=ConvergenceWarning)
        G = nmf.fit_transform(X)
    F = nmf.components_
    return G, F
示例9
def test_objectmapper(self):
    """Each accessor attribute must be the very class exported by ``decomposition``."""
    df = pdml.ModelFrame([])
    exposed_names = (
        'PCA',
        'IncrementalPCA',
        'KernelPCA',
        'FactorAnalysis',
        'FastICA',
        'TruncatedSVD',
        'NMF',
        'SparsePCA',
        'MiniBatchSparsePCA',
        'SparseCoder',
        'DictionaryLearning',
        'MiniBatchDictionaryLearning',
        'LatentDirichletAllocation',
    )
    for attribute in exposed_names:
        self.assertIs(getattr(df.decomposition, attribute),
                      getattr(decomposition, attribute))
示例10
def factorize_string_matrix(self):
    """
    Create string labels by factorization.

    A binary node-by-feature sparse matrix is built from
    ``self.binned_features``, factorized with NMF, and the resulting
    factors are clustered with KMeans.

    :return features: Dict mapping ``str(node)`` to its cluster label as a string.
    """
    rows = []
    columns = []
    for node, node_features in self.binned_features.items():
        for feature in node_features:
            rows.append(node)
            columns.append(int(feature))
    # One unit entry per (node, feature) pair.
    scores = [1] * len(columns)
    matrix_shape = (max(rows) + 1, max(columns) + 1)
    indicator = csr_matrix((scores, (rows, columns)), shape=matrix_shape)
    model = NMF(n_components=self.args.factors,
                init="random",
                random_state=self.args.seed,
                alpha=self.args.beta)
    factors = model.fit_transform(indicator)
    clustering = KMeans(n_clusters=self.args.clusters, random_state=self.args.seed)
    labels = clustering.fit(factors).labels_
    return {str(node): str(labels[node]) for node in self.graph.nodes()}
示例11
def apply( self, X, k = 2 ):
    """
    Apply NMF to the specified document-term matrix X.

    Runs nimfa's Lsnmf with rank ``k`` and stores the basis in ``self.W``,
    the coefficients in ``self.H`` and the iteration count reported by
    the fit result in ``self.n_iter``.
    """
    import nimfa
    self.W = None
    self.H = None
    objective = "fro" if self.update == "euclidean" else "div"
    lsnmf = nimfa.Lsnmf(X, max_iter = self.max_iters, rank = k, seed = self.init_strategy, update = self.update, objective = objective, test_conv = self.test_conv )
    res = lsnmf()
    try:
        self.W = res.basis().todense()
        self.H = res.coef().todense()
    except AttributeError:
        # The factors have no todense() method (presumably already
        # dense - confirm against nimfa); keep them as returned. Other
        # errors are no longer swallowed.
        self.W = res.basis()
        self.H = res.coef()
    # last number of iterations
    self.n_iter = res.n_iter
示例12
def get_topics_from_model(
        self,
        pipe=None,
        num_terms_per_topic=10):
    '''
    Fit a topic-model pipeline on ``self.sentX`` and list each topic's top terms.

    Parameters
    ----------
    pipe : Pipeline, optional
        For example, `Pipeline([
            ('tfidf', TfidfTransformer(sublinear_tf=True)),
            ('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))])`
        The last transformer must populate a `components_` attribute when finished.
        When omitted, a fresh pipeline equal to the example above is built
        for this call, so no fitted state is shared between calls.
    num_terms_per_topic : int
        Maximum number of terms kept per topic.

    Returns
    -------
    dict: {'<topic index>. <top term>': [term1, ...], ...}
        Only terms with a strictly positive score are kept; topics without
        any such term are omitted and a warning is issued.
    '''
    import warnings

    if pipe is None:
        pipe = Pipeline([
            ('tfidf', TfidfTransformer(sublinear_tf=True)),
            ('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))])
    pipe.fit_transform(self.sentX)
    topic_model = {}
    for topic_idx, topic in enumerate(pipe._final_estimator.components_):
        # argsort is ascending; the reversed tail slice yields the
        # highest-scoring term indices first.
        term_list = [self.termidxstore.getval(i)
                     for i
                     in topic.argsort()[:-num_terms_per_topic - 1:-1]
                     if topic[i] > 0]
        if len(term_list) > 0:
            topic_model['%s. %s' % (topic_idx, term_list[0])] = term_list
        else:
            # Emit the warning; merely constructing a Warning object has
            # no visible effect.
            warnings.warn("Topic %s has no terms with scores > 0. Omitting." % (topic_idx))
    return topic_model
示例13
def skNMF(data, dim):
    """Fit an NMF with ``dim`` components on ``data`` and return ``transform(data)``."""
    # fit() returns the estimator itself, so the calls can be chained.
    return NMF(n_components=dim).fit(data).transform(data)
# Max-min norm
示例14
def _sklearn_pretrain(self, i):
    """
    Pre-train a single layer by factorizing ``self._Z`` with sklearn's NMF.

    Arg types:
        * **i** *(int)* - The layer index.

    Return types:
        * **U** - Output of ``fit_transform`` on ``self._Z``.
        * **V** - The fitted ``components_``.
    """
    layer_width = self.layers[i]
    pretrainer = NMF(n_components=layer_width,
                     init="random",
                     random_state=self.seed,
                     max_iter=self.pre_iterations)
    # The tuple is evaluated left to right, so the fit happens before
    # components_ is read.
    return pretrainer.fit_transform(self._Z), pretrainer.components_
示例15
def _pre_training(self):
"""
Pre-training each NMF layer.
"""
self._U_s = []
self._V_s = []
for i in range(self._p):
self._setup_z(i)
U, V = self._sklearn_pretrain(i)
self._U_s.append(U)
self._V_s.append(V)
示例16
def _setup_base_model(self):
"""
Fitting NMF on the starting matrix.
"""
self._shape = self._residuals.shape
indices = self._residuals.nonzero()
self._index_1 = indices[0]
self._index_2 = indices[1]
base_score, embedding = self._fit_and_score_NMF(self._residuals)
self._embeddings = [embedding]
示例17
def test_check_estimator_clones():
    """check_estimator must not modify the estimator it receives."""
    from sklearn.datasets import load_iris
    iris = load_iris()
    estimator_classes = [GaussianMixture, LinearRegression,
                         RandomForestClassifier, NMF, SGDClassifier,
                         MiniBatchKMeans]
    silenced = (FutureWarning, DeprecationWarning)
    for Estimator in estimator_classes:
        # First pass checks an unfitted instance, second pass a fitted one.
        for fit_first in (False, True):
            with ignore_warnings(category=silenced):
                # when 'est = SGDClassifier()'
                est = Estimator()
                set_checking_parameters(est)
                set_random_state(est)
                if fit_first:
                    est.fit(iris.data + 10, iris.target)
                # The hash taken before the check must still match afterwards.
                old_hash = _joblib.hash(est)
                check_estimator(est)
            assert_equal(old_hash, _joblib.hash(est))
示例18
def fit(self):
    """Train the model with the algorithm named by ``self.algorithm``.

    "PCA" and "NMF" create a new ``self.model``; any other value leaves
    ``self.model`` as it is. The model is fitted on ``self.X_train_sc``
    when ``self.scaling`` equals ``True`` and on ``self.X_train``
    otherwise.

    Do not forget to tune the hyperparameters.
    """
    algorithm = self.algorithm
    if algorithm == "PCA":
        self.model = PCA(n_components=self.nb_compo)
    elif algorithm == "NMF":
        self.model = NMF(n_components=self.nb_compo, init="nndsvd")
    # The equality test (rather than truthiness) is kept on purpose: only
    # values equal to True select the scaled data.
    training_data = self.X_train_sc if self.scaling == True else self.X_train
    self.model.fit(training_data)
示例19
def transformer_factory(self) -> TransformerMixin:
    """Create an NMF transformer with ``self.width`` components and a fixed random state."""
    nmf_transformer = NMF(n_components=self.width, random_state=71)
    return nmf_transformer
示例20
def sklearn_pretrain(self, i):
    """
    Pre-train a single layer by factorizing ``self.Z`` with sklearn's NMF.

    :param i: Layer index.
    :return U: Output of ``fit_transform`` on ``self.Z``.
    :return V: The fitted ``components_``.
    """
    settings = self.args
    pretrainer = NMF(n_components=settings.layers[i],
                     init="random",
                     random_state=settings.seed,
                     max_iter=settings.pre_iterations)
    # The tuple is evaluated left to right, so the fit happens before
    # components_ is read.
    return pretrainer.fit_transform(self.Z), pretrainer.components_
示例21
def pre_training(self):
    """
    Pre-train every NMF layer in order with a progress bar, collecting
    the factors in ``self.U_s`` and ``self.V_s``.
    """
    print("\nLayer pre-training started. \n")
    self.U_s, self.V_s = [], []
    layer_progress = tqdm(range(self.p), desc="Layers trained: ", leave=True)
    for layer in layer_progress:
        # The lists are extended after every layer (not at the end), so
        # the setup step always sees the factors of the earlier layers.
        self.setup_z(layer)
        left_factor, right_factor = self.sklearn_pretrain(layer)
        self.U_s.append(left_factor)
        self.V_s.append(right_factor)
示例22
def apply( self, X, k = 2 ):
    """
    Apply NMF with ``k`` components to the specified document-term matrix X.

    The factors are stored in ``self.W`` (output of ``fit_transform``) and
    ``self.H`` (fitted ``components_``).
    """
    # Clear any earlier factors first, so a failing fit leaves None
    # rather than stale results.
    self.W = self.H = None
    nmf_model = decomposition.NMF(
        init=self.init_strategy,
        n_components=k,
        max_iter=self.max_iters,
        random_state=self.random_seed,
    )
    self.W = nmf_model.fit_transform(X)
    self.H = nmf_model.components_
示例23
def rank_terms( self, topic_index, top = -1 ):
    """
    Return term indices for the given topic from the last NMF run, ordered
    by descending weight in ``self.H``.

    ``top`` limits the number of indices returned; values below 1 or above
    the number of terms return the full ranking.

    Raises ValueError when no factor matrix ``self.H`` is available.
    """
    if self.H is None:
        raise ValueError("No results for previous run available")
    # argsort is ascending, so reverse it to get the largest weights first
    ordering = np.argsort(self.H[topic_index, :])[::-1]
    wants_everything = top < 1 or top > len(ordering)
    return ordering if wants_everything else ordering[0:top]
示例24
def generate_doc_rankings( W ):
    '''
    Rank document indices for every topic, based on the values in a W
    factor matrix produced by NMF.

    Returns a list with one entry per column of W, each holding the
    argsort of that column in reversed (descending) order.
    '''
    topic_count = W.shape[1]
    return [
        np.argsort(np.array(W[:, topic_index]))[::-1]
        for topic_index in range(topic_count)
    ]
示例25
def save_nmf_results( out_path, doc_ids, terms, term_rankings, partition, W, H, topic_labels=None ):
    """
    Save the output of NMF to ``out_path`` using Joblib, as a single
    7-tuple ``(doc_ids, terms, term_rankings, partition, W, H, topic_labels)``.

    When ``topic_labels`` is None, labels "C01", "C02", ... are generated,
    one per entry of ``term_rankings``.
    """
    if topic_labels is None:
        # no labels supplied: number the topics starting from 1
        topic_labels = ["C%02d" % position
                        for position in range(1, len(term_rankings) + 1)]
    log.info( "Saving NMF results to %s" % out_path )
    payload = (doc_ids, terms, term_rankings, partition, W, H, topic_labels)
    joblib.dump(payload, out_path)
示例26
def load_nmf_results( in_path ):
    """
    Load NMF results from ``in_path`` using Joblib.

    Returns the 7-tuple ``(doc_ids, terms, term_rankings, partition, W, H, labels)``.
    """
    stored = joblib.load( in_path )
    # Unpacking enforces that the stored object has exactly seven items.
    doc_ids, terms, term_rankings, partition, W, H, labels = stored
    return (doc_ids, terms, term_rankings, partition, W, H, labels)
示例27
def build_model(self, baskets, use_probabilities=False):
    """Build the matrix V from ``baskets`` and factorize it with NMF.

    Stores the factors in ``self.W`` and ``self.H``, their product in
    ``self.R``, marks the model state as 'built' and returns ``self``.
    """
    self.__buildV(baskets, use_probabilities)
    factorizer = SKNMF(
        n_components=self.n_factor,
        init='random',
        solver='cd',
        tol=self.tol,
        max_iter=self.max_iter,
        alpha=self.alpha,
        l1_ratio=self.l1_ratio,
        beta=self.beta,
    )
    self.W = factorizer.fit_transform(self.V)
    self.H = factorizer.components_
    # Dense reconstruction W x H.
    self.R = np.dot(self.W, self.H)
    self.__state = 'built'
    return self
示例28
def __init__(self, n_components=None, init=None, solver='cd', beta_loss='frobenius', tol=0.0001, max_iter=200, random_state=None, alpha=0.0, l1_ratio=0.0, verbose=0, shuffle=False):
    """Record the hyperparameters and build the wrapped model from them.

    All arguments are stored unchanged in ``self._hyperparams`` and passed
    as keyword arguments to ``SKLModel``.
    """
    self._hyperparams = dict(
        n_components=n_components,
        init=init,
        solver=solver,
        beta_loss=beta_loss,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
        alpha=alpha,
        l1_ratio=l1_ratio,
        verbose=verbose,
        shuffle=shuffle,
    )
    self._wrapped_model = SKLModel(**self._hyperparams)
示例29
def __init__(self, options):
    """Configure the NMF estimator from ``options``.

    The entries of ``options['params']`` (empty when absent) are converted
    by ``convert_params`` according to the type lists below, with ``k``
    accepted as an alias for ``n_components``, and passed to ``_NMF``.
    """
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        floats=['beta_loss','tol','alpha','l1_ratio'],
        strs=['init','solver'],
        ints=['k','max_iter','random_state'],
        # 'verbose' was misspelled here, so the option was never
        # recognised as a boolean parameter.
        bools=['verbose','shuffle'],
        aliases={'k': 'n_components'}
    )
    self.estimator = _NMF(**out_params)
示例30
def build_topic_model(self):
    """Fit an NMF topic model on ``self.document_term_matrix``.

    Stores the model in ``self.topic_model``, the output of
    ``fit_transform`` in ``self.document_topic_matrix`` and the fitted
    ``components_`` in ``self.topic_term_matrix``. A timestamped message
    is printed before and after the fit.
    """
    stamp_format = "%Y/%m/%d-%H:%M:%S"
    print('%s: building NMF topic model' % time.strftime(stamp_format))
    nmf_model = decomposition.NMF(
        init="nndsvd",
        n_components=self.num_topics,
        max_iter=Constants.TOPIC_MODEL_ITERATIONS,
    )
    self.topic_model = nmf_model
    self.document_topic_matrix = nmf_model.fit_transform(self.document_term_matrix)
    self.topic_term_matrix = nmf_model.components_
    print('%s: topic model built' % time.strftime(stamp_format))