Python source code examples: sklearn.decomposition.LatentDirichletAllocation()
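All snippets below exercise this class. As orientation, here is a minimal, self-contained sketch of the typical fit-and-inspect workflow; the toy corpus and variable names are illustrative and not taken from any example on this page:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# A tiny illustrative corpus.
docs = ["apple banana fruit", "banana fruit smoothie",
        "python code editor", "code editor plugin"]

# LDA expects raw token counts, not tf-idf weights.
tf_vectorizer = CountVectorizer()
tf = tf_vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, random_state=0)
doc_topic = lda.fit_transform(tf)  # shape: (n_docs, n_components)

# Print the top words of each topic.
feature_names = tf_vectorizer.get_feature_names_out()
for k, component in enumerate(lda.components_):
    top = component.argsort()[-3:][::-1]
    print("topic %d:" % k, [feature_names[i] for i in top])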
Example 1
def test_lda_perplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = rng.randint(4, size=(n_samples, 10))  # use the seeded RNG so the test is reproducible
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)
    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components)
Example 2
def test_lda_perplexity(method):
    # Test LDA perplexity (parametrized over learning methods):
    # a model trained for more iterations should reach a lower perplexity
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)
    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert_greater_equal(perp_1, perp_2)
    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
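Many of these tests call a `_build_sparse_mtx` helper that is not shown on this page. A plausible reconstruction, inferred from how the tests use it (3 topics, 9 features, expected top-word groups (0, 1, 2)/(3, 4, 5)/(6, 7, 8)) — a sketch, not necessarily the exact helper:

import numpy as np
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix

def _build_sparse_mtx():
    # Three topics, each owning three words that never co-occur with the
    # other topics' words: a block-diagonal document-term count matrix.
    n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
    X = csr_matrix(block_diag(block, block, block))
    return n_components, X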
Example 3
def test_lda_score(method):
    # Test LDA score (parametrized over learning methods):
    # a model trained for more iterations should reach a higher score
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)
    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert_greater_equal(score_2, score_1)
Example 4
def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)
    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_
    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)
    assert_almost_equal(perplexity1, perplexity2)
Example 5
def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out
    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity)
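This helper is presumably driven by small wrapper tests; the expected line and perplexity counts below are illustrative guesses based on max_iter=3 in the helper, not values from the original suite:

def test_verbosity_silent():
    # verbose=0 should print nothing at all
    check_verbosity(verbose=0, evaluate_every=1,
                    expected_lines=0, expected_perplexities=0)

def test_verbosity_with_perplexity():
    # verbose=1 with evaluate_every=1 should report perplexity
    # once per iteration (3 iterations, 3 progress lines)
    check_verbosity(verbose=1, evaluate_every=1,
                    expected_lines=3, expected_perplexities=3)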
Example 6
def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    #stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)
    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    # for a small corpus used only for exploration, 'batch' is the
                                    # easier choice: it leaves far fewer parameters to tune
                                    learning_method='batch',
                                    )
    # train the model with variational Bayes
    lda.fit(tf)
    # feature names, used later to list each topic's top keywords
    tf_feature_names = tf_vectorizer.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
    return lda, tf, tf_feature_names, tf_vectorizer
    # the topics can then be rendered with a visualization tool (see below)
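The trailing comment points at visualizing the topics; one common way to do that from a fitted sklearn LDA model is pyLDAvis. A hypothetical usage sketch — the module name changed across pyLDAvis releases, so check your installed version:

import pyLDAvis
import pyLDAvis.lda_model  # was pyLDAvis.sklearn in releases before 3.4

lda, tf, tf_feature_names, tf_vectorizer = word2vec(word_list)
panel = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(panel, 'lda_topics.html')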
Example 7
def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_topics = n_topics
    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        # `n_topics` was removed from sklearn; the parameter is `n_components`
        self.estimator = LatentDirichletAllocation(n_components=self.n_topics)
    self.model = Pipeline([
        ('norm', TextNormalizer()),
        # despite the step name, this is a raw count vectorizer (LDA expects counts)
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
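This snippet assumes two helpers defined elsewhere in its source project, `identity` and `TextNormalizer`. Minimal stand-ins so the class can run, illustrative only:

from sklearn.base import BaseEstimator, TransformerMixin

def identity(tokens):
    # pass pre-tokenized documents through unchanged
    return tokens

class TextNormalizer(BaseEstimator, TransformerMixin):
    # stand-in normalizer: lowercase and whitespace-tokenize each document
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [doc.lower().split() for doc in X]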
Example 8
def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(
        max_df=max_df,
        min_df=min_df,
        stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names_out()  # get_feature_names() in sklearn < 1.0
    lda = LatentDirichletAllocation(
        n_components=n_topics,  # `n_topics` was renamed to `n_components` in sklearn
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
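`TOPIC` and `clearstring` come from the snippet's source module and are not shown here. A minimal guess at their shape, plus a typical way to read the result (`list_of_documents` is a placeholder corpus):

import re
from collections import namedtuple

TOPIC = namedtuple('TOPIC', ['features', 'model'])

def clearstring(text):
    # keep letters and spaces only, lowercase the rest
    return re.sub(r'[^a-zA-Z ]', '', text).lower()

result = train_lda(list_of_documents, n_topics=10)
for k, component in enumerate(result.model.components_):
    top = component.argsort()[-8:][::-1]
    print('topic %d:' % k, [result.features[i] for i in top])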
Example 9
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.decomposition.PCA, decomposition.PCA)
    self.assertIs(df.decomposition.IncrementalPCA,
                  decomposition.IncrementalPCA)
    self.assertIs(df.decomposition.KernelPCA, decomposition.KernelPCA)
    self.assertIs(df.decomposition.FactorAnalysis,
                  decomposition.FactorAnalysis)
    self.assertIs(df.decomposition.FastICA, decomposition.FastICA)
    self.assertIs(df.decomposition.TruncatedSVD, decomposition.TruncatedSVD)
    self.assertIs(df.decomposition.NMF, decomposition.NMF)
    self.assertIs(df.decomposition.SparsePCA, decomposition.SparsePCA)
    self.assertIs(df.decomposition.MiniBatchSparsePCA,
                  decomposition.MiniBatchSparsePCA)
    self.assertIs(df.decomposition.SparseCoder, decomposition.SparseCoder)
    self.assertIs(df.decomposition.DictionaryLearning,
                  decomposition.DictionaryLearning)
    self.assertIs(df.decomposition.MiniBatchDictionaryLearning,
                  decomposition.MiniBatchDictionaryLearning)
    self.assertIs(df.decomposition.LatentDirichletAllocation,
                  decomposition.LatentDirichletAllocation)
Example 10
def test_lda_perplexity():
    # Test LDA perplexity for both learning methods:
    # a model trained for more iterations should reach a lower perplexity
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)
        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)
        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example 11
def test_lda_score():
    # Test LDA score for both learning methods:
    # a model trained for more iterations should reach a higher score
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)
        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
Example 12
def test_evaluation_sklearn_all_metrics():
    passed_params = {'n_components', 'learning_method', 'evaluate_every', 'max_iter', 'n_jobs', 'random_state'}
    varying_params = [dict(n_components=k) for k in range(2, 5)]
    const_params = dict(learning_method='batch', evaluate_every=1, max_iter=3, n_jobs=1, random_state=1)
    evaluate_topic_models_kwargs = dict(
        metric=tm_sklearn.AVAILABLE_METRICS,
        held_out_documents_wallach09_n_samples=10,
        held_out_documents_wallach09_n_folds=2,
        coherence_gensim_vocab=EVALUATION_TEST_VOCAB,
        coherence_gensim_texts=EVALUATION_TEST_TOKENS,
        return_models=True,
    )
    eval_res = tm_sklearn.evaluate_topic_models(EVALUATION_TEST_DTM, varying_params, const_params,
                                                **evaluate_topic_models_kwargs)
    assert len(eval_res) == len(varying_params)
    for param_set, metric_results in eval_res:
        assert set(param_set.keys()) == passed_params
        assert set(metric_results.keys()) == set(tm_sklearn.AVAILABLE_METRICS + ('model',))
        assert metric_results['perplexity'] > 0
        assert 0 <= metric_results['cao_juan_2009'] <= 1
        assert 0 <= metric_results['arun_2010']
        assert metric_results['coherence_mimno_2011'] < 0
        assert np.isclose(metric_results['coherence_gensim_u_mass'], metric_results['coherence_mimno_2011'])
        assert 0 <= metric_results['coherence_gensim_c_v'] <= 1
        assert metric_results['coherence_gensim_c_uci'] < 0
        assert metric_results['coherence_gensim_c_npmi'] < 0
        if 'held_out_documents_wallach09' in tm_lda.AVAILABLE_METRICS:  # only if gmpy2 is installed
            assert metric_results['held_out_documents_wallach09'] < 0
        assert isinstance(metric_results['model'], LatentDirichletAllocation)
Example 13
def test_compute_models_parallel_sklearn():
    passed_params = {'n_components', 'learning_method', 'evaluate_every', 'max_iter', 'n_jobs'}
    varying_params = [dict(n_components=k) for k in range(2, 5)]
    const_params = dict(learning_method='batch', evaluate_every=1, max_iter=3, n_jobs=1)
    models = tm_sklearn.compute_models_parallel(EVALUATION_TEST_DTM, varying_params, const_params)
    assert len(models) == len(varying_params)
    for param_set, model in models:
        assert set(param_set.keys()) == passed_params
        assert isinstance(model, LatentDirichletAllocation)
        assert isinstance(model.components_, np.ndarray)
Example 14
def fitTopicModel(self, numTopics, max_iter=100, **kwargs):
    # `n_topics` was renamed to `n_components` in sklearn
    self.lda = LatentDirichletAllocation(n_components=numTopics, learning_method=self.learningMethod,
                                         random_state=self.seed, n_jobs=1, max_iter=max_iter,
                                         batch_size=self.chunksize, **kwargs)
    if self.fragM.shape[0] > self.chunksize:
        # fit the model in chunks (parameters are read at fit time)
        self.lda.learning_method = 'online'
        self.lda.fit(self.fragM)
    else:
        self.lda.fit(self.fragM)
Example 15
def learn_topics(X, X_dev, K=50):
    lda = LatentDirichletAllocation(n_components=K, learning_method='online', verbose=1)
    print("Fitting", K, "topics...")
    lda.fit(X)
    score = lda.perplexity(X_dev)
    print("Dev-set perplexity:", score)  # perplexity, not log likelihood
    topics = lda.components_
    return score, lda, topics
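A natural way to use this function is to sweep K and keep the model with the lowest held-out perplexity; X_train and X_dev below are assumed document-term count matrices (sketch only):

best = None
for K in (10, 25, 50, 100):
    perp, lda, topics = learn_topics(X_train, X_dev, K=K)
    if best is None or perp < best[0]:  # lower held-out perplexity is better
        best = (perp, K, lda)
print("best K:", best[1], "with dev-set perplexity:", best[0])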
Example 16
def test_lda_default_prior_params():
    # default prior parameter should be `1 / n_components`
    # and verbose params should not affect result
    n_components, X = _build_sparse_mtx()
    prior = 1. / n_components
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
Example 17
def test_lda_fit_batch():
    # Test LDA batch learning (`fit` method with learning_method='batch')
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    evaluate_every=1, learning_method='batch',
                                    random_state=rng)
    lda.fit(X)
    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in lda.components_:
        # Find top 3 words in each LDA component
        top_idx = set(component.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
Example 18
def test_lda_fit_online():
    # Test LDA online learning (`fit` method with 'online' learning)
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=10., evaluate_every=1,
                                    learning_method='online', random_state=rng)
    lda.fit(X)
    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in lda.components_:
        # Find top 3 words in each LDA component
        top_idx = set(component.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
Example 19
def test_lda_partial_fit():
    # Test LDA online learning (`partial_fit` method)
    # (same as test_lda_batch)
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=10., total_samples=100,
                                    random_state=rng)
    for i in range(3):
        lda.partial_fit(X)
    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
Example 20
def test_lda_dense_input():
    # Test LDA with dense input.
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_method='batch', random_state=rng)
    lda.fit(X.toarray())
    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in lda.components_:
        # Find top 3 words in each LDA component
        top_idx = set(component.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
Example 21
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform:
    # fit_transform and transform results should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(n_components=5, learning_method=method,
                                    random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)
Example 22
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = rng.randint(4, size=(10, n_col))  # use the seeded RNG so the test is reproducible
    X_2 = rng.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has",
                         lda.partial_fit, X_2)
Example 23
def test_invalid_params():
    # test `_check_params` method
    X = np.ones((5, 10))
    invalid_models = (
        ('n_components', LatentDirichletAllocation(n_components=0)),
        ('learning_method',
         LatentDirichletAllocation(learning_method='unknown')),
        ('total_samples', LatentDirichletAllocation(total_samples=0)),
        ('learning_offset', LatentDirichletAllocation(learning_offset=-1)),
    )
    for param, model in invalid_models:
        regex = r"^Invalid %r parameter" % param
        assert_raises_regexp(ValueError, regex, model.fit, X)
Example 24
def test_lda_negative_input():
    # fitting on a dense matrix that contains negative values must raise
    X = np.full((5, 10), -1.)
    lda = LatentDirichletAllocation()
    regex = r"^Negative values in data passed"
    assert_raises_regexp(ValueError, regex, lda.fit, X)
Example 25
def test_lda_no_component_error():
    # test `transform` and `perplexity` before `fit`
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = LatentDirichletAllocation()
    regex = r"^no 'components_' attribute"
    assert_raises_regexp(NotFittedError, regex, lda.transform, X)
    assert_raises_regexp(NotFittedError, regex, lda.perplexity, X)