Implement the optimal LDA model found by grid search
This commit is contained in:
parent
942c8113d6
commit
793ddcc632
@ -80,6 +80,18 @@ def lda_model_identification(data_vectorized):
|
|||||||
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
|
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
|
||||||
|
|
||||||
#TODO: implement best LDA model here
|
#TODO: implement best LDA model here
|
||||||
|
def best_lda_model(data_vectorized):
    """Fit the grid-search-selected LDA model and print each topic's top words.

    The hyperparameters are hard-coded to the configuration chosen by the
    grid search. Scores recorded for that configuration:

        Best Log Likelihood Score: -502085.9749390023
        Model Perplexity: 1689.0943431883845

    Args:
        data_vectorized: Document-term matrix handed directly to
            ``LatentDirichletAllocation.fit``.

    Returns:
        dict: topic index -> list of that topic's highest-weighted words,
        ordered from highest to lowest weight. The same information is
        printed to stdout.
    """
    # NOTE(review): `vocab` is not defined in this function; it is read from
    # module scope. Presumably it holds the vectorizer's feature names, indexed
    # like the columns of `data_vectorized` -- confirm against the caller.
    # NOTE(review): no random_state is passed, so the fitted topics presumably
    # vary between runs -- confirm whether reproducibility is wanted.
    top_n = 10  # number of words reported per topic

    lda = LatentDirichletAllocation(
        n_components=5,
        learning_decay=0.9,
        batch_size=128,
        max_iter=50,
    )
    # Only the fitted components are used below, so the document-topic matrix
    # that fit_transform would additionally compute is not needed.
    lda.fit(data_vectorized)

    topic_words = {}
    for topic, comp in enumerate(lda.components_):
        # argsort is ascending: reverse it, then keep the top_n largest weights.
        word_idx = np.argsort(comp)[::-1][:top_n]
        topic_words[topic] = [vocab[i] for i in word_idx]

    for topic, words in topic_words.items():
        print('Topic: %d' % topic)
        print(' %s' % ', '.join(words))

    return topic_words
|
||||||
|
|
||||||
#TODO: evaluate model and identified topics
|
#TODO: evaluate model and identified topics
|
||||||
|
|
||||||
@ -101,6 +113,7 @@ if __name__ == "__main__":
|
|||||||
token_pattern='[a-zA-Z0-9]{2,}',
|
token_pattern='[a-zA-Z0-9]{2,}',
|
||||||
)
|
)
|
||||||
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
|
data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
|
||||||
lda_model_identification(data_vectorized)
|
#lda_model_identification(data_vectorized)
|
||||||
|
best_lda_model(data_vectorized)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user