Implement optimal LDA from grid search
This commit is contained in:
parent 942c8113d6
commit 793ddcc632
@@ -80,6 +80,18 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
+def best_lda_model(data_vectorized, vocab):  # vocab: feature names from the fitted vectorizer
+    #Best Log Likelihood Score: -502085.9749390023
+    #Model Perplexity: 1689.0943431883845
+    lda = LatentDirichletAllocation(n_components=5, learning_decay=0.9, batch_size=128, max_iter=50)  # hyperparameters selected by the grid search
+    id_topic = lda.fit_transform(data_vectorized)  # document-topic distribution
+    topic_words = {}
+    for topic, comp in enumerate(lda.components_):
+        word_idx = np.argsort(comp)[::-1][:10]  # indices of the ten highest-weighted words
+        topic_words[topic] = [vocab[i] for i in word_idx]
+    for topic, words in topic_words.items():
+        print('Topic: %d' % topic)
+        print(' %s' % ', '.join(words))
 
 #TODO: evaluate model and identified topics
 
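The grid search behind lda_model_identification is not shown in this diff; the sketch below is one way it could look with scikit-learn's GridSearchCV. The function name lda_grid_search_sketch and the parameter grid are assumptions for illustration, not taken from the commit; only the printed metrics mirror the comments quoted above.

# Hypothetical sketch of the grid search assumed behind lda_model_identification.
# Parameter grid values are illustrative, not taken from this commit.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

def lda_grid_search_sketch(data_vectorized):
    search_params = {
        'n_components': [5, 10, 15, 20],
        'learning_decay': [0.5, 0.7, 0.9],
    }
    lda = LatentDirichletAllocation(batch_size=128, max_iter=50)
    model = GridSearchCV(lda, param_grid=search_params)
    model.fit(data_vectorized)
    best = model.best_estimator_
    print("Best Params: ", model.best_params_)
    print("Best Log Likelihood Score: ", model.best_score_)  # LDA's score() is an approximate log likelihood
    print("Model Perplexity: ", best.perplexity(data_vectorized))
    return best

GridSearchCV scores LatentDirichletAllocation with its default score method, an approximate log likelihood, which is why the commit's comment reports a "Best Log Likelihood Score" rather than a custom metric.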
@@ -101,6 +113,7 @@ if __name__ == "__main__":
                             token_pattern='[a-zA-Z0-9]{2,}',
                             )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
-    lda_model_identification(data_vectorized)
+    #lda_model_identification(data_vectorized)
+    best_lda_model(data_vectorized, vectorizer.get_feature_names_out())  # pass the vocabulary for topic-word lookup
 
 
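For the "evaluate model and identified topics" TODO, one possible direction is to use the document-topic matrix that fit_transform already returns (id_topic above) and count how documents distribute across the five topics. The helper below is hypothetical and only illustrates that idea.

# Hypothetical helper for the evaluation TODO: summarize the dominant topic per document
# from the (n_documents, n_topics) matrix returned by lda.fit_transform(data_vectorized).
import numpy as np

def dominant_topic_counts(id_topic):
    dominant = np.argmax(id_topic, axis=1)  # index of the strongest topic for each document
    topics, counts = np.unique(dominant, return_counts=True)
    for topic, count in zip(topics, counts):
        print('Topic %d: %d documents' % (topic, count))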