implement optimal lda from gridsearch

This commit is contained in:
Matthew Gaughan 2024-04-30 23:20:18 -05:00
parent 942c8113d6
commit 793ddcc632

View File

@ -80,6 +80,18 @@ def lda_model_identification(data_vectorized):
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized)) print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
#TODO: implement best LDA model here
def best_lda_model(data_vectorized):
    """Fit the selected LDA configuration and print the top words per topic.

    Args:
        data_vectorized: Document-term matrix handed to
            ``LatentDirichletAllocation.fit``.

    For every fitted topic, prints the topic index followed by the ten
    vocabulary terms with the largest weights in ``lda.components_``.
    Returns nothing.
    """
    # Scores recorded for this configuration:
    # Best Log Likelihood Score: -502085.9749390023
    # Model Perplexity: 1689.0943431883845
    lda = LatentDirichletAllocation(
        n_components=5, learning_decay=0.9, batch_size=128, max_iter=50
    )
    # Only the fitted components are read below, so fit() is enough; the
    # document-topic matrix that fit_transform() returned was never used.
    lda.fit(data_vectorized)
    # NOTE(review): `vocab` is not defined inside this function; presumably a
    # module-level index -> term lookup derived from the vectorizer -- confirm
    # it exists before this function is called.
    topic_words = {
        topic: [vocab[i] for i in np.argsort(comp)[::-1][:10]]
        for topic, comp in enumerate(lda.components_)
    }
    for topic, words in topic_words.items():
        joined = ', '.join(words)
        print(f'Topic: {topic}')
        print(f' {joined}')
#TODO: evaluate model and identified topics
@ -101,6 +113,7 @@ if __name__ == "__main__":
token_pattern='[a-zA-Z0-9]{2,}', token_pattern='[a-zA-Z0-9]{2,}',
) )
data_vectorized = vectorizer.fit_transform(lemmatized_corpus) data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
lda_model_identification(data_vectorized) #lda_model_identification(data_vectorized)
best_lda_model(data_vectorized)