From 793ddcc632f0a6ab0ac09301a73db199f27e136f Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Tue, 30 Apr 2024 23:20:18 -0500
Subject: [PATCH] implement optimal lda from gridsearch

---
 text_analysis/topicModel.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 1583e32..5eba149 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -80,6 +80,35 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
+def best_lda_model(data_vectorized, feature_names=None, n_top_words=10):
+    """Fit the LDA configuration chosen by grid search and print its topics.
+
+    Scores noted by the author for this configuration:
+        Best Log Likelihood Score: -502085.9749390023
+        Model Perplexity: 1689.0943431883845
+
+    Args:
+        data_vectorized: document-term matrix the model is fitted on.
+        feature_names: sequence mapping a column index of data_vectorized
+            to its term; when None, the module-level ``vocab`` is used.
+        n_top_words: how many of the heaviest words to print per topic.
+    """
+    # Resolve the term lookup before the expensive fit so a missing
+    # vocabulary fails immediately rather than after training.
+    terms = vocab if feature_names is None else feature_names
+    lda = LatentDirichletAllocation(
+        n_components=5,
+        learning_decay=0.9,
+        batch_size=128,
+        max_iter=50,
+    )
+    lda.fit(data_vectorized)
+    for topic, comp in enumerate(lda.components_):
+        # argsort is ascending, so reverse it to rank the heaviest words first.
+        word_idx = np.argsort(comp)[::-1][:n_top_words]
+        top_words = ', '.join(terms[i] for i in word_idx)
+        print(f'Topic: {topic:d}')
+        print(f' {top_words}')
 
 #TODO: evaluate model and identified topics
 
@@ -101,6 +130,7 @@ if __name__ == "__main__":
         token_pattern='[a-zA-Z0-9]{2,}',
     )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
-    lda_model_identification(data_vectorized)
+    #lda_model_identification(data_vectorized)
+    best_lda_model(data_vectorized, vectorizer.get_feature_names_out())
 
 