From 793ddcc632f0a6ab0ac09301a73db199f27e136f Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Tue, 30 Apr 2024 23:20:18 -0500
Subject: [PATCH] Implement optimal LDA model from grid search

---
 text_analysis/topicModel.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 1583e32..5eba149 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -80,6 +80,18 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here 
+def best_lda_model(data_vectorized):
+    """Fit a 5-topic LDA on data_vectorized and print the 10 highest-weighted words of each topic."""
+    # Scores noted by the author: Best Log Likelihood Score -502085.9749390023, Model Perplexity 1689.0943431883845
+    model = LatentDirichletAllocation(n_components=5, learning_decay=0.9, batch_size=128, max_iter=50)
+    model.fit_transform(data_vectorized)  # returned value is not used; the fitted components_ are read below
+    top_terms = {  # vocab comes from outside this function; presumably an index -> word lookup -- TODO confirm
+        k: [vocab[j] for j in np.argsort(weights)[::-1][:10]]
+        for k, weights in enumerate(model.components_)
+    }
+    for k, terms in top_terms.items():
+        print('Topic: %d' % k)
+        print('  %s' % ', '.join(terms))
 
 #TODO: evaluate model and identified topics 
 
@@ -101,6 +113,7 @@ if __name__ == "__main__":
                              token_pattern='[a-zA-Z0-9]{2,}',  
                             )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)                        
-    lda_model_identification(data_vectorized)
+    #lda_model_identification(data_vectorized)
+    best_lda_model(data_vectorized)