Implement optimal LDA from grid search
This commit is contained in:
parent 942c8113d6
commit 793ddcc632
@@ -80,6 +80,18 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
+def best_lda_model(data_vectorized, vocab):  # vocab: feature names from the fitted vectorizer
+    #Best Log Likelihood Score: -502085.9749390023
+    #Model Perplexity: 1689.0943431883845
+    lda = LatentDirichletAllocation(n_components=5, learning_decay=0.9, batch_size=128, max_iter=50)  # hyperparameters selected by the grid search
+    id_topic = lda.fit_transform(data_vectorized)  # document-topic distribution
+    topic_words = {}
+    for topic, comp in enumerate(lda.components_):
+        word_idx = np.argsort(comp)[::-1][:10]  # indices of the ten highest-weighted words
+        topic_words[topic] = [vocab[i] for i in word_idx]
+    for topic, words in topic_words.items():
+        print('Topic: %d' % topic)
+        print(' %s' % ', '.join(words))
 
 #TODO: evaluate model and identified topics
 
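The grid search behind lda_model_identification is not shown in this diff; the sketch below is one way it could look with scikit-learn's GridSearchCV. The function name lda_grid_search_sketch and the parameter grid are assumptions for illustration, not taken from the commit; only the printed metrics mirror the comments quoted above.

# Hypothetical sketch of the grid search assumed behind lda_model_identification.
# Parameter grid values are illustrative, not taken from this commit.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

def lda_grid_search_sketch(data_vectorized):
    search_params = {
        'n_components': [5, 10, 15, 20],
        'learning_decay': [0.5, 0.7, 0.9],
    }
    lda = LatentDirichletAllocation(batch_size=128, max_iter=50)
    model = GridSearchCV(lda, param_grid=search_params)
    model.fit(data_vectorized)
    best = model.best_estimator_
    print("Best Params: ", model.best_params_)
    print("Best Log Likelihood Score: ", model.best_score_)  # LDA's score() is an approximate log likelihood
    print("Model Perplexity: ", best.perplexity(data_vectorized))
    return best

GridSearchCV scores LatentDirichletAllocation with its default score method, an approximate log likelihood, which is why the commit's comment reports a "Best Log Likelihood Score" rather than a custom metric.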
@@ -101,6 +113,7 @@ if __name__ == "__main__":
                             token_pattern='[a-zA-Z0-9]{2,}',
                             )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
-    lda_model_identification(data_vectorized)
+    #lda_model_identification(data_vectorized)
+    best_lda_model(data_vectorized, vectorizer.get_feature_names_out())  # pass the vocabulary for topic-word lookup
 
 
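For the "evaluate model and identified topics" TODO, one possible direction is to use the document-topic matrix that fit_transform already returns (id_topic above) and count how documents distribute across the five topics. The helper below is hypothetical and only illustrates that idea.

# Hypothetical helper for the evaluation TODO: summarize the dominant topic per document
# from the (n_documents, n_topics) matrix returned by lda.fit_transform(data_vectorized).
import numpy as np

def dominant_topic_counts(id_topic):
    dominant = np.argmax(id_topic, axis=1)  # index of the strongest topic for each document
    topics, counts = np.unique(dominant, return_counts=True)
    for topic, count in zip(topics, counts):
        print('Topic %d: %d documents' % (topic, count))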