From 9f6f7e9423c0e99967ae243a319db1a2a2023c91 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Tue, 30 Apr 2024 16:30:06 -0500
Subject: [PATCH] Prep grid search: join lemmatized tokens per doc, make GridSearchCV verbose

---
 text_analysis/topicModel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 43ab457..1583e32 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -49,7 +49,7 @@ def preprocess(corpus_list):
     #mvp right now, can certainly be expanded as iterations of text analysis are done
     D = [[token for token in simple_preprocess(doc) if token not in stopwords]for doc in D]
     lemmatizer = WordNetLemmatizer()
-    D_lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in D]
+    D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]
     return D_lemma
 
 #preparing processed data for model usage
@@ -72,7 +72,7 @@ def text_preparation(lemmatized_text):
 def lda_model_identification(data_vectorized):
     lda = LatentDirichletAllocation()
     search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size':[128, 256]}
-    model = GridSearchCV(lda, param_grid=search_params)
+    model = GridSearchCV(lda, param_grid=search_params, verbose=10)
     model.fit(data_vectorized)
     best_lda_model = model.best_estimator_
     print("Best Model's Params: ", model.best_params_)