From 9f6f7e9423c0e99967ae243a319db1a2a2023c91 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 30 Apr 2024 16:30:06 -0500 Subject: [PATCH] prepping gridsearch --- text_analysis/topicModel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py index 43ab457..1583e32 100644 --- a/text_analysis/topicModel.py +++ b/text_analysis/topicModel.py @@ -49,7 +49,7 @@ def preprocess(corpus_list): #mvp right now, can certainly be expanded as iterations of text analysis are done D = [[token for token in simple_preprocess(doc) if token not in stopwords]for doc in D] lemmatizer = WordNetLemmatizer() - D_lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in D] + D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D] return D_lemma #preparing processed data for model usage @@ -72,7 +72,7 @@ def text_preparation(lemmatized_text): def lda_model_identification(data_vectorized): lda = LatentDirichletAllocation() search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size':[128, 256]} - model = GridSearchCV(lda, param_grid=search_params) + model = GridSearchCV(lda, param_grid=search_params, verbose=10) model.fit(data_vectorized) best_lda_model = model.best_estimator_ print("Best Model's Params: ", model.best_params_)