diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 43ab457..1583e32 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -49,7 +49,7 @@ def preprocess(corpus_list):
     #mvp right now, can certainly be expanded as iterations of text analysis are done
     D = [[token for token in simple_preprocess(doc) if token not in stopwords] for doc in D]
     lemmatizer = WordNetLemmatizer()
-    D_lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in D]
+    D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]
     return D_lemma
 
 #preparing processed data for model usage
@@ -72,7 +72,7 @@ def text_preparation(lemmatized_text):
 def lda_model_identification(data_vectorized):
     lda = LatentDirichletAllocation()
     search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size': [128, 256]}
-    model = GridSearchCV(lda, param_grid=search_params)
+    model = GridSearchCV(lda, param_grid=search_params, verbose=10)
     model.fit(data_vectorized)
     best_lda_model = model.best_estimator_
     print("Best Model's Params: ", model.best_params_)
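
For context, a minimal sketch (not from the repository) of why these two changes matter: scikit-learn vectorizers such as `CountVectorizer` expect each document as a single string rather than a list of tokens, so `preprocess()` must join the lemmatized tokens before the corpus reaches vectorization; and `verbose=10` makes `GridSearchCV` log every fit, which matters because the diff's grid (6 × 3 × 3 × 2 = 108 candidates) times the default 5-fold cross-validation means 540 LDA fits. The sample documents and the tiny grid below are hypothetical stand-ins chosen so the sketch runs quickly.

```python
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

# Hypothetical mini-corpus standing in for the real preprocessed documents.
docs_as_token_lists = [["topic", "model", "test"], ["grid", "search", "run"]]

# Before the fix, preprocess() returned token lists; CountVectorizer raises
# AttributeError on those because it calls .lower() on each document string.
docs_as_strings = [" ".join(tokens) for tokens in docs_as_token_lists]

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(docs_as_strings)

# verbose=10 prints per-fit progress, as in the diff. A deliberately tiny
# grid and cv=2 keep this sketch fast; the real search is far larger.
search = GridSearchCV(
    LatentDirichletAllocation(),
    param_grid={"n_components": [2, 3]},
    cv=2,
    verbose=10,
)
search.fit(data_vectorized)
print(search.best_params_)
```

`GridSearchCV` can score `LatentDirichletAllocation` without an explicit `scoring` argument because the estimator exposes a `score` method (approximate log-likelihood), which the search uses by default.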