more text cleaning

This commit is contained in:
Matthew Gaughan 2024-05-02 11:31:24 -05:00
parent 9d9bb8da1e
commit 1184069921

View File

@ -4,6 +4,7 @@ import pandas as pd
import glob import glob
import copy import copy
from statistics import mean, median from statistics import mean, median
from strip_markdown import strip_markdown
from getMetadata import metadata_for_file from getMetadata import metadata_for_file
@ -51,6 +52,10 @@ def preprocess(corpus_list):
"lineno", "python", "php", "ruby", "api"] "lineno", "python", "php", "ruby", "api"]
stopwords.extend(specific_stopwords) stopwords.extend(specific_stopwords)
D = copy.copy(corpus_list) D = copy.copy(corpus_list)
#stripping markdown from documents
D = [strip_markdown(doc) for doc in D]
#strip html comments (<!-- ... -->), including ones spanning multiple lines
D = [re.sub(r'<!--.*?-->', '', doc, flags=re.DOTALL) for doc in D]
#mvp right now, can certainly be expanded as iterations of text analysis are done #mvp right now, can certainly be expanded as iterations of text analysis are done
D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D] D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]
lemmatizer = WordNetLemmatizer() lemmatizer = WordNetLemmatizer()
@ -76,7 +81,7 @@ def text_preparation(lemmatized_text):
#TODO: identify best LDA model here #TODO: identify best LDA model here
def lda_model_identification(data_vectorized): def lda_model_identification(data_vectorized):
lda = LatentDirichletAllocation() lda = LatentDirichletAllocation()
search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size':[128, 256]} search_params = {'n_components': [3, 5, 10, 15, 20, 25, 30]}
model = GridSearchCV(lda, param_grid=search_params, verbose=10) model = GridSearchCV(lda, param_grid=search_params, verbose=10)
model.fit(data_vectorized) model.fit(data_vectorized)
best_lda_model = model.best_estimator_ best_lda_model = model.best_estimator_
@ -88,7 +93,7 @@ def lda_model_identification(data_vectorized):
def best_lda_model(data_vectorized, vocab): def best_lda_model(data_vectorized, vocab):
#Best Log Likelihood Score: -502085.9749390023 #Best Log Likelihood Score: -502085.9749390023
#Model Perplexity: 1689.0943431883845 #Model Perplexity: 1689.0943431883845
lda = LatentDirichletAllocation(n_components=5, learning_decay = 0.9, batch_size = 128, max_iter = 50) lda = LatentDirichletAllocation(n_components=3, learning_decay = 0.9, batch_size = 128, max_iter = 50)
id_topic = lda.fit_transform(data_vectorized) id_topic = lda.fit_transform(data_vectorized)
topic_words = {} topic_words = {}
for topic, comp in enumerate(lda.components_): for topic, comp in enumerate(lda.components_):