From 9d9bb8da1ef8dcaea6c906fabc15b16731e31ba2 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Thu, 2 May 2024 10:55:50 -0500
Subject: [PATCH] first draft of the topic model

---
 text_analysis/topicModel.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 5eba149..a7d2027 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -26,6 +26,7 @@ from nltk.corpus import stopwords
 from nltk.stem.wordnet import WordNetLemmatizer
 
 stopwords = stopwords.words('english')
+#https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
 
 #loading data in, getting misc descriptors
 def get_data_from_dir(directory):
@@ -45,9 +46,13 @@ def get_data_from_dir(directory):
 
 #preprocessing text data
 def preprocess(corpus_list):
+    #extending stopwords
+    specific_stopwords = ["http", "com", "www", "org", "file", "code", "time", "software", "use", "user", "set", "line", "run", "source", "github",
+                          "lineno", "python", "php", "ruby", "api"]
+    stopwords.extend(specific_stopwords)
     D = copy.copy(corpus_list)
     #mvp right now, can certainly be expanded as iterations of text analysis are done
-    D = [[token for token in simple_preprocess(doc) if token not in stopwords]for doc in D]
+    D = [[token for token in simple_preprocess(doc) if token not in stopwords and len(token) > 2]for doc in D]
     lemmatizer = WordNetLemmatizer()
     D_lemma = [" ".join([lemmatizer.lemmatize(token) for token in doc]) for doc in D]
     return D_lemma
@@ -63,7 +68,7 @@ def text_preparation(lemmatized_text):
             D_bigrams[i].append(token)
     #id2word
     id2word = corpora.Dictionary(D_bigrams)
-    id2word.filter_extremes(no_below=2, no_above=0.5)
+    id2word.filter_extremes(no_below=5, no_above=0.5)
     #bow representation
     bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
     return bag_of_words, id2word
@@ -80,7 +85,7 @@ def lda_model_identification(data_vectorized):
     print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
-def best_lda_model(data_vectorized):
+def best_lda_model(data_vectorized, vocab):
     #Best Log Likelihood Score: -502085.9749390023
     #Model Perplexity: 1689.0943431883845
     lda = LatentDirichletAllocation(n_components=5, learning_decay = 0.9, batch_size = 128, max_iter = 50)
@@ -92,6 +97,7 @@ def best_lda_model(data_vectorized):
     for topic, words in topic_words.items():
         print('Topic: %d' % topic)
         print(' %s' % ', '.join(words))
+    #lda.print_topics(num_words=10)
 
 
 #TODO: evaluate model and identified topics
@@ -114,6 +120,9 @@ if __name__ == "__main__":
         )
     data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
     #lda_model_identification(data_vectorized)
-    best_lda_model(data_vectorized)
+    #freqs = zip(vectorizer.get_feature_names_out(), data_vectorized.sum(axis=0).tolist()[0])
+    # sort from largest to smallest
+    #print(sorted(freqs, key=lambda x: -x[1])[:25])
+    best_lda_model(data_vectorized, vectorizer.get_feature_names_out())
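
For reference, a minimal sketch (not part of the patch) of how the vocab argument now passed into best_lda_model() can be used to recover the top terms per topic from the fitted sklearn model; top_terms_per_topic and n_top_words are illustrative names, not identifiers from the patch:

    import numpy as np

    def top_terms_per_topic(lda, vocab, n_top_words=10):
        # map each topic index to its n_top_words highest-weighted terms
        topic_words = {}
        for topic_idx, weights in enumerate(lda.components_):
            # indices of the largest weights, in descending order
            top_idx = np.argsort(weights)[::-1][:n_top_words]
            topic_words[topic_idx] = [vocab[i] for i in top_idx]
        return topic_words

    # e.g. inside best_lda_model(), after lda.fit(data_vectorized):
    #   topic_words = top_terms_per_topic(lda, vocab)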
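
The remaining "#TODO: evaluate model and identified topics" could plausibly be addressed with gensim's c_v coherence over the printed topics. A rough sketch under two assumptions: the lemmatized corpus is re-tokenized with simple_preprocess, and every top term appears in the resulting dictionary (CoherenceModel raises an error otherwise); topic_coherence is a hypothetical helper, not something the patch defines:

    from gensim import corpora
    from gensim.models.coherencemodel import CoherenceModel
    from gensim.utils import simple_preprocess

    def topic_coherence(topic_words, lemmatized_corpus):
        # rebuild reference texts and the dictionary from the same token stream
        texts = [simple_preprocess(doc) for doc in lemmatized_corpus]
        dictionary = corpora.Dictionary(texts)
        cm = CoherenceModel(topics=list(topic_words.values()),
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        # higher c_v generally indicates more interpretable topics
        return cm.get_coherence()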