From fb1cf405912e1d3d014f18d36c46681a9da92e8a Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Tue, 30 Apr 2024 13:49:28 -0500
Subject: [PATCH] some progress on fitting LDA model

---
 text_analysis/getMetadata.py | 17 +++++++++
 text_analysis/topicModel.py  | 72 +++++++++++++++++++++++-------------
 2 files changed, 63 insertions(+), 26 deletions(-)
 create mode 100644 text_analysis/getMetadata.py

diff --git a/text_analysis/getMetadata.py b/text_analysis/getMetadata.py
new file mode 100644
index 0000000..f05ea28
--- /dev/null
+++ b/text_analysis/getMetadata.py
@@ -0,0 +1,17 @@
+import csv
+import os
+import nltk
+import pandas as pd
+from statistics import mean, median
+import json
+
+def metadata_for_file(file):
+    word_list = file.split()
+    word_count = len(word_list)
+    #print(word_list)
+    if word_count == 0:
+        avg_word_length = 0
+    else:
+        avg_word_length = sum(map(len, word_list)) / len(word_list)
+    #return number of paragraphs
+    return word_count, avg_word_length
\ No newline at end of file
diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
index 8e51cf5..43ab457 100644
--- a/text_analysis/topicModel.py
+++ b/text_analysis/topicModel.py
@@ -3,14 +3,24 @@ import numpy as np
 import pandas as pd
 import glob
 import copy
+from statistics import mean, median
+
+from getMetadata import metadata_for_file
 
 # Gensim
 import gensim
-import gensim.corpora as corpora, Dictionary
+import gensim.corpora as corpora
 from gensim.utils import simple_preprocess
-from gensim.models import CoherenceModelfrom, Phrases
+from gensim.models import CoherenceModel
+from gensim.models.phrases import Phrases
+
+from sklearn.decomposition import LatentDirichletAllocation
+from sklearn.model_selection import GridSearchCV
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
 # spacy and nltk for lemmatization
+import nltk
+#nltk.download('stopwords')
 import spacy
 from nltk.corpus import stopwords
 from nltk.stem.wordnet import WordNetLemmatizer
@@ -21,11 +31,17 @@ stopwords = stopwords.words('english')
 def get_data_from_dir(directory):
     files = glob.glob(f"{directory}/*")
     data_list = []
+    word_counts = []
+    avg_word_lengths = []
     for file in files:
         text = open(file, encoding='utf-8').read()
-        #TODO: here is where we can get data about word length and document length
+        #here's some of the descriptive text analysis
+        word_count, avg_word_length = metadata_for_file(text)
+        word_counts.append(word_count)
+        avg_word_lengths.append(avg_word_length)
+        #adding the data to the list of text
         data_list.append(text)
-    return data_list
+    return data_list, word_counts, avg_word_lengths
 
 #preprocessing text data
 def preprocess(corpus_list):
@@ -46,31 +62,22 @@ def text_preparation(lemmatized_text):
             if '_' in token:
                 D_bigrams[i].append(token)
     #id2word
-    id2word = Dictionary(D_bigrams)
+    id2word = corpora.Dictionary(D_bigrams)
     id2word.filter_extremes(no_below=2, no_above=0.5)
     #bow representation
     bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
     return bag_of_words, id2word
 
 #TODO: identify best LDA model here
-def lda_model_identification(bow, id2word, bigrams ):
-    coherence = []
-    perplexity = []
-    # For between 1 and 6 topics
-    for k in [5, 10, 15, 20]:
-        print('num of topics // k =: '+ str(k))
-        Lda = gensim.models.ldamodel.LdaModel
-        # Train a model for that number of topics
-        ldamodel = Lda(bow, num_topics=k, id2word = id2word, passes=40,\
-                       iterations=200, chunksize = 1000, eval_every = None)
-        # Get the coherence value for the trained model
-        cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=bigrams,\
-                                                         dictionary=id2word, coherence='c_v')
-        coherence.append((k,cm.get_coherence()))
-        # Append the perplexity for the trained model
-        perplexity.append((k,ldamodel.log_perplexity(bow)))
-    print(coherence)
-    print(perplexity)
+def lda_model_identification(data_vectorized):
+    lda = LatentDirichletAllocation()
+    search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'max_iter': [10, 20, 50], 'batch_size':[128, 256]}
+    model = GridSearchCV(lda, param_grid=search_params)
+    model.fit(data_vectorized)
+    best_lda_model = model.best_estimator_
+    print("Best Model's Params: ", model.best_params_)
+    print("Best Log Likelihood Score: ", model.best_score_)
+    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
 
 #TODO: implement best LDA model here
 
@@ -78,9 +85,22 @@ def lda_model_identification(bow, id2word, bigrams ):
 
 
 if __name__ == "__main__":
-    document_directory = "TKTKTKKT"
-    listed_corpus = get_data_from_dir(document_directory)
+    readme_directory = "/data/users/mgaughan/kkex/time_specific_files/readme2"
+    contributing_directory = "/data/users/mgaughan/kkex/time_specific_files/contributing2"
+    listed_corpus, wordcounts, wordlengths = get_data_from_dir(readme_directory)
+    print("Mean wordcount: ", mean(wordcounts))
+    print("Median wordcount: ", median(wordcounts))
+    print("Mean wordlength: ", mean(wordlengths))
+    print("Median wordlength: ", median(wordlengths))
     lemmatized_corpus = preprocess(listed_corpus)
-    prepped_corpus, id2word = text_preparation(lemmatized_corpus)
+    #prepped_corpus, id2word = text_preparation(lemmatized_corpus)
+    vectorizer = CountVectorizer(analyzer='word',
+                                 min_df=2,
+                                 stop_words='english',
+                                 lowercase=True,
+                                 token_pattern='[a-zA-Z0-9]{2,}',
+                                 )
+    data_vectorized = vectorizer.fit_transform(lemmatized_corpus)
+    lda_model_identification(data_vectorized)
 
 
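
Editor's note, not part of the commit above: a rough sketch of what the remaining "#TODO: implement best LDA model here" step might look like, assuming the GridSearchCV-selected estimator and the fitted CountVectorizer from __main__ are available. best_lda_model, data_vectorized, and vectorizer mirror names used in the patch; print_top_terms, doc_topic_matrix, and n_top_words are illustrative, and get_feature_names_out() assumes scikit-learn >= 1.0.

import numpy as np

def print_top_terms(best_lda_model, vectorizer, n_top_words=10):
    # components_ holds the topic-word weights learned by sklearn's LDA;
    # sorting each row lists the highest-weighted vocabulary terms per topic.
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic_weights in enumerate(best_lda_model.components_):
        top_idx = np.argsort(topic_weights)[::-1][:n_top_words]
        print(f"Topic {topic_idx}: " + " ".join(feature_names[i] for i in top_idx))

def doc_topic_matrix(best_lda_model, data_vectorized):
    # transform() returns a documents-by-topics matrix of topic proportions,
    # which could later be joined back to the per-file word-count metadata.
    return best_lda_model.transform(data_vectorized)

For this to plug in, lda_model_identification would need to return best_lda_model (as written in the patch it only prints the grid-search results), after which something like print_top_terms(best_lda_model, vectorizer) could be called at the end of __main__.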