From a9770091ceb677802c15c16b7ad99dd17de50645 Mon Sep 17 00:00:00 2001
From: mjgaughan
Date: Wed, 24 Apr 2024 23:57:49 -0500
Subject: [PATCH] draft of lda topic modeling

---
 text_analysis/topicModel.py | 86 +++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 text_analysis/topicModel.py

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
new file mode 100644
index 0000000..8e51cf5
--- /dev/null
+++ b/text_analysis/topicModel.py
@@ -0,0 +1,86 @@
+import re
+import numpy as np
+import pandas as pd
+import glob
+import copy
+
+# Gensim
+import gensim
+import gensim.corpora as corpora
+from gensim.corpora import Dictionary
+from gensim.utils import simple_preprocess
+from gensim.models import CoherenceModel, Phrases
+
+# spacy and nltk for lemmatization
+import spacy
+from nltk.corpus import stopwords
+from nltk.stem.wordnet import WordNetLemmatizer
+
+stopwords = stopwords.words('english')
+
+#loading data in, getting misc descriptors
+def get_data_from_dir(directory):
+    files = glob.glob(f"{directory}/*")
+    data_list = []
+    for file in files:
+        with open(file, encoding='utf-8') as f:
+            text = f.read()
+        #TODO: here is where we can get data about word length and document length
+        data_list.append(text)
+    return data_list
+
+#preprocessing text data
+def preprocess(corpus_list):
+    D = copy.copy(corpus_list)
+    #mvp right now, can certainly be expanded as iterations of text analysis are done
+    D = [[token for token in simple_preprocess(doc) if token not in stopwords] for doc in D]
+    lemmatizer = WordNetLemmatizer()
+    D_lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in D]
+    return D_lemma
+
+#preparing processed data for model usage
+def text_preparation(lemmatized_text):
+    #bigrams
+    D_bigrams = copy.copy(lemmatized_text)
+    bigram = Phrases(D_bigrams, min_count=2)
+    for i in range(len(lemmatized_text)):
+        for token in bigram[D_bigrams[i]]:
+            if '_' in token:
+                D_bigrams[i].append(token)
+    #id2word
+    id2word = Dictionary(D_bigrams)
+    id2word.filter_extremes(no_below=2, no_above=0.5)
+    #bow representation
+    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
+    return bag_of_words, id2word
+
+#TODO: identify best LDA model here
+def lda_model_identification(bow, id2word, bigrams):
+    coherence = []
+    perplexity = []
+    # Sweep over candidate numbers of topics
+    for k in [5, 10, 15, 20]:
+        print('num of topics // k = ' + str(k))
+        Lda = gensim.models.ldamodel.LdaModel
+        # Train a model for that number of topics
+        ldamodel = Lda(bow, num_topics=k, id2word=id2word, passes=40,
+                       iterations=200, chunksize=1000, eval_every=None)
+        # Get the coherence value for the trained model
+        cm = CoherenceModel(model=ldamodel, texts=bigrams,
+                            dictionary=id2word, coherence='c_v')
+        coherence.append((k, cm.get_coherence()))
+        # Append the perplexity for the trained model
+        perplexity.append((k, ldamodel.log_perplexity(bow)))
+    print(coherence)
+    print(perplexity)
+
+#TODO: implement best LDA model here
+
+#TODO: evaluate model and identified topics
+
+
+if __name__ == "__main__":
+    document_directory = "TKTKTKKT"
+    listed_corpus = get_data_from_dir(document_directory)
+    lemmatized_corpus = preprocess(listed_corpus)
+    prepped_corpus, id2word = text_preparation(lemmatized_corpus)