From a9770091ceb677802c15c16b7ad99dd17de50645 Mon Sep 17 00:00:00 2001
From: mjgaughan
Date: Wed, 24 Apr 2024 23:57:49 -0500
Subject: [PATCH] draft of lda topic modeling

---
 text_analysis/topicModel.py | 86 +++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 text_analysis/topicModel.py

diff --git a/text_analysis/topicModel.py b/text_analysis/topicModel.py
new file mode 100644
index 0000000..8e51cf5
--- /dev/null
+++ b/text_analysis/topicModel.py
@@ -0,0 +1,86 @@
+import re
+import numpy as np
+import pandas as pd
+import glob
+import copy
+
+# Gensim
+import gensim
+import gensim.corpora as corpora
+from gensim.corpora import Dictionary
+from gensim.utils import simple_preprocess
+from gensim.models import CoherenceModel, Phrases
+
+# spacy and nltk for lemmatization
+import spacy
+from nltk.corpus import stopwords
+from nltk.stem.wordnet import WordNetLemmatizer
+
+stopwords = stopwords.words('english')
+
+#loading data in, getting misc descriptors
+def get_data_from_dir(directory):
+    files = glob.glob(f"{directory}/*")
+    data_list = []
+    for file in files:
+        with open(file, encoding='utf-8') as f:
+            text = f.read()
+        #TODO: here is where we can get data about word length and document length
+        data_list.append(text)
+    return data_list
+
+#preprocessing text data
+def preprocess(corpus_list):
+    D = copy.copy(corpus_list)
+    #mvp right now, can certainly be expanded as iterations of text analysis are done
+    D = [[token for token in simple_preprocess(doc) if token not in stopwords] for doc in D]
+    lemmatizer = WordNetLemmatizer()
+    D_lemma = [[lemmatizer.lemmatize(token) for token in doc] for doc in D]
+    return D_lemma
+
+#preparing processed data for model usage
+def text_preparation(lemmatized_text):
+    #bigrams
+    D_bigrams = copy.copy(lemmatized_text)
+    bigram = Phrases(D_bigrams, min_count=2)
+    for i in range(len(lemmatized_text)):
+        for token in bigram[D_bigrams[i]]:
+            if '_' in token:
+                D_bigrams[i].append(token)
+    #id2word
+    id2word = Dictionary(D_bigrams)
+    id2word.filter_extremes(no_below=2, no_above=0.5)
+    #bow representation
+    bag_of_words = [id2word.doc2bow(doc) for doc in D_bigrams]
+    return bag_of_words, id2word
+
+#TODO: identify best LDA model here
+def lda_model_identification(bow, id2word, bigrams):
+    coherence = []
+    perplexity = []
+    # Sweep over candidate numbers of topics
+    for k in [5, 10, 15, 20]:
+        print('num of topics // k = ' + str(k))
+        Lda = gensim.models.ldamodel.LdaModel
+        # Train a model for that number of topics
+        ldamodel = Lda(bow, num_topics=k, id2word=id2word, passes=40,
+                       iterations=200, chunksize=1000, eval_every=None)
+        # Get the coherence value for the trained model
+        cm = CoherenceModel(model=ldamodel, texts=bigrams,
+                            dictionary=id2word, coherence='c_v')
+        coherence.append((k, cm.get_coherence()))
+        # Append the perplexity for the trained model
+        perplexity.append((k, ldamodel.log_perplexity(bow)))
+    print(coherence)
+    print(perplexity)
+
+#TODO: implement best LDA model here
+
+#TODO: evaluate model and identified topics
+
+
+if __name__ == "__main__":
+    document_directory = "TKTKTKKT"
+    listed_corpus = get_data_from_dir(document_directory)
+    lemmatized_corpus = preprocess(listed_corpus)
+    prepped_corpus, id2word = text_preparation(lemmatized_corpus)