updating with fit topic model

2025-06-25 13:20:18 -05:00 · 2025-06-25 13:20:18 -05:00 · 7141d0d9ad
commit 7141d0d9ad
parent 609039e5cc
5 changed files with 31 additions and 33389 deletions
--- a/models/062525bertopic
+++ b/models/062525bertopic
--- a/models/bertopic_job.log
+++ b/models/bertopic_job.log
@ -0,0 +1,4 @@
+setting up the environment by loading in conda environment at Wed Jun 25 12:06:59 CDT 2025
+running the bertopic job at Wed Jun 25 12:06:59 CDT 2025
+2025-06-25 12:09:02,952 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
+350
--- a/models/bertopic_job.sh
+++ b/models/bertopic_job.sh
@ -8,10 +8,14 @@
 #SBATCH --mem=64G
 #SBATCH --cpus-per-task=4
 #SBATCH --job-name=SLR_OCR 
-#SBATCH --output=slr_ocr_job.log
+#SBATCH --output=bertopic_job.log
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --mail-user=gaughan@u.northwestern.edu

+module purge  # start from a clean slate: unload any modules inherited from the submitting shell
+
+eval "$(conda shell.bash hook)"  # load conda's bash integration so `conda activate` works in this batch shell
+
 echo "setting up the environment by loading in conda environment at $(date)"

 conda activate bertopic-env
--- a/models/bertopic_modeling.py
+++ b/models/bertopic_modeling.py
@ -1,9 +1,10 @@
 from bertopic import BERTopic
-
+from nltk.corpus import stopwords
 import os
 import re
 from markdown import markdown
 from bs4 import BeautifulSoup
+import string 

 # function generated by GitHub Copilot
 def strip_markdown(md_text):
@ -73,11 +74,29 @@ def get_all_md_sections(directory):
                all_sections.extend(clean_sections)
    return all_sections

+# function generated by GitHub Copilot
+def clean_text(text, stop_words):
+    """Return *text* with ASCII punctuation and stopwords removed.
+
+    Args:
+        text: The document string to clean.
+        stop_words: Iterable of lowercase words to drop. It is only read;
+            "software" and "project" are always dropped in addition to it.
+
+    Returns:
+        The surviving whitespace-separated words joined by single spaces.
+    """
+    # Extend a copy: adding the corpus-specific words to the caller's set
+    # would silently mutate it (and fail outright for a list or frozenset).
+    blocked = set(stop_words) | {"software", "project"}
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+    # Remove stopwords. Only the text side is lower-cased, and the lookup
+    # happens after punctuation stripping, so a contraction such as "don't"
+    # has already become "dont" by the time it is compared.
+    words = [word for word in text.split() if word.lower() not in blocked]
+    return ' '.join(words)
+
+
 if __name__ == "__main__":
    directory = "/home/nws8519/git/adaptation-slr/studies/" 
    docs = get_all_md_sections(directory)
+    # Clean every section (largely just removing stopwords and punctuation) before modeling.
+    # Needs the NLTK stopwords corpus; if it is missing, run nltk.download('stopwords') once (requires `import nltk`).
+    stop_words = set(stopwords.words('english'))
+    cleaned_docs = [clean_text(d, stop_words) for d in docs]
+    print(len(cleaned_docs))  # number of sections handed to the model
+    # Hand the cleaned sections off to the topic model.
    topic_model = BERTopic()
-    topics, probabilities = topic_model.fit_transform(docs)
+    topics, probabilities = topic_model.fit_transform(cleaned_docs)
    topic_model.get_topic_info()  # NOTE(review): return value is discarded; print or save it if it is meant to be inspected
    topic_model.get_document_info(docs)  # NOTE(review): gets the uncleaned docs although the model was fit on cleaned_docs, and the result is discarded; confirm intent
-    topic_model.save("/home/nws8519/git/adaptation-slr/models/", serialization="pickle")
+    topic_model.save("/home/nws8519/git/adaptation-slr/models/062525bertopic", serialization="pickle")  # pickle: load only in an identical environment (see the BERTopic warning in the job log)
--- a/models/slr_ocr_job.log
+++ b/models/slr_ocr_job.log