1
0

updating with fit topic model

This commit is contained in:
mgaughan 2025-06-25 13:20:18 -05:00
parent 609039e5cc
commit 7141d0d9ad
5 changed files with 31 additions and 33389 deletions

BIN
models/062525bertopic Normal file

Binary file not shown.

4
models/bertopic_job.log Normal file
View File

@ -0,0 +1,4 @@
setting up the environment by loading in conda environment at Wed Jun 25 12:06:59 CDT 2025
running the bertopic job at Wed Jun 25 12:06:59 CDT 2025
2025-06-25 12:09:02,952 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
350

View File

@ -8,10 +8,14 @@
#SBATCH --mem=64G #SBATCH --mem=64G
#SBATCH --cpus-per-task=4 #SBATCH --cpus-per-task=4
#SBATCH --job-name=SLR_OCR #SBATCH --job-name=SLR_OCR
#SBATCH --output=slr_ocr_job.log #SBATCH --output=bertopic_job.log
#SBATCH --mail-type=BEGIN,END,FAIL #SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu #SBATCH --mail-user=gaughan@u.northwestern.edu
module purge
eval "$(conda shell.bash hook)"
echo "setting up the environment by loading in conda environment at $(date)" echo "setting up the environment by loading in conda environment at $(date)"
conda activate bertopic-env conda activate bertopic-env

View File

@ -1,9 +1,10 @@
from bertopic import BERTopic from bertopic import BERTopic
from nltk.corpus import stopwords
import os import os
import re import re
from markdown import markdown from markdown import markdown
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import string
#function generated by GitHub CoPilot #function generated by GitHub CoPilot
def strip_markdown(md_text): def strip_markdown(md_text):
@ -73,11 +74,29 @@ def get_all_md_sections(directory):
all_sections.extend(clean_sections) all_sections.extend(clean_sections)
return all_sections return all_sections
#function generated by GitHub Copilot
def clean_text(text, stop_words):
    """Return *text* with punctuation and stopwords removed.

    Punctuation is every character in ``string.punctuation``. A word is
    dropped when its lowercase form is in *stop_words* or is one of the
    extra corpus-specific terms "software" and "project". Remaining words
    keep their original casing and are joined with single spaces.

    Args:
        text: The document text to clean.
        stop_words: Iterable of lowercase stopwords. It is not modified.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Build a local exclusion set rather than calling .add() on the argument:
    # the caller's collection stays untouched and any iterable is accepted.
    excluded = set(stop_words) | {"software", "project"}
    # Remove punctuation in one pass.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords (case-insensitive match, original casing preserved).
    kept = [word for word in stripped.split() if word.lower() not in excluded]
    return ' '.join(kept)
if __name__ == "__main__": if __name__ == "__main__":
directory = "/home/nws8519/git/adaptation-slr/studies/" directory = "/home/nws8519/git/adaptation-slr/studies/"
docs = get_all_md_sections(directory) docs = get_all_md_sections(directory)
#cleaning (largely just removing stopwords and punctuation)
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
cleaned_docs = [clean_text(d, stop_words) for d in docs]
print(len(cleaned_docs))
#handing off the cleaned documents to the topic model
topic_model = BERTopic() topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(docs) topics, probabilities = topic_model.fit_transform(cleaned_docs)
topic_model.get_topic_info() topic_model.get_topic_info()
topic_model.get_document_info(docs) topic_model.get_document_info(docs)
topic_model.save("/home/nws8519/git/adaptation-slr/models/", serialization="pickle") topic_model.save("/home/nws8519/git/adaptation-slr/models/062525bertopic", serialization="pickle")

File diff suppressed because it is too large Load Diff