1
0

updating with fit topic model

This commit is contained in:
mgaughan 2025-06-25 13:20:18 -05:00
parent 609039e5cc
commit 7141d0d9ad
5 changed files with 31 additions and 33389 deletions

BIN
models/062525bertopic Normal file

Binary file not shown.

4
models/bertopic_job.log Normal file
View File

@ -0,0 +1,4 @@
setting up the environment by loading in conda environment at Wed Jun 25 12:06:59 CDT 2025
running the bertopic job at Wed Jun 25 12:06:59 CDT 2025
2025-06-25 12:09:02,952 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
350

View File

@ -8,10 +8,14 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=SLR_OCR
#SBATCH --output=slr_ocr_job.log
#SBATCH --output=bertopic_job.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
module purge
eval "$(conda shell.bash hook)"
echo "setting up the environment by loading in conda environment at $(date)"
conda activate bertopic-env

View File

@ -1,9 +1,10 @@
from bertopic import BERTopic
from nltk.corpus import stopwords
import os
import re
from markdown import markdown
from bs4 import BeautifulSoup
import string
# Function generated by GitHub Copilot.
def strip_markdown(md_text):
@ -73,11 +74,29 @@ def get_all_md_sections(directory):
all_sections.extend(clean_sections)
return all_sections
# Function generated by GitHub Copilot.
def clean_text(text, stop_words):
    """Return ``text`` with punctuation and stopwords removed.

    Punctuation (``string.punctuation``) is deleted, the result is split on
    whitespace, and every word whose lowercase form is a stopword is dropped.
    The words "software" and "project" are always treated as stopwords in
    addition to whatever ``stop_words`` contains.

    Args:
        text: The raw document string to clean.
        stop_words: Iterable of lowercase stopwords. It is NOT modified;
            a local copy is extended with the extra words instead.

    Returns:
        The surviving words, in their original case, joined by single spaces.
    """
    # Build a private exclusion set rather than calling stop_words.add(),
    # so the caller's collection is left untouched and any iterable works.
    excluded = set(stop_words) | {"software", "project"}
    # Remove punctuation in a single pass.
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    kept = [word for word in without_punct.split() if word.lower() not in excluded]
    return ' '.join(kept)
if __name__ == "__main__":
    # Directory scanned by get_all_md_sections for the documents to model.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)

    # Cleaning: largely just removing stopwords and punctuation.
    # The NLTK stopword corpus has to be available locally; if it is missing,
    # run nltk.download('stopwords') once (requires `import nltk`).
    stop_words = set(stopwords.words('english'))
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))

    # Hand off to the topic model. Fit exactly once, on the cleaned text:
    # an earlier fit on the raw `docs` would be a full wasted training run
    # whose results are immediately overwritten.
    topic_model = BERTopic()
    topics, probabilities = topic_model.fit_transform(cleaned_docs)

    # NOTE(review): the return values of these two calls are discarded, so
    # they produce no output in a batch run -- print or save them if the
    # tables are wanted in the job log.
    topic_model.get_topic_info()
    # NOTE(review): the model was fit on cleaned_docs but is queried with the
    # raw docs here -- confirm that pairing is intended.
    topic_model.get_document_info(docs)

    # Save to a concrete file path; pickle serialization writes a single file,
    # so pointing it at the bare models/ directory cannot work.
    # Pickled models must be loaded in the same environment that saved them
    # (see the BERTopic warning emitted at save time).
    topic_model.save("/home/nws8519/git/adaptation-slr/models/062525bertopic", serialization="pickle")

File diff suppressed because it is too large Load Diff