updating with fit topic model
This commit is contained in:
parent
609039e5cc
commit
7141d0d9ad
BIN
models/062525bertopic
Normal file
BIN
models/062525bertopic
Normal file
Binary file not shown.
4
models/bertopic_job.log
Normal file
4
models/bertopic_job.log
Normal file
@ -0,0 +1,4 @@
|
||||
setting up the environment by loading in conda environment at Wed Jun 25 12:06:59 CDT 2025
|
||||
running the bertopic job at Wed Jun 25 12:06:59 CDT 2025
|
||||
2025-06-25 12:09:02,952 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
|
||||
350
|
@ -8,10 +8,14 @@
|
||||
#SBATCH --mem=64G
|
||||
#SBATCH --cpus-per-task=4
|
||||
#SBATCH --job-name=SLR_OCR
|
||||
#SBATCH --output=slr_ocr_job.log
|
||||
#SBATCH --output=bertopic_job.log
|
||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||
|
||||
module purge
|
||||
|
||||
eval "$(conda shell.bash hook)"
|
||||
|
||||
echo "setting up the environment by loading in conda environment at $(date)"
|
||||
|
||||
conda activate bertopic-env
|
||||
|
@ -1,9 +1,10 @@
|
||||
from bertopic import BERTopic
|
||||
|
||||
from nltk.corpus import stopwords
|
||||
import os
|
||||
import re
|
||||
from markdown import markdown
|
||||
from bs4 import BeautifulSoup
|
||||
import string
|
||||
|
||||
#function generated by GitHub Copilot
|
||||
def strip_markdown(md_text):
|
||||
@ -73,11 +74,29 @@ def get_all_md_sections(directory):
|
||||
all_sections.extend(clean_sections)
|
||||
return all_sections
|
||||
|
||||
#function generated by GitHub Copilot
|
||||
def clean_text(text, stop_words):
    """Strip punctuation and stopwords from ``text``.

    Every character in ``string.punctuation`` is deleted, the result is split
    on whitespace, and each word whose lowercase form is a stopword is
    dropped. "software" and "project" are always treated as stopwords in
    addition to the ones supplied by the caller.

    Args:
        text: The string to clean.
        stop_words: Collection of lowercase stopwords supporting ``in``.
            It is only read here, never modified.

    Returns:
        The remaining words joined by single spaces.
    """
    # The extra stopwords live in a local set instead of being added to
    # ``stop_words``: adding them mutated the caller's set on every call and
    # failed for immutable collections such as ``frozenset``.
    extra_stop_words = {"software", "project"}

    # Remove punctuation
    # NOTE: this runs before the stopword check, so a word such as "don't"
    # is compared against the stopwords as "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stopwords
    kept_words = []
    for word in text.split():
        lowered = word.lower()
        if lowered not in stop_words and lowered not in extra_stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
directory = "/home/nws8519/git/adaptation-slr/studies/"
|
||||
docs = get_all_md_sections(directory)
|
||||
#cleaning (largely just removing stopwords and punctuation)
|
||||
#nltk.download('stopwords')
|
||||
stop_words = set(stopwords.words('english'))
|
||||
cleaned_docs = [clean_text(d, stop_words) for d in docs]
|
||||
print(len(cleaned_docs))
|
||||
#handing off to topic
|
||||
topic_model = BERTopic()
|
||||
topics, probabilities = topic_model.fit_transform(docs)
|
||||
topics, probabilities = topic_model.fit_transform(cleaned_docs)
|
||||
topic_model.get_topic_info()
|
||||
topic_model.get_document_info(docs)
|
||||
topic_model.save("/home/nws8519/git/adaptation-slr/models/", serialization="pickle")
|
||||
topic_model.save("/home/nws8519/git/adaptation-slr/models/062525bertopic", serialization="pickle")
|
||||
|
33385
models/slr_ocr_job.log
33385
models/slr_ocr_job.log
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user