bertopic information added

2025-07-23 17:18:33 -05:00 · 2025-07-23 17:18:33 -05:00 · 7d0b1339fc
commit 7d0b1339fc
parent 28cad5a5fd
7 changed files with 8591 additions and 0 deletions
--- a/models/.bertopic_analysis.py.swp
+++ b/models/.bertopic_analysis.py.swp
--- a/models/062725_topic_viz
+++ b/models/062725_topic_viz
--- a/models/062725_topic_viz.html
+++ b/models/062725_topic_viz.html
--- a/models/bertopic_analysis.py
+++ b/models/bertopic_analysis.py
@ -0,0 +1,91 @@
+from bertopic import BERTopic
+from nltk.corpus import stopwords
+import os
+import re
+from markdown import markdown
+from bs4 import BeautifulSoup
+import string 
+
# Function generated by GitHub Copilot.
def strip_markdown(md_text):
    """Return the plain text of a Markdown string.

    The Markdown is first rendered to HTML, then the HTML is parsed and only
    its text content is kept.

    Args:
        md_text: Markdown source as a string.

    Returns:
        The text of the rendered document, with individual text nodes joined
        by newlines and leading/trailing whitespace removed.
    """
    rendered = BeautifulSoup(markdown(md_text), "html.parser")
    plain = rendered.get_text(separator="\n")
    return plain.strip()
+
# Function generated by GitHub Copilot.
def split_md_sections(md_content):
    """Split Markdown text into sections, one per heading or numbered title.

    A new section starts at every line that is either

    * an ATX heading: one to six ``#`` characters followed by a space, or
    * a numbered title: a line that (after trimming) looks like
      ``3 Related Work`` -- digits, a single space, then a character that is
      neither a digit nor a period -- and whose neighbouring lines are blank
      or do not exist (start/end of the document).

    Anything before the first such line becomes its own leading section.

    Args:
        md_content: Markdown source as a single string.

    Returns:
        A list of section strings in document order. Each section is the
        original lines joined with newlines, beginning with the heading or
        title line that opened it. Empty input yields an empty list.
    """
    # Compile once instead of looking the patterns up again for every line.
    heading_re = re.compile(r'^#{1,6} ')
    numbered_title_re = re.compile(r'^\d+ [^\d\.].*')

    lines = md_content.splitlines()
    last_idx = len(lines) - 1

    def is_blank(idx):
        return not lines[idx].strip()

    def starts_section(idx):
        line = lines[idx]
        if heading_re.match(line):
            return True
        # Not a heading: only a numbered title can open a section. A blank
        # line can never match because the pattern requires a leading digit.
        if not numbered_title_re.match(line.strip()):
            return False
        # The title must stand alone, separated from body text by blank
        # lines (or by the start/end of the document).
        before_blank = idx == 0 or is_blank(idx - 1)
        after_blank = idx == last_idx or is_blank(idx + 1)
        return before_blank and after_blank

    sections = []
    current_section = []
    for idx, line in enumerate(lines):
        # Close the running section before the line that opens the next one.
        if current_section and starts_section(idx):
            sections.append('\n'.join(current_section))
            current_section = []
        current_section.append(line)
    if current_section:
        sections.append('\n'.join(current_section))
    return sections
+
+
# Function generated by GitHub Copilot.
def get_all_md_sections(directory):
    """Collect the plain-text sections of every ``.md`` file in a directory.

    Only direct entries of ``directory`` whose names end in ``.md`` are read
    (no recursion). Each file is decoded as UTF-8, split with
    ``split_md_sections``, and every non-blank section is converted to plain
    text with ``strip_markdown``.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A flat list of plain-text section strings: files in sorted filename
        order, sections in document order within each file.

    Raises:
        OSError: Propagated from ``os.listdir`` or ``open`` (for example when
            the directory does not exist or a file cannot be read).
    """
    all_sections = []
    # os.listdir() yields entries in arbitrary order; sort so the resulting
    # document order (and any index derived from it) is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.md'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, encoding="utf-8") as f:
            content = f.read()
        # The file is already closed here; parsing needs only the text.
        all_sections.extend(
            strip_markdown(section)
            for section in split_md_sections(content)
            if section.strip()
        )
    return all_sections
+
# Function generated by GitHub Copilot.
def clean_text(text, stop_words):
    """Remove punctuation and stop words from a piece of text.

    Every character in ``string.punctuation`` is deleted, the result is split
    on whitespace, and each word whose lowercase form is a stop word is
    dropped. The words "software" and "project" are always treated as stop
    words in addition to those supplied.

    Args:
        text: The text to clean.
        stop_words: Container of lowercase stop words; anything supporting
            ``in`` works. It is never modified.

    Returns:
        The surviving words, in their original case, joined by single spaces.
    """
    # Kept local so the caller's collection is not altered as a side effect
    # of every call.
    extra_stop_words = {"software", "project"}

    # NOTE(review): punctuation (including apostrophes) is removed before
    # the stop-word lookup, so a contraction such as "don't" becomes "dont"
    # and will not match a stop-word entry spelled "don't".
    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    kept = []
    for word in no_punctuation.split():
        lowered = word.lower()
        if lowered in extra_stop_words or lowered in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)
+
if __name__ == "__main__":
    # Directory holding the Markdown study notes to analyse.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)

    # Cleaning is largely just removing stop words and punctuation.
    # The NLTK stop-word corpus must be available locally; if it is not,
    # run this once beforehand:
    #nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))

    # The inputs are read as UTF-8, so write UTF-8 explicitly; the locale's
    # default encoding may be unable to represent every character.
    # Documents are separated by a line containing only "::::".
    with open('bertopic_docs.txt', 'w', encoding="utf-8") as f:
        for doc in cleaned_docs:
            f.write(doc + "\n::::\n")

    # Work in progress, currently disabled: inspect a previously saved model.
    #topic_model = BERTopic.load('/home/nws8519/git/adaptation-slr/models/062525bertopic')

    #document_info = topic_model.get_document_info(cleaned_docs)
    #for each document in document_i
    #print(document_info)
    #print(topic_model.get_representative_docs())
+    
+
+
--- a/models/bertopic_analysis.sh
+++ b/models/bertopic_analysis.sh
@ -0,0 +1,27 @@
#!/bin/bash
# Slurm batch job that runs models/bertopic_analysis.py inside the
# "bertopic-env" conda environment. Submit with: sbatch bertopic_analysis.sh
#
# Resource request: one node with one A100 GPU, 4 CPU cores and 64 GB of
# memory, for at most 24 hours.
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
# A batch script's commands run only on the first allocated node unless they
# are launched through srun; this job starts a single python process, so a
# second node would sit idle for the whole allocation.
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=SLR_BERTopic_topic_analysis
# stdout and stderr of the job are written to this file.
#SBATCH --output=bertopic_topic_analysis.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu


# Start from a clean module environment.
module purge

# Make the "conda activate" shell function available in this
# non-interactive shell.
eval "$(conda shell.bash hook)"

echo "setting up the environment by loading in conda environment at $(date)"

# Stop here if the environment cannot be activated, rather than running the
# analysis with whatever python happens to be on PATH.
conda activate bertopic-env || { echo "failed to activate conda environment bertopic-env" >&2; exit 1; }

echo "running the bertopic job at $(date)"

python /home/nws8519/git/adaptation-slr/models/bertopic_analysis.py
+
--- a/models/bertopic_docs.txt
+++ b/models/bertopic_docs.txt
--- a/models/bertopic_topic_analysis.log
+++ b/models/bertopic_topic_analysis.log
@ -0,0 +1,3 @@
+setting up the environment by loading in conda environment at Thu Jun 26 15:43:35 CDT 2025
+running the bertopic job at Thu Jun 26 15:43:35 CDT 2025
+350