bertopic information added
This commit is contained in:
parent
28cad5a5fd
commit
7d0b1339fc
BIN
models/.bertopic_analysis.py.swp
Normal file
BIN
models/.bertopic_analysis.py.swp
Normal file
Binary file not shown.
3885
models/062725_topic_viz
Normal file
3885
models/062725_topic_viz
Normal file
File diff suppressed because one or more lines are too long
3885
models/062725_topic_viz.html
Normal file
3885
models/062725_topic_viz.html
Normal file
File diff suppressed because one or more lines are too long
91
models/bertopic_analysis.py
Normal file
91
models/bertopic_analysis.py
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
from bertopic import BERTopic
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from markdown import markdown
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import string
|
||||||
|
|
||||||
|
# function generated by GitHub CoPilot
def strip_markdown(md_text):
    """Render markdown to HTML, then return only the visible plaintext.

    Block boundaries become newlines; leading/trailing whitespace is removed.
    """
    rendered = markdown(md_text)
    parsed = BeautifulSoup(rendered, "html.parser")
    return parsed.get_text(separator="\n").strip()
|
||||||
|
|
||||||
|
# function generated by GitHub CoPilot
def split_md_sections(md_content):
    """Split markdown text into sections at headings and standalone title lines.

    A new section starts at a ``#``-style heading (1-6 hashes followed by a
    space) or at a "title line": a non-blank, non-heading line surrounded by
    blank lines whose stripped text begins with digits, a space, and a
    non-digit/non-dot character (e.g. "12 Methods").

    Returns the list of section strings, each re-joined with newlines.
    """
    source_lines = md_content.splitlines()
    total = len(source_lines)

    def _is_heading(text):
        return re.match(r'^#{1,6} ', text)

    def _is_title(pos):
        # A title line is surrounded by blank lines and is neither blank
        # nor a heading itself.
        candidate = source_lines[pos]
        if not candidate.strip() or _is_heading(candidate):
            return False
        blank_above = pos == 0 or not source_lines[pos - 1].strip()
        blank_below = pos == total - 1 or not source_lines[pos + 1].strip()
        # Require a "<number> <word>" shape so short lines (e.g. a bare
        # number) are excluded.
        looks_like_title = bool(re.match(r'^\d+ [^\d\.].*', candidate.strip()))
        return blank_above and blank_below and looks_like_title

    sections = []
    buffer = []
    for pos, text in enumerate(source_lines):
        if _is_heading(text) or _is_title(pos):
            if buffer:
                sections.append('\n'.join(buffer))
            buffer = []
        buffer.append(text)
    if buffer:
        sections.append('\n'.join(buffer))
    return sections
|
||||||
|
|
||||||
|
|
||||||
|
# function generated by GitHub CoPilot
def get_all_md_sections(directory):
    """Read every ``.md`` file in *directory* and return all cleaned sections.

    Each file is split with split_md_sections(); non-empty sections are then
    stripped of markdown formatting with strip_markdown(). The cleaned
    sections from every file are returned as one flat list.
    """
    collected = []
    for entry in os.listdir(directory):
        if not entry.endswith('.md'):
            continue
        path = os.path.join(directory, entry)
        with open(path, encoding="utf-8") as handle:
            raw = handle.read()
        pieces = split_md_sections(raw)
        collected.extend(
            strip_markdown(piece) for piece in pieces if piece.strip()
        )
    return collected
|
||||||
|
|
||||||
|
# function generated by GitHub CoPilot
def clean_text(text, stop_words):
    """Strip punctuation and stopwords from *text*.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : set[str]
        Stopwords to drop (matched case-insensitively). The set is NOT
        modified; the domain-specific words "software" and "project" are
        filtered in addition to the supplied set.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    # Remove punctuation in one C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Bug fix: the original called stop_words.add("software") / .add("project"),
    # silently mutating the caller's set on every invocation. Build a local
    # superset instead so the argument is left untouched.
    blocked = set(stop_words) | {"software", "project"}
    words = [word for word in text.split() if word.lower() not in blocked]
    return ' '.join(words)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Harvest cleaned markdown sections from the SLR studies directory.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)

    # Cleaning (largely just removing stopwords and punctuation).
    # nltk.download('stopwords')  # run once if the corpus is missing
    stop_words = set(stopwords.words('english'))
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))

    # Persist the cleaned documents as "::::"-delimited records.
    # Bug fix: write with an explicit UTF-8 encoding — the inputs are read
    # as UTF-8 in get_all_md_sections, so writing with the platform default
    # could raise UnicodeEncodeError on non-UTF-8 locales.
    with open('bertopic_docs.txt', 'w', encoding="utf-8") as f:
        for doc in cleaned_docs:
            f.write(doc + "\n::::\n")

    # topic_model = BERTopic.load('/home/nws8519/git/adaptation-slr/models/062525bertopic')
    # document_info = topic_model.get_document_info(cleaned_docs)
    # for each document in document_info:
    # print(document_info)
    # print(topic_model.get_representative_docs())
|
||||||
|
|
||||||
|
|
||||||
|
|
27
models/bertopic_analysis.sh
Normal file
27
models/bertopic_analysis.sh
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/bash
# SLURM batch script that runs the BERTopic analysis (bertopic_analysis.py)
# inside the "bertopic-env" conda environment.
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:1
# NOTE(review): --nodes=2 requests two nodes, but the job below is a single
# python process — confirm multi-node allocation is intended.
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --time=24:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=SLR_BERTopic_topic_analysis
#SBATCH --output=bertopic_topic_analysis.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

# Start from a clean module environment.
module purge

# Make `conda activate` work in this non-interactive shell.
eval "$(conda shell.bash hook)"

echo "setting up the environment by loading in conda environment at $(date)"

conda activate bertopic-env

echo "running the bertopic job at $(date)"

python /home/nws8519/git/adaptation-slr/models/bertopic_analysis.py
|
||||||
|
|
700
models/bertopic_docs.txt
Normal file
700
models/bertopic_docs.txt
Normal file
File diff suppressed because one or more lines are too long
3
models/bertopic_topic_analysis.log
Normal file
3
models/bertopic_topic_analysis.log
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
setting up the environment by loading in conda environment at Thu Jun 26 15:43:35 CDT 2025
|
||||||
|
running the bertopic job at Thu Jun 26 15:43:35 CDT 2025
|
||||||
|
350
|
Loading…
Reference in New Issue
Block a user