# adaptation-slr/models/bertopic_analysis.py
from bertopic import BERTopic
from nltk.corpus import stopwords
import os
import re
from markdown import markdown
from bs4 import BeautifulSoup
import string

# function generated by GitHub Copilot
def strip_markdown(md_text):
    # Convert markdown to HTML, then extract the plain text
    html = markdown(md_text)
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n").strip()
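
# Illustrative example (not from the original): strip_markdown("## Methods\n\nWe *tested* it.")
# returns the visible text ("Methods", "We", "tested", "it.") with the markup removed.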

# function generated by GitHub Copilot
def split_md_sections(md_content):
    sections = []
    current_section = []
    lines = md_content.splitlines()
    num_lines = len(lines)

    def is_heading(line):
        return re.match(r'^#{1,6} ', line)

    def is_title_line(idx):
        # A title line is surrounded by blank lines and is not itself blank or a heading
        if is_heading(lines[idx]) or not lines[idx].strip():
            return False
        before_blank = (idx == 0) or not lines[idx - 1].strip()
        after_blank = (idx == num_lines - 1) or not lines[idx + 1].strip()
        # Require a numbered title such as "3 Results"; a bare number or
        # dotted numbering (e.g. "3.2") on its own line does not count
        line = lines[idx].strip()
        substantial = bool(re.match(r'^\d+ [^\d\.].*', line))
        return before_blank and after_blank and substantial

    for i, line in enumerate(lines):
        # A heading or title line closes the current section and starts a new one
        if is_heading(line) or is_title_line(i):
            if current_section:
                sections.append('\n'.join(current_section))
                current_section = []
        current_section.append(line)
    if current_section:
        sections.append('\n'.join(current_section))
    return sections
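
# Illustrative example (not from the original):
# split_md_sections("# Intro\nfirst para\n## Methods\nsecond para")
# -> ["# Intro\nfirst para", "## Methods\nsecond para"]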

# function generated by GitHub Copilot
def get_all_md_sections(directory):
    all_sections = []
    for filename in os.listdir(directory):
        if filename.endswith('.md'):
            filepath = os.path.join(directory, filename)
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            sections = split_md_sections(content)
            clean_sections = [strip_markdown(section) for section in sections if section.strip()]
            all_sections.extend(clean_sections)
    return all_sections

# function generated by GitHub Copilot
def clean_text(text, stop_words):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords (case-insensitive)
    words = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(words)
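
# Illustrative example (not from the original):
# clean_text("The software project was tested.", {"the", "was", "software", "project"})
# -> "tested"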

if __name__ == "__main__":
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)
    # cleaning (largely just removing stopwords and punctuation)
    # import nltk; nltk.download('stopwords')  # run once if the stopword list is missing
    stop_words = set(stopwords.words('english'))
    # corpus-specific stopwords
    stop_words.update({"software", "project"})
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))
    # Write the cleaned documents out, separated by a '::::' delimiter line
    with open('bertopic_docs.txt', 'w', encoding="utf-8") as f:
        for doc in cleaned_docs:
            f.write(doc + "\n::::\n")
    # Exploration of a previously saved model, left commented out:
    # topic_model = BERTopic.load('/home/nws8519/git/adaptation-slr/models/062525bertopic')
    # document_info = topic_model.get_document_info(cleaned_docs)
    # for each document in document_info:
    # print(document_info)
    # print(topic_model.get_representative_docs())
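
    # A minimal sketch (an assumption, not the original pipeline) of how the
    # cleaned sections could be fed to BERTopic instead of reloading a saved
    # model; the save path is a hypothetical placeholder.
    # topic_model = BERTopic(language="english")
    # topics, probs = topic_model.fit_transform(cleaned_docs)
    # print(topic_model.get_topic_info())
    # topic_model.save('models/bertopic_new')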