84 lines
2.9 KiB
Python
84 lines
2.9 KiB
Python
from bertopic import BERTopic
|
|
|
|
import os
|
|
import re
|
|
from markdown import markdown
|
|
from bs4 import BeautifulSoup
|
|
|
|
#function generated by GitHub CoPilot
|
|
def strip_markdown(md_text):
    """Return the plain text of a Markdown string.

    The Markdown is rendered to HTML first, and the text content is then
    pulled out of the HTML tree, with a newline placed between text nodes.
    Leading and trailing whitespace is removed from the result.
    """
    rendered = markdown(md_text)
    tree = BeautifulSoup(rendered, "html.parser")
    plain = tree.get_text(separator="\n")
    return plain.strip()
|
|
|
|
#function generated by GitHub CoPilot
|
|
def split_omd_sections(md_content):
    """Split Markdown text into sections delimited by ATX headings.

    A heading is a line that begins with one to six ``#`` characters
    followed by a space. Each heading opens a new section; any text before
    the first heading forms a section of its own. Sections are returned as
    newline-joined strings in document order. Empty input yields ``[]``.
    """
    heading_re = re.compile(r'^#{1,6} ')
    chunks = []
    for text_line in md_content.splitlines():
        # Open a new chunk on every heading, and also for the very first
        # line so that leading non-heading text is kept.
        if heading_re.match(text_line) or not chunks:
            chunks.append([text_line])
        else:
            chunks[-1].append(text_line)
    return ['\n'.join(chunk) for chunk in chunks]
|
|
|
|
#function generated by GitHub CoPilot
|
|
def split_md_sections(md_content):
    """Split Markdown text into sections at headings and numbered titles.

    A new section starts at either of:

    * an ATX heading: a line beginning with one to six ``#`` characters
      followed by a space;
    * a numbered title: a line whose stripped text is digits, one space,
      then a character that is neither a digit nor a dot (for example
      ``1 Introduction``), and whose neighbouring lines are blank. The
      start and end of the text count as blank neighbours.

    Any text before the first section start forms its own section.
    Sections are returned as newline-joined strings in document order.
    Empty input yields ``[]``.
    """
    heading_re = re.compile(r'^#{1,6} ')
    numbered_title_re = re.compile(r'^\d+ [^\d\.].*')
    rows = md_content.splitlines()
    last_pos = len(rows) - 1

    def blank_at(pos):
        # Positions just outside the text are treated as blank lines.
        return pos < 0 or pos > last_pos or not rows[pos].strip()

    def opens_section(pos):
        row = rows[pos]
        if heading_re.match(row):
            return True
        # Blank lines never match here because the pattern needs a digit.
        if not numbered_title_re.match(row.strip()):
            return False
        return blank_at(pos - 1) and blank_at(pos + 1)

    chunks = []
    for pos, row in enumerate(rows):
        if opens_section(pos) or not chunks:
            chunks.append([row])
        else:
            chunks[-1].append(row)
    return ['\n'.join(chunk) for chunk in chunks]
|
|
|
|
|
|
#function generated by GitHub CoPilot
|
|
def get_all_md_sections(directory):
    """Collect the plain-text sections of every ``.md`` file in *directory*.

    Each Markdown file directly inside *directory* (no recursion) is read
    as UTF-8, split with ``split_md_sections`` and every non-blank section
    is converted to plain text with ``strip_markdown``.

    Args:
        directory: Path of the folder holding the Markdown files.

    Returns:
        A flat list of non-empty plain-text section strings, ordered by
        file name and then by position within each file.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    all_sections = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order stable between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.md'):
            continue
        filepath = os.path.join(directory, filename)
        # Skip directories that merely happen to be named "*.md".
        if not os.path.isfile(filepath):
            continue
        with open(filepath, encoding="utf-8") as f:
            content = f.read()
        for section in split_md_sections(content):
            if not section.strip():
                continue
            cleaned = strip_markdown(section)
            # Non-blank Markdown can still render to no text at all (for
            # example a lone horizontal rule); drop such empty documents.
            if cleaned:
                all_sections.append(cleaned)
    return all_sections
|
|
|
|
if __name__ == "__main__":
    # Input corpus (Markdown studies) and output folder for the fitted model.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    model_dir = "/home/nws8519/git/adaptation-slr/models/"

    docs = get_all_md_sections(directory)
    # Fail early with a clear message instead of an obscure error from
    # inside the topic model when there is nothing to fit.
    if not docs:
        raise SystemExit(f"No Markdown sections found in {directory}")

    topic_model = BERTopic()
    topics, probabilities = topic_model.fit_transform(docs)

    # These calls return tables; in a script they must be printed to be
    # seen, otherwise the results are silently discarded.
    print(topic_model.get_topic_info())
    print(topic_model.get_document_info(docs))

    # With serialization="pickle" BERTopic writes a single file, so the
    # target must be a file path rather than a directory.
    # NOTE: pickle files should only be loaded from trusted sources and in
    # an environment matching the one used to save them.
    os.makedirs(model_dir, exist_ok=True)
    topic_model.save(os.path.join(model_dir, "bertopic_model.pkl"), serialization="pickle")
|