# adaptation-slr/models/bertopic_modeling.py

from bertopic import BERTopic

import os
import re
from markdown import markdown
from bs4 import BeautifulSoup

#function generated by GitHub CoPilot
def strip_markdown(md_text):
    """Return the plain-text content of a Markdown string.

    The Markdown is first rendered to HTML; the text of that HTML is then
    extracted with newlines between text nodes, and leading/trailing
    whitespace is removed from the result.
    """
    rendered_html = markdown(md_text)
    plain_text = BeautifulSoup(rendered_html, "html.parser").get_text(separator="\n")
    return plain_text.strip()

#function generated by GitHub CoPilot
def split_omd_sections(md_content):
    """Split Markdown text into sections that each begin at an ATX heading.

    A heading is a line starting with one to six ``#`` characters followed
    by a space. Any lines before the first heading form their own section.
    Returns a list of section strings (lines re-joined with newlines); empty
    input yields an empty list.
    """
    chunks = []
    pending = []
    for text_line in md_content.splitlines():
        starts_heading = re.match(r'^#{1,6} ', text_line) is not None
        # A heading closes whatever has been accumulated so far.
        if starts_heading and pending:
            chunks.append('\n'.join(pending))
            pending = []
        pending.append(text_line)
    # Flush the trailing section, if any lines were seen at all.
    if pending:
        chunks.append('\n'.join(pending))
    return chunks

#function generated by GitHub CoPilot
def split_md_sections(md_content):
    """Split Markdown text into sections at headings and numbered titles.

    A new section starts at either of:

    * an ATX heading: a line beginning with one to six ``#`` characters
      followed by a space; or
    * a numbered title line: a non-blank line whose neighbours are blank (or
      absent, at the start/end of the text) and whose stripped text is one or
      more digits, a space, and then a character that is neither a digit nor
      a dot (e.g. ``1 Introduction``, but not ``2`` or ``3.1 Methods``).

    Lines before the first such boundary form their own section. Returns a
    list of section strings (lines re-joined with newlines); empty input
    yields an empty list.
    """
    rows = md_content.splitlines()
    last_index = len(rows) - 1

    def blank_at(pos):
        return not rows[pos].strip()

    def starts_section(pos):
        row = rows[pos]
        if re.match(r'^#{1,6} ', row):
            return True
        if blank_at(pos):
            return False
        # A numbered title must be isolated: blank (or no) line on both sides.
        if pos > 0 and not blank_at(pos - 1):
            return False
        if pos < last_index and not blank_at(pos + 1):
            return False
        # Digits, a space, then something that is not a digit or a dot.
        return re.match(r'^\d+ [^\d\.].*', row.strip()) is not None

    grouped = []
    bucket = []
    for pos, row in enumerate(rows):
        if bucket and starts_section(pos):
            grouped.append('\n'.join(bucket))
            bucket = []
        bucket.append(row)
    if bucket:
        grouped.append('\n'.join(bucket))
    return grouped


#function generated by GitHub CoPilot
def get_all_md_sections(directory):
    """Collect plain-text sections from every ``.md`` file in a directory.

    Only entries directly inside ``directory`` whose name ends in ``.md`` are
    read (no recursion). Each file is split with ``split_md_sections`` and
    every non-blank section is converted to plain text with
    ``strip_markdown``.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list of non-empty plain-text section strings, ordered by file name
        and then by position within each file.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    all_sections = []
    # os.listdir() returns entries in arbitrary order; sort so the document
    # order (and the index alignment of anything computed from it) is
    # reproducible between runs and machines.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.md'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, encoding="utf-8") as f:
            content = f.read()
        for section in split_md_sections(content):
            if not section.strip():
                continue
            text = strip_markdown(section)
            # A section made up only of markup can render to no text at all;
            # skip it rather than emit an empty document.
            if text:
                all_sections.append(text)
    return all_sections

if __name__ == "__main__":
    # Script entry point: build the corpus from the study notes, fit a
    # BERTopic model on it, report the results and persist the model.
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    model_dir = "/home/nws8519/git/adaptation-slr/models/"

    docs = get_all_md_sections(directory)
    if not docs:
        # Fail early with a clear message instead of an obscure error from
        # deep inside the model fitting.
        raise SystemExit(f"No markdown sections found in {directory}")

    topic_model = BERTopic()
    topics, probabilities = topic_model.fit_transform(docs)

    # These return tables; when run as a script (not in a notebook) they must
    # be printed explicitly or the results are silently discarded.
    print(topic_model.get_topic_info())
    print(topic_model.get_document_info(docs))

    # With serialization="pickle" the path is opened as a single file, so it
    # must name a file rather than the models directory itself.
    topic_model.save(os.path.join(model_dir, "bertopic_model.pkl"), serialization="pickle")