from bertopic import BERTopic
from nltk.corpus import stopwords
import os
import re
from markdown import markdown
from bs4 import BeautifulSoup
import string


# function generated by GitHub CoPilot
def strip_markdown(md_text):
    """Render markdown to HTML, then strip the tags to recover plain text."""
    html = markdown(md_text)
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n").strip()


# function generated by GitHub CoPilot
def split_md_sections(md_content):
    """Split markdown text into sections.

    A new section starts at an ATX heading (``# `` .. ``###### ``) or at a
    standalone "title line": a non-blank, non-heading line surrounded by
    blank lines (or file edges) that looks like ``<number> <text>`` where
    the text does not begin with a digit or a dot.

    Returns a list of section strings (lines re-joined with newlines).
    """
    sections = []
    current_section = []
    lines = md_content.splitlines()
    num_lines = len(lines)

    def is_heading(line):
        return re.match(r'^#{1,6} ', line)

    def is_title_line(idx):
        # A title line is surrounded by blank lines and is not itself blank
        # or a heading.
        if is_heading(lines[idx]) or not lines[idx].strip():
            return False
        before_blank = (idx == 0) or not lines[idx - 1].strip()
        after_blank = (idx == num_lines - 1) or not lines[idx + 1].strip()
        # The line must be shaped like "<number> <text>" where <text> does
        # not start with a digit or a dot (e.g. a numbered study title).
        # NOTE(review): the original comment claimed this excluded "too
        # short" lines; the regex actually *requires* the numbered-title
        # shape — documented here as the regex behaves.
        line = lines[idx].strip()
        substantial = bool(re.match(r'^\d+ [^\d\.].*', line))
        return before_blank and after_blank and substantial

    for i, line in enumerate(lines):
        if is_heading(line) or is_title_line(i):
            # Flush the section accumulated so far before starting a new one.
            if current_section:
                sections.append('\n'.join(current_section))
                current_section = []
        current_section.append(line)
    if current_section:
        sections.append('\n'.join(current_section))
    return sections


# function generated by GitHub CoPilot
def get_all_md_sections(directory):
    """Read every ``*.md`` file in *directory* (non-recursive), split each
    into sections, strip markdown formatting, and return the flat list of
    plain-text sections (empty sections skipped)."""
    all_sections = []
    for filename in os.listdir(directory):
        if filename.endswith('.md'):
            filepath = os.path.join(directory, filename)
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            sections = split_md_sections(content)
            clean_sections = [
                strip_markdown(section) for section in sections if section.strip()
            ]
            all_sections.extend(clean_sections)
    return all_sections


# function generated by GitHub CoPilot
def clean_text(text, stop_words):
    """Return *text* with all punctuation removed and stopwords dropped
    (case-insensitively), including the domain words "software"/"project".

    Unlike the original version, this does NOT mutate the caller's
    *stop_words* set — the original ``stop_words.add(...)`` grew the shared
    set in place on every call.
    """
    # Remove punctuation in one C-level pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Extend the stopword set locally instead of mutating the argument.
    stops = stop_words | {"software", "project"}
    return ' '.join(word for word in text.split() if word.lower() not in stops)


if __name__ == "__main__":
    directory = "/home/nws8519/git/adaptation-slr/studies/"
    docs = get_all_md_sections(directory)

    # cleaning (largely just removing stopwords and punctuation)
    # nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    cleaned_docs = [clean_text(d, stop_words) for d in docs]
    print(len(cleaned_docs))

    # Write docs separated by a "::::" sentinel line; explicit UTF-8 to
    # match how the inputs were read (was locale-dependent before).
    with open('bertopic_docs.txt', 'w', encoding="utf-8") as f:
        for doc in cleaned_docs:
            f.write(doc + "\n::::\n")

    # topic_model = BERTopic.load('/home/nws8519/git/adaptation-slr/models/062525bertopic')
    # document_info = topic_model.get_document_info(cleaned_docs)
    # for each document in document_i
    # print(document_info)
    # print(topic_model.get_representative_docs())