import os
import re
import datetime

import mwparserfromhell

# Section-heading patterns. A line such as "=== x ===" also satisfies the
# two-equals pattern, so callers must test the most specific pattern first.
_SUBHEADER_RE = re.compile(r"^====.*?====$")
_HEADER_RE = re.compile(r"^===.*?===$")
_TITLE_RE = re.compile(r"^==.*?==$")

# A "[[User talk:Name|label]]" link; its presence on a line is treated as the
# signature that ends a comment.
_SIGNATURE_RE = re.compile(r"\[\[User talk:[^\|]+\|[^\]]+\]\]")


def read_file(filename):
    """Return the entire content of the text file *filename*.

    The file is decoded as UTF-8 rather than with the platform default, so
    the result does not depend on the machine's locale.
    NOTE(review): assumes the wikitext dump is UTF-8 encoded -- confirm.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()


def parse_talkpage(wikitext: str) -> list:
    """Split talk-page wikitext into signed comments.

    The text is walked line by line. Heading lines update the current
    section context and are not part of any comment. Every other line is
    appended (without a separator) to the pending comment, and a line that
    contains a ``[[User talk:...|...]]`` link closes that comment.

    Args:
        wikitext: Raw wikitext of the talk page.

    Returns:
        A list with one dict per signed comment, in page order. Keys:
        ``text`` (accumulated text before the signature link), ``title``,
        ``header`` and ``subheader`` (the raw ``==``, ``===`` and ``====``
        heading lines in effect, or ``""``), ``author_talk`` (the matched
        user-talk link) and, only when the text after the signature ends in
        ``(UTC)``, ``draft_time`` (the last five whitespace-separated tokens
        of that text). Unsigned trailing text is not returned.
    """
    wikicode = mwparserfromhell.parse(wikitext)
    # Wikicode forwards str methods, so this yields a list of plain lines.
    lines = wikicode.split('\n')

    comments = []
    pending = ""
    current_title = ""
    current_header = ""
    subheader = ""

    for line in lines:
        # Most specific heading first; otherwise the "==" pattern swallows
        # the deeper levels and header/subheader are never recorded.
        if _SUBHEADER_RE.search(line):
            subheader = line
            continue
        if _HEADER_RE.search(line):
            current_header = line
            subheader = ""  # a new header ends the previous subheader
            continue
        if _TITLE_RE.search(line):
            current_title = line
            current_header = ""  # a new section ends the nested headings
            subheader = ""
            continue

        pending += line
        match = _SIGNATURE_RE.search(line)
        if not match:
            continue

        user = match.group()
        # A fresh dict per comment, so no field leaks from an earlier one.
        entry = {
            'text': pending.partition(user)[0],
            'title': current_title,
            'header': current_header,
            'subheader': subheader,
            'author_talk': user,
        }

        # Whitespace splitting ignores trailing blanks, which previously
        # left an empty last token and made the "(UTC)" check always fail.
        time_tokens = pending.rpartition(user)[2].split()[-5:]
        if time_tokens and time_tokens[-1] == "(UTC)":
            # TODO: convert to a datetime, e.g.
            # datetime.datetime.strptime(" ".join(time_tokens),
            #                            "%H:%M, %d %B %Y (UTC)")
            entry['draft_time'] = time_tokens

        comments.append(entry)
        pending = ""

    return comments


if __name__ == "__main__":
    # Resolve the relative dump path against this script's own directory.
    file_directory = os.path.dirname(os.path.abspath(__file__))
    os.chdir(file_directory)
    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
    parse_talkpage(wikitext)