# mw-convo-collections/src/lib/wiki_get.py

import os
import re
import datetime

import mwparserfromhell

def read_file(filename):
    """Return the entire text content of *filename*.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

# Heading patterns ordered from deepest to shallowest.  A line such as
# "====x====" also satisfies the "==...==" pattern, so the most specific
# pattern must be tried first or every heading is classified as a title.
_HEADING_PATTERNS = (
    ('subheader', re.compile(r"^====.*?====$")),
    ('header', re.compile(r"^===.*?===$")),
    ('title', re.compile(r"^==.*?==$")),
)

# A "[[User talk:Name|label]]" link is treated as the end-of-comment marker.
_USER_TALK_LINK = re.compile(r"\[\[User talk:[^\|]+\|[^\]]+\]\]")


def parse_talkpage(wikitext):
    """Split talk-page wikitext into comments delimited by user-talk links.

    The text is processed line by line.  Heading lines update the current
    section context; every other line is appended (without a separator) to
    the pending comment.  When a line contains a ``[[User talk:...|...]]``
    link, the pending text is emitted as one comment and the buffer is reset.

    Args:
        wikitext: Raw wikitext of the talk page.

    Returns:
        A list of dicts, one per detected comment, in document order.  Each
        dict has the keys:

        * ``'text'``: pending text up to the first occurrence of the link.
        * ``'title'``, ``'header'``, ``'subheader'``: the raw heading lines
          (including the ``=`` markers) in effect for the comment, or ``""``.
        * ``'author_talk'``: the matched user-talk link.
        * ``'draft_time'``: present only when the text after the link ends
          with ``(UTC)``; the last five whitespace-separated tokens, e.g.
          ``['12:34,', '5', 'June', '2013', '(UTC)']``.
    """
    wikicode = mwparserfromhell.parse(wikitext)
    lines = str(wikicode).split('\n')

    headings = {'title': "", 'header': "", 'subheader': ""}
    comments = []
    pending = ""

    for line in lines:
        level = next(
            (name for name, pattern in _HEADING_PATTERNS if pattern.search(line)),
            None,
        )
        if level is not None:
            headings[level] = line
            # A new section invalidates deeper headings of the previous one.
            if level == 'title':
                headings['header'] = ""
                headings['subheader'] = ""
            elif level == 'header':
                headings['subheader'] = ""
            continue

        pending += line
        match = _USER_TALK_LINK.search(line)
        if not match:
            continue

        user = match.group()
        # Fresh dict per comment so values never leak between comments.
        entry = {
            'text': pending.partition(user)[0],
            'title': headings['title'],
            'header': headings['header'],
            'subheader': headings['subheader'],
            'author_talk': user,
        }

        # Whitespace split ignores trailing blanks, which previously left an
        # empty last token and made the "(UTC)" check fail.
        tokens = pending.rpartition(user)[2].split()[-5:]
        if tokens and tokens[-1] == "(UTC)":
            # TODO: convert to datetime with "%H:%M, %d %B %Y (UTC)".
            entry['draft_time'] = tokens

        comments.append(entry)
        pending = ""

    return comments

if __name__ == "__main__":
    # Work from this script's own directory so the relative dump path below
    # resolves the same way regardless of where the script is launched from.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parse_talkpage(read_file('../../wikitext-dump/ve-rfc-dump.txt'))