1
0
mw-convo-collections/src/lib/wiki_get.py

54 lines
1.8 KiB
Python

import os
import re
import datetime
import mwparserfromhell
def read_file(filename):
    """Return the entire contents of *filename* as a single string.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so non-ASCII text reads the same on every machine.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full file content as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()
# Heading patterns. They must be tried from the deepest level to the
# shallowest: a "==" pattern also matches "===" and "====" lines, so checking
# it first would swallow every deeper heading.
_SUBHEADER_RE = re.compile(r"^====.*?====$")
_HEADER_RE = re.compile(r"^===.*?===$")
_TITLE_RE = re.compile(r"^==.*?==$")

# A piped link to a user talk page; its presence on a line is treated as the
# signature that ends a comment.
_USER_TALK_RE = re.compile(r"\[\[User talk:[^\|]+\|[^\]]+\]\]")


def parse_talkpage(wikitext):
    """Split talk-page wikitext into its signed comments.

    The text is walked line by line. Heading lines update the current
    section context; every other line is buffered until a line containing a
    ``[[User talk:...|...]]`` link is seen, at which point the buffered
    lines are emitted as one comment.

    Args:
        wikitext: Raw wikitext of the talk page, passed to
            ``mwparserfromhell.parse``.

    Returns:
        A list with one dict per signed comment, in page order. Each dict
        has the keys:

        * ``'text'``: buffered lines (joined with newlines) up to the first
          occurrence of the signature link.
        * ``'title'``, ``'header'``, ``'subheader'``: the raw ``==``,
          ``===`` and ``====`` heading lines in effect ("" if none).
        * ``'author_talk'``: the matched user-talk link.
        * ``'draft_time'``: the last five whitespace-separated tokens after
          the signature link when they end in ``"(UTC)"``, otherwise
          ``None``. The tokens are kept unparsed.

        Text that is never followed by a signature line is not returned.
    """
    lines = mwparserfromhell.parse(wikitext).split('\n')

    comments = []
    pending = []
    title = header = subheader = ""

    for line in lines:
        if _SUBHEADER_RE.search(line):
            subheader = line
            continue
        if _HEADER_RE.search(line):
            # A new section invalidates the subsection it used to contain.
            header, subheader = line, ""
            continue
        if _TITLE_RE.search(line):
            title, header, subheader = line, "", ""
            continue

        pending.append(line)
        signature = _USER_TALK_RE.search(line)
        if signature is None:
            continue

        user = signature.group()
        # Keep the line breaks so words on adjacent lines do not fuse.
        body = "\n".join(pending)
        text = body.partition(user)[0]
        # Whatever follows the last signature link should hold the timestamp.
        tail = body.rpartition(user)[2]
        # split() without arguments ignores trailing/repeated whitespace, so
        # a trailing space after "(UTC)" does not hide the timestamp.
        stamp = tail.split()[-5:]
        draft_time = stamp if stamp and stamp[-1] == "(UTC)" else None

        # A fresh dict per comment: nothing leaks from the previous one.
        comments.append({
            'text': text,
            'title': title,
            'header': header,
            'subheader': subheader,
            'author_talk': user,
            'draft_time': draft_time,
        })
        pending = []

    return comments
if __name__ == "__main__":
    # Switch to this module's own directory first so that the relative dump
    # path below resolves no matter where the script was launched from.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parse_talkpage(read_file('../../wikitext-dump/ve-rfc-dump.txt'))