54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
import os
|
|
import re
|
|
import datetime
|
|
|
|
import mwparserfromhell
|
|
|
|
def read_file(filename, encoding="utf-8"):
    """Return the full contents of a text file as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale
            (plain ``open(filename, 'r')`` would use the locale encoding).

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
|
|
|
|
def parse_talkpage(wikitext):
    """Split talk-page wikitext into signed comments.

    The page is walked line by line. Heading lines (``== x ==``,
    ``=== x ===``, ``==== x ====``) update the current section context and
    are never part of a comment. Every other line is appended to the
    pending comment; a line containing a ``[[User talk:name|label]]`` link
    is treated as the signature that closes the pending comment.

    Args:
        wikitext: Raw wikitext of the talk page.

    Returns:
        A list of dicts, one per signed comment, in page order. Each dict
        has the keys:

        * ``text``: accumulated lines up to the first occurrence of the
          signature link (lines are concatenated without a separator).
        * ``title``, ``header``, ``subheader``: the raw level-2/3/4
          heading lines in effect, or ``""`` when none applies.
        * ``author_talk``: the matched ``[[User talk:...]]`` link.
        * ``draft_time``: only present when the text after the signature
          ends with ``(UTC)``; the last (up to) five whitespace-separated
          tokens, e.g. ``['12:34,', '5', 'June', '2013', '(UTC)']``.
    """
    signature_re = re.compile(r"\[\[User talk:[^\|]+\|[^\]]+\]\]")
    lines = mwparserfromhell.parse(wikitext).split('\n')

    title = header = subheader = ""
    pending = []   # lines of the comment currently being accumulated
    comments = []

    for line in lines:
        level = _heading_level(line)
        if level == 2:
            # A new section invalidates the nested headings beneath the
            # previous one.
            title, header, subheader = line, "", ""
            continue
        if level == 3:
            header, subheader = line, ""
            continue
        if level == 4:
            subheader = line
            continue

        pending.append(line)
        match = signature_re.search(line)
        if match is None:
            continue

        user = match.group()
        parts = "".join(pending).split(user)
        # A fresh dict per comment so an earlier comment's draft_time
        # cannot leak into one whose timestamp was not recognised.
        entry = {
            'text': parts[0],
            'title': title,
            'header': header,
            'subheader': subheader,
            'author_talk': user,
        }

        # Whitespace-agnostic split: trailing blanks after "(UTC)" must not
        # hide the timestamp.
        tokens = parts[-1].split()[-5:]
        if tokens and tokens[-1] == "(UTC)":
            entry['draft_time'] = tokens
            # TODO: convert to a datetime; " ".join(tokens) has the shape
            # "%H:%M, %d %B %Y (UTC)".

        comments.append(entry)
        pending = []

    return comments


def _heading_level(line):
    """Return 2, 3 or 4 for a heading line of that depth, or 0 otherwise."""
    # Deepest level first: a `=== x ===` line also satisfies the `== x ==`
    # pattern, so testing level 2 first would shadow levels 3 and 4.
    if re.search(r"^====.*?====$", line):
        return 4
    if re.search(r"^===.*?===$", line):
        return 3
    if re.search(r"^==.*?==$", line):
        return 2
    return 0
|
|
|
|
if __name__ == "__main__":
    # Work from the script's own directory so the relative dump path below
    # resolves the same way regardless of where the script is launched.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    # Load the wikitext dump and run the talk-page parser over it.
    parse_talkpage(read_file('../../wikitext-dump/ve-rfc-dump.txt'))