1
0

first draft of conversation parsing for wiki talk pages

This commit is contained in:
mgaughan 2024-12-09 14:56:31 -06:00
parent aa30d02c48
commit b57e4c15a3
2 changed files with 40 additions and 7 deletions

View File

@ -7,7 +7,7 @@
[ ] page edit histories
[ ] who said it
[ ~ ] talk pages?
[ ~ ] who said what and when
[ ] in response to who
[ ] structure both of those in json files
[ x ] who said what and when
[ x ] in response to who
[ ~ ] structure both of those in json files
[ ] construct folders of the tagged json files

View File

@ -1,6 +1,8 @@
import os
import re
import datetime
import uuid
import json
import mwparserfromhell
@ -18,6 +20,8 @@ def parse_talkpage(wikitext):
current_header = ""
subheader = ""
comment_dict = {}
thread_array = []
discussion_array = []
for cell in arraytext:
if re.search(r"^==.*?==$", cell):
current_title = cell
@ -29,26 +33,55 @@ def parse_talkpage(wikitext):
subheader = cell
continue
comment += cell
match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell)
match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
#match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
if match:
comment_id = uuid.uuid4()
user = match.group()
split_comment = comment.split(user)
comment_dict['id'] = str(comment_id)
comment_dict['text'] = split_comment[0]
comment_dict['title'] = current_title
comment_dict['header'] = current_header
comment_dict['subheader'] = subheader
comment_dict['author_talk'] = user
comment_dict['author'] = user.split("|")[0][12:]
# doing stuff to figure out replies
if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
comment_dict['thread'] = []
thread_array = [comment_dict['id']]
else:
comment_ = comment_dict['text']
level = 0
while comment_.startswith('*') or comment_.startswith(':'):
level += 1
comment_ = comment_[1:]
thread_array = thread_array[:level]
comment_dict['thread'] = thread_array
thread_array.append(comment_dict['id'])
# doing stuff to get the timestamp
string_time = split_comment[-1].split(" ")[-5:]
if string_time[-1] == "":
string_time = split_comment[-1].split(" ")[-6:]
string_time[0] = string_time[0][-6:]
if string_time[-1] == "(UTC)":
comment_dict['draft_time'] = string_time
comment_dict['time'] = " ".join(string_time)
#comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
#print(comment_dict)
discussion_array.append(comment_dict)
comment = ""
comment_dict = {}
return discussion_array
def json_it(array_of_dicts, filename):
    """Serialize ``array_of_dicts`` to JSON and write it to ``filename``.

    The payload is fully encoded before the file is opened, so a
    serialization error is raised without touching ``filename``. The file
    is opened in text write mode, which replaces any existing content.
    """
    payload = json.dumps(array_of_dicts)
    with open(filename, 'w') as out_handle:
        out_handle.write(payload)
if __name__ == "__main__":
    # Switch to this script's directory so the relative paths below (the
    # input dump and the output file) resolve against it, not against the
    # directory the script was launched from.
    file_directory = os.path.dirname(os.path.abspath(__file__))
    os.chdir(file_directory)
    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
    # Parse the dump a single time and keep the result for serialization.
    json_discussion = parse_talkpage(wikitext)
    json_it(json_discussion, "test.json")