first draft of conversation parsing for wiki talk pages
This commit is contained in:
parent
aa30d02c48
commit
b57e4c15a3
6
spec.txt
6
spec.txt
@ -7,7 +7,7 @@
|
||||
[ ] page edit histories
|
||||
[ ] who said it
|
||||
[ ~ ] talk pages?
|
||||
[ ~ ] who said what and when
|
||||
[ ] in response to who
|
||||
[ ] structure both of those in json files
|
||||
[ x ] who said what and when
|
||||
[ x ] in response to who
|
||||
[ ~ ] structure both of those in json files
|
||||
[ ] construct folders of the tagged json files
|
||||
|
@ -1,6 +1,8 @@
|
||||
import os
|
||||
import re
|
||||
import datetime
|
||||
import uuid
|
||||
import json
|
||||
|
||||
import mwparserfromhell
|
||||
|
||||
@ -18,6 +20,8 @@ def parse_talkpage(wikitext):
|
||||
current_header = ""
|
||||
subheader = ""
|
||||
comment_dict = {}
|
||||
thread_array = []
|
||||
discussion_array = []
|
||||
for cell in arraytext:
|
||||
if re.search(r"^==.*?==$", cell):
|
||||
current_title = cell
|
||||
@ -29,26 +33,55 @@ def parse_talkpage(wikitext):
|
||||
subheader = cell
|
||||
continue
|
||||
comment += cell
|
||||
match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell)
|
||||
match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
|
||||
#match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
|
||||
if match:
|
||||
comment_id = uuid.uuid4()
|
||||
user = match.group()
|
||||
split_comment = comment.split(user)
|
||||
comment_dict['id'] = str(comment_id)
|
||||
comment_dict['text'] = split_comment[0]
|
||||
comment_dict['title'] = current_title
|
||||
comment_dict['header'] = current_header
|
||||
comment_dict['subheader'] = subheader
|
||||
comment_dict['author_talk'] = user
|
||||
comment_dict['author'] = user.split("|")[0][12:]
|
||||
# doing stuff to figure out replies
|
||||
if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
|
||||
comment_dict['thread'] = []
|
||||
thread_array = [comment_dict['id']]
|
||||
else:
|
||||
comment_ = comment_dict['text']
|
||||
level = 0
|
||||
while comment_.startswith('*') or comment_.startswith(':'):
|
||||
level += 1
|
||||
comment_ = comment_[1:]
|
||||
thread_array = thread_array[:level]
|
||||
comment_dict['thread'] = thread_array
|
||||
thread_array.append(comment_dict['id'])
|
||||
# doing stuff to get the timestamp
|
||||
string_time = split_comment[-1].split(" ")[-5:]
|
||||
if string_time[-1] == "":
|
||||
string_time = split_comment[-1].split(" ")[-6:]
|
||||
string_time[0] = string_time[0][-6:]
|
||||
if string_time[-1] == "(UTC)":
|
||||
comment_dict['draft_time'] = string_time
|
||||
comment_dict['time'] = " ".join(string_time)
|
||||
#comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
|
||||
#print(comment_dict)
|
||||
discussion_array.append(comment_dict)
|
||||
comment = ""
|
||||
comment_dict = {}
|
||||
return discussion_array
|
||||
|
||||
def json_it(array_of_dicts, filename):
    """Serialize ``array_of_dicts`` to JSON and write it to ``filename``.

    Args:
        array_of_dicts: A JSON-serializable object; the caller in this file
            passes the list of comment dicts returned by ``parse_talkpage``.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``array_of_dicts`` contains a value ``json.dumps``
            cannot serialize.
        OSError: If ``filename`` cannot be opened for writing.
    """
    # Serialize before opening the file so that a serialization failure
    # does not truncate an existing output file.
    json_ = json.dumps(array_of_dicts)
    # Pin the encoding: without it the bytes written depend on the
    # platform's locale, and JSON files are expected to be UTF-8.
    with open(filename, 'w', encoding='utf-8') as json_file:
        json_file.write(json_)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Switch to this script's directory so the relative dump path below
    # resolves no matter where the script is launched from.
    file_directory = os.path.dirname(os.path.abspath(__file__))
    os.chdir(file_directory)
    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
    # Parse once and keep the result; a second bare call to
    # parse_talkpage() whose return value is discarded only repeats work.
    json_discussion = parse_talkpage(wikitext)
    json_it(json_discussion, "test.json")
|
Loading…
Reference in New Issue
Block a user