first draft of conversation parsing for wiki talk pages
This commit is contained in:
parent
aa30d02c48
commit
b57e4c15a3
6
spec.txt
6
spec.txt
@ -7,7 +7,7 @@
|
|||||||
[ ] page edit histories
|
[ ] page edit histories
|
||||||
[ ] who said it
|
[ ] who said it
|
||||||
[ ~ ] talk pages?
|
[ ~ ] talk pages?
|
||||||
[ ~ ] who said what and when
|
[ x ] who said what and when
|
||||||
[ ] in response to who
|
[ x ] in response to who
|
||||||
[ ] structure both of those in json files
|
[ ~ ] structure both of those in json files
|
||||||
[ ] construct folders of the tagged json files
|
[ ] construct folders of the tagged json files
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
|
import uuid
|
||||||
|
import json
|
||||||
|
|
||||||
import mwparserfromhell
|
import mwparserfromhell
|
||||||
|
|
||||||
@ -18,6 +20,8 @@ def parse_talkpage(wikitext):
|
|||||||
current_header = ""
|
current_header = ""
|
||||||
subheader = ""
|
subheader = ""
|
||||||
comment_dict = {}
|
comment_dict = {}
|
||||||
|
thread_array = []
|
||||||
|
discussion_array = []
|
||||||
for cell in arraytext:
|
for cell in arraytext:
|
||||||
if re.search(r"^==.*?==$", cell):
|
if re.search(r"^==.*?==$", cell):
|
||||||
current_title = cell
|
current_title = cell
|
||||||
@ -29,26 +33,55 @@ def parse_talkpage(wikitext):
|
|||||||
subheader = cell
|
subheader = cell
|
||||||
continue
|
continue
|
||||||
comment += cell
|
comment += cell
|
||||||
match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell)
|
match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell)
|
||||||
|
#match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell)
|
||||||
if match:
|
if match:
|
||||||
|
comment_id = uuid.uuid4()
|
||||||
user = match.group()
|
user = match.group()
|
||||||
split_comment = comment.split(user)
|
split_comment = comment.split(user)
|
||||||
|
comment_dict['id'] = str(comment_id)
|
||||||
comment_dict['text'] = split_comment[0]
|
comment_dict['text'] = split_comment[0]
|
||||||
comment_dict['title'] = current_title
|
comment_dict['title'] = current_title
|
||||||
comment_dict['header'] = current_header
|
comment_dict['header'] = current_header
|
||||||
comment_dict['subheader'] = subheader
|
comment_dict['subheader'] = subheader
|
||||||
comment_dict['author_talk'] = user
|
comment_dict['author'] = user.split("|")[0][12:]
|
||||||
|
# doing stuff to figure out replies
|
||||||
|
if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'):
|
||||||
|
comment_dict['thread'] = []
|
||||||
|
thread_array = [comment_dict['id']]
|
||||||
|
else:
|
||||||
|
comment_ = comment_dict['text']
|
||||||
|
level = 0
|
||||||
|
while comment_.startswith('*') or comment_.startswith(':'):
|
||||||
|
level += 1
|
||||||
|
comment_ = comment_[1:]
|
||||||
|
thread_array = thread_array[:level]
|
||||||
|
comment_dict['thread'] = thread_array
|
||||||
|
thread_array.append(comment_dict['id'])
|
||||||
|
# doing stuff to get the timestamp
|
||||||
string_time = split_comment[-1].split(" ")[-5:]
|
string_time = split_comment[-1].split(" ")[-5:]
|
||||||
if string_time[-1] == "":
|
if string_time[-1] == "":
|
||||||
string_time = split_comment[-1].split(" ")[-6:]
|
string_time = split_comment[-1].split(" ")[-6:]
|
||||||
|
string_time[0] = string_time[0][-6:]
|
||||||
if string_time[-1] == "(UTC)":
|
if string_time[-1] == "(UTC)":
|
||||||
comment_dict['draft_time'] = string_time
|
comment_dict['time'] = " ".join(string_time)
|
||||||
#comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
|
#comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)")
|
||||||
#print(comment_dict)
|
#print(comment_dict)
|
||||||
|
discussion_array.append(comment_dict)
|
||||||
comment = ""
|
comment = ""
|
||||||
|
comment_dict = {}
|
||||||
|
return discussion_array
|
||||||
|
|
||||||
|
def json_it(array_of_dicts, filename):
    """Serialize *array_of_dicts* to JSON and write it to *filename*.

    Args:
        array_of_dicts: A JSON-serializable object; the caller in this file
            passes the list of comment dicts returned by ``parse_talkpage``.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If the data contains a value ``json`` cannot serialize.
        OSError: If *filename* cannot be opened for writing.
    """
    # Serialize before opening the file so that a serialization error does
    # not truncate an existing output file.
    serialized = json.dumps(array_of_dicts)
    # Explicit encoding keeps the output independent of the platform's
    # default locale encoding.
    with open(filename, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_directory = os.path.dirname(os.path.abspath(__file__))
|
file_directory = os.path.dirname(os.path.abspath(__file__))
|
||||||
os.chdir(file_directory)
|
os.chdir(file_directory)
|
||||||
wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
|
wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
|
||||||
parse_talkpage(wikitext)
|
json_discussion = parse_talkpage(wikitext)
|
||||||
|
json_it(json_discussion, "test.json")
|
Loading…
Reference in New Issue
Block a user