diff --git a/spec.txt b/spec.txt index 1c49fc9..c7ecd4a 100644 --- a/spec.txt +++ b/spec.txt @@ -7,7 +7,7 @@ [ ] page edit histories [ ] who said it [ ~ ] talk pages? - [ ~ ] who said what and when - [ ] in response to who -[ ] structure both of those in json files + [ x ] who said what and when + [ x ] in response to who +[ ~ ] structure both of those in json files [ ] construct folders of the tagged json files diff --git a/src/lib/wiki_get.py b/src/lib/wiki_get.py index ada5ee5..6f9d7f6 100644 --- a/src/lib/wiki_get.py +++ b/src/lib/wiki_get.py @@ -1,6 +1,8 @@ import os import re import datetime +import uuid +import json import mwparserfromhell @@ -18,6 +20,8 @@ def parse_talkpage(wikitext): current_header = "" subheader = "" comment_dict = {} + thread_array = [] + discussion_array = [] for cell in arraytext: if re.search(r"^==.*?==$", cell): current_title = cell @@ -29,26 +33,55 @@ def parse_talkpage(wikitext): subheader = cell continue comment += cell - match = re.search(r"\[\[User talk:[^\|]+\|[^\]]+\]\]", cell) + match = re.search(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]", cell) + #match = re.search(r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)", cell) if match: + comment_id = uuid.uuid4() user = match.group() split_comment = comment.split(user) + comment_dict['id'] = str(comment_id) comment_dict['text'] = split_comment[0] comment_dict['title'] = current_title comment_dict['header'] = current_header comment_dict['subheader'] = subheader - comment_dict['author_talk'] = user + comment_dict['author'] = user.split("|")[0][12:] + # doing stuff to figure out replies + if not comment_dict['text'].startswith('*') and not comment_dict['text'].startswith(':'): + comment_dict['thread'] = [] + thread_array = [comment_dict['id']] + else: + comment_ = comment_dict['text'] + level = 0 + while comment_.startswith('*') or comment_.startswith(':'): + level += 1 + comment_ = comment_[1:] + thread_array = thread_array[:level] + comment_dict['thread'] = thread_array + thread_array.append(comment_dict['id']) + # doing stuff to get the timestamp string_time = split_comment[-1].split(" ")[-5:] if string_time[-1] == "": string_time = split_comment[-1].split(" ")[-6:] + string_time[0] = string_time[0][-6:] if string_time[-1] == "(UTC)": - comment_dict['draft_time'] = string_time + comment_dict['time'] = " ".join(string_time) #comment_dict['time'] = datetime.datetime.strptime(" ".join(string_time), "%H:%M, %d %B %Y (UTC)") #print(comment_dict) + discussion_array.append(comment_dict) comment = "" + comment_dict = {} + return discussion_array + +def json_it(array_of_dicts, filename): + json_ = json.dumps(array_of_dicts) + with open(filename, 'w') as json_file: + json_file.write(json_) + + if __name__ == "__main__": file_directory = os.path.dirname(os.path.abspath(__file__)) os.chdir(file_directory) wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt') - parse_talkpage(wikitext) \ No newline at end of file + json_discussion = parse_talkpage(wikitext) + json_it(json_discussion, "test.json") \ No newline at end of file