"""Parse MediaWiki talk-page and Tech News wikitext into JSON-friendly data."""

import os
import re
import datetime
import uuid
import json

import mwparserfromhell
import mwchatter as mwc

# Heading patterns.  The level-2 pattern also matches level-3/4 headings
# (``===x===`` starts and ends with ``==``), so the deeper levels must always
# be tested before the shallower ones.
_SUBHEADER_RE = re.compile(r"^====.*?====$")
_HEADER_RE = re.compile(r"^===.*?===$")
_TITLE_RE = re.compile(r"^==.*?==$")

# A wiki link into the "User talk" or "WP" namespace; treated as the marker
# that a signed comment ends on the current line.
_SIGNATURE_LINK_RE = re.compile(r"\[\[(?:User talk|WP):[^\|]+\|[^\]]+\]\]")

# A line consisting solely of '''bold''' text (a Tech News section label).
_BOLD_LINE_RE = re.compile(r"^'''.*?'''$")


def read_file(filename):
    """Return the entire content of the text file *filename*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()


def parse_talkpage(wikitext):
    """Split talk-page wikitext into a list of comment dictionaries.

    The text is processed line by line.  Heading lines update the current
    title (``==``), header (``===``) and subheader (``====``); every other
    line is appended to the pending comment.  When a line contains a
    ``[[User talk:...|...]]`` or ``[[WP:...|...]]`` link, the pending comment
    is closed and emitted.

    Each emitted dictionary has the keys ``id`` (a fresh UUID4 string),
    ``text`` (everything before the first occurrence of the signature link),
    ``title``, ``header``, ``subheader``, ``author``, ``thread`` (ids of the
    ancestor comments, outermost first, derived from the number of leading
    ``*`` / ``:`` characters) and, only when the text after the signature
    link ends in ``(UTC)``, ``time`` (the raw timestamp string).

    Args:
        wikitext: Raw wikitext of the talk page.

    Returns:
        A list of comment dictionaries in document order.
    """
    wikicode = mwparserfromhell.parse(wikitext)
    lines = wikicode.split('\n')

    comment = ""
    current_title = ""
    current_header = ""
    subheader = ""
    thread_array = []
    discussion_array = []

    for cell in lines:
        # Deepest heading level first; see the note on the patterns above.
        if _SUBHEADER_RE.search(cell):
            subheader = cell
            continue
        if _HEADER_RE.search(cell):
            current_header = cell
            # A new header starts a new scope: drop the stale subheader.
            subheader = ""
            continue
        if _TITLE_RE.search(cell):
            current_title = cell
            # A new title starts a new scope: drop the stale sub-levels.
            current_header = ""
            subheader = ""
            continue

        # Lines are joined without a separator, so the newline is not kept.
        comment += cell
        match = _SIGNATURE_LINK_RE.search(cell)
        # A stricter full-signature pattern that was tried previously:
        # r"\[\[User:[^\|]+\|(?:<[^>]+>[^<]+<\/[^>]+>|[^]]+)\]\] \(\[\[[^\|]+\|(?:<[^>]+>\(.*?\) <\/[^>]+>|.*?)]\]\) \d{2}:\d{2}, \d{1,2} [A-Za-z]+ \d{4} \(UTC\)"
        if not match:
            continue

        user = match.group()
        split_comment = comment.split(user)
        text = split_comment[0]
        comment_id = str(uuid.uuid4())
        comment_dict = {
            'id': comment_id,
            'text': text,
            'title': current_title,
            'header': current_header,
            'subheader': subheader,
            # 12 == len("[[User talk:").
            # NOTE(review): for "[[WP:" links this slice cuts into the link
            # target, so the author is wrong for those - confirm intent.
            'author': user.split("|")[0][12:],
        }

        # Reply depth is the number of leading '*' / ':' indent markers.
        depth = len(text) - len(text.lstrip('*:'))
        if depth == 0:
            comment_dict['thread'] = []
            thread_array = [comment_id]
        else:
            thread_array = thread_array[:depth]
            # Store a copy: the running path is extended with this comment's
            # own id right below, and that must not leak into its ancestors.
            comment_dict['thread'] = list(thread_array)
            thread_array.append(comment_id)

        # The timestamp is expected as the last five space-separated tokens
        # after the signature link, e.g. "12:34, 5 January 2020 (UTC)".
        tail = split_comment[-1]
        string_time = tail.split(" ")[-5:]
        if string_time[-1] == "":
            # Trailing whitespace yields an empty last token; re-tokenise
            # without it so the "(UTC)" check below can still succeed.
            string_time = tail.rstrip().split(" ")[-5:]
            # Keep only the "HH:MM," part in case other text is glued on.
            string_time[0] = string_time[0][-6:]
        if string_time[-1] == "(UTC)":
            comment_dict['time'] = " ".join(string_time)

        discussion_array.append(comment_dict)
        comment = ""

    return discussion_array


def parse_talkpage2(wikitext):
    """Parse talk-page wikitext with ``mwchatter`` and return its result."""
    return mwc.parse(wikitext)


def json_it(array_of_dicts, filename):
    """Serialise *array_of_dicts* to JSON and write it to *filename*.

    The data is serialised before the file is opened, so a serialisation
    error does not create or truncate the output file.
    """
    json_ = json.dumps(array_of_dicts)
    with open(filename, 'w', encoding='utf-8') as json_file:
        json_file.write(json_)


def parse_tech_news(wikitext, end_marker=""):
    """Split Tech News wikitext into a list of message dictionaries.

    Lines are grouped into sections: a line made up only of ``'''bold'''``
    text starts a new section named after the bold text, and lines before
    the first such label go into the ``"header"`` section.  A ``==heading==``
    line is recorded as the message's ``issue`` (and is also kept as a
    regular line of the current section).

    A message is closed after any non-label line that contains *end_marker*.
    Each message has the keys ``issue`` (only if a heading was seen),
    ``raw`` (all of its lines concatenated without separators) and
    ``structured text`` (section name -> list of lines).

    Args:
        wikitext: Raw wikitext containing one or more Tech News messages.
        end_marker: Substring that marks the last line of a message.
            NOTE(review): the default ``""`` is contained in every string,
            so by default every non-label line closes a message.  This is
            the historical behaviour; presumably a real sentinel was meant,
            so pass it explicitly once it is confirmed.

    Returns:
        A list of message dictionaries in document order.
    """
    wikicode = mwparserfromhell.parse(wikitext)
    lines = wikicode.split('\n')

    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_parts = []

    for cell in lines:
        raw_parts.append(cell)
        if _TITLE_RE.search(cell):
            comment_dict['issue'] = cell
        if _BOLD_LINE_RE.search(cell):
            # Strip the three-apostrophe bold markers from both ends.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if end_marker in cell:
            comment_dict['raw'] = "".join(raw_parts)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            # Start collecting the next message from a clean state.
            raw_parts = []
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}

    return message_array


if __name__ == "__main__":
    # Earlier batch run over a directory of raw dumps, kept for reference:
    # dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    # files = os.listdir(dir_path)
    # file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    # json_discussion = parse_talkpage2(file_wikitext)
    # for file in files:
    #     print(file)
    #     file_wikitext = read_file(dir_path + "/" + file)
    #     json_discussion = parse_talkpage2(file_wikitext)
    #     json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
    file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt")
    # Assumes the mwchatter result is JSON-serialisable - TODO confirm.
    json_discussion = parse_talkpage2(file_wikitext)
    json_it(json_discussion, "test.json")
    # json_discussion = parse_talkpage(file_wikitext)
    # json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")