1
0
# mw-convo-collections/src/lib/wiki_get.py
#
# 136 lines
# 5.1 KiB
# Python

import os
import re
import datetime
import uuid
import json
import mwparserfromhell
import mwchatter as mwc
def read_file(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()
# Wikilink that marks the end of a comment, e.g. "[[User talk:Name|talk]]".
# Group 1 captures the page name that follows the namespace prefix.
_SIGNATURE_LINK_RE = re.compile(r"\[\[(?:User talk|WP):([^\|]+)\|[^\]]+\]\]")


def parse_talkpage(wikitext):
    """Split talk-page wikitext into a list of per-comment dictionaries.

    The text is processed line by line. Heading lines update the current
    title/header/subheader; every other line is appended to a buffer until a
    line containing a ``[[User talk:...|...]]`` or ``[[WP:...|...]]`` link is
    seen, at which point the buffer is emitted as one comment.

    Args:
        wikitext: The wikitext to parse (anything accepted by
            ``mwparserfromhell.parse``).

    Returns:
        A list of dicts, one per detected comment, with the keys:
        ``id`` (random UUID string), ``text`` (buffer content before the
        matched link), ``title``/``header``/``subheader`` (the raw heading
        lines in effect, or ``""``), ``author`` (page name from the matched
        link), ``thread`` (ids of the enclosing comments, outermost first)
        and, when the text after the link ends with a ``... (UTC)``
        timestamp, ``time``.
    """
    lines = mwparserfromhell.parse(wikitext).split('\n')

    current_title = ""
    current_header = ""
    current_subheader = ""
    buffered = ""
    # Ids of the most recent comment at each indentation depth.
    thread_array = []
    discussion_array = []

    for cell in lines:
        # Test the deepest heading level first: "^==.*?==$" also matches
        # "===x===" and "====x====", so checking it first would swallow them.
        if re.search(r"^====.*?====$", cell):
            current_subheader = cell
            continue
        if re.search(r"^===.*?===$", cell):
            current_header = cell
            current_subheader = ""
            continue
        if re.search(r"^==.*?==$", cell):
            current_title = cell
            # A new section must not inherit the previous section's
            # sub-headings.
            current_header = ""
            current_subheader = ""
            continue

        # Lines are concatenated without a separator until a signature link
        # closes the comment.
        buffered += cell
        match = _SIGNATURE_LINK_RE.search(cell)
        if not match:
            continue

        user = match.group()
        split_comment = buffered.split(user)
        text = split_comment[0]
        comment_dict = {
            'id': str(uuid.uuid4()),
            'text': text,
            'title': current_title,
            'header': current_header,
            'subheader': current_subheader,
            # Use the captured page name rather than a fixed-width slice, so
            # "[[WP:...]]" links are handled as well as "[[User talk:...]]".
            'author': match.group(1),
        }

        # The number of leading "*" / ":" markers is the reply depth; the
        # parents are the latest comments at each shallower depth.
        level = len(text) - len(text.lstrip('*:'))
        ancestors = thread_array[:level]
        # Store a list that is independent of the running stack so that the
        # comment's own id is never appended to its own thread.
        comment_dict['thread'] = ancestors
        thread_array = ancestors + [comment_dict['id']]

        # The timestamp is the last five whitespace-separated tokens after
        # the link ("HH:MM, D Month YYYY (UTC)"). Splitting on any whitespace
        # makes trailing blanks harmless.
        string_time = split_comment[-1].split()[-5:]
        if string_time and string_time[-1] == "(UTC)":
            # Keep only the "HH:MM," tail in case markup is glued to it.
            string_time[0] = string_time[0][-6:]
            comment_dict['time'] = " ".join(string_time)

        discussion_array.append(comment_dict)
        buffered = ""

    return discussion_array
def parse_talkpage2(wikitext):
    """Parse talk-page wikitext with ``mwchatter`` and return its result as-is."""
    return mwc.parse(wikitext)
def json_it(array_of_dicts, filename):
    """Write *array_of_dicts* to *filename* as a JSON document.

    Args:
        array_of_dicts: JSON-serializable object to store.
        filename: Path of the output file; it is created or overwritten.
    """
    # Serialize first: if the data cannot be encoded, the target file is
    # never opened.
    serialized = json.dumps(array_of_dicts)
    with open(filename, 'w') as out_handle:
        out_handle.write(serialized)
def parse_tech_news(wikitext):
    """Split newsletter-style wikitext into one record per message.

    The text is processed line by line. A message ends at the first line
    that contains both ``<!--`` and ``-->``; lines after the last such
    marker are discarded.

    Args:
        wikitext: The wikitext to parse (anything accepted by
            ``mwparserfromhell.parse``).

    Returns:
        A list of dicts, one per message, with the keys:
        ``issue`` (the last ``== ... ==`` heading line seen in the message,
        absent if there was none), ``raw`` (all lines of the message
        concatenated without separators) and ``structured text`` (a dict
        mapping each section name to its list of lines; lines before the
        first bold ``'''Section'''`` line are stored under ``"header"``).
    """
    lines = mwparserfromhell.parse(wikitext).split('\n')

    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_parts = []

    for cell in lines:
        raw_parts.append(cell)
        if re.search(r"^==.*?==$", cell):
            comment_dict['issue'] = cell
        if re.search(r"^'''.*?'''$", cell):
            # Strip all three apostrophes on each side of the bold marker
            # so the section name carries no stray quote.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if "<!--" in cell and "-->" in cell:
            # The HTML comment line closes the message; emit it and start
            # collecting the next one from a clean state.
            comment_dict['raw'] = "".join(raw_parts)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            raw_parts = []
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}

    return message_array
if __name__ == "__main__":
    # Ad-hoc driver: parse one raw talk-page dump with parse_talkpage2 and
    # write the result to "test.json". The commented lines and the inert
    # string literal below are earlier batch / single-file variants.
    #dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    #ve-rfcs
    #files = os.listdir(dir_path)
    #print(files)
    #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    #json_discussion = parse_talkpage2(file_wikitext)
    '''
for file in files:
print(file)
file_wikitext = read_file(dir_path + "/" + file)
json_discussion = parse_talkpage2(file_wikitext)
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
'''
    raw_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt")
    parsed_discussion = parse_talkpage2(raw_wikitext)
    json_it(parsed_discussion, "test.json")
    #json_discussion = parse_talkpage(file_wikitext)
    #json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")