136 lines
5.1 KiB
Python
136 lines
5.1 KiB
Python
import os
|
|
import re
|
|
import datetime
|
|
import uuid
|
|
import json
|
|
|
|
import mwparserfromhell
|
|
|
|
import mwchatter as mwc
|
|
|
|
def read_file(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of the locale default, so non-ASCII
    # user names and comment text read the same on every platform.
    # NOTE(review): assumes the raw dumps are UTF-8 -- confirm against the
    # scraper that produced them.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()
|
|
|
|
def parse_talkpage(wikitext):
    """Split talk-page wikitext into one dict per signed comment.

    The text is walked line by line. Heading lines update the current
    section context; every other line is accumulated until a line containing
    a ``[[User talk:...|...]]`` or ``[[WP:...|...]]`` link is seen, which is
    treated as the signature closing the comment.

    Args:
        wikitext: Raw wikitext of a talk page.

    Returns:
        A list of dicts, in page order, each with the keys:
          * ``id``: a fresh UUID4 string.
          * ``text``: accumulated text before the signature link (lines are
            concatenated without a separator).
          * ``title`` / ``header`` / ``subheader``: the most recent level-2 /
            level-3 / level-4 heading line, or ``""`` when none applies.
          * ``author``: the page name inside the signature link.
          * ``thread``: ids of the ancestor comments, outermost first
            (empty for a comment that does not start with ``*`` or ``:``).
          * ``time``: present only when the text after the signature ends
            in ``(UTC)``; the last five space-separated tokens joined.
    """
    # Tested deepest-first: the level-2 pattern also matches "=== x ===" and
    # "==== x ====", so checking it first would swallow every nested heading.
    subheader_re = re.compile(r"^====.*?====$")
    header_re = re.compile(r"^===.*?===$")
    title_re = re.compile(r"^==.*?==$")
    # Group 1 is the page name after the namespace prefix, so the author is
    # extracted correctly for both "User talk:" and "WP:" links.
    signature_re = re.compile(r"\[\[(?:User talk|WP):([^\|]+)\|[^\]]+\]\]")

    current_title = ""
    current_header = ""
    subheader = ""
    pending = []          # lines of the comment currently being collected
    thread_array = []     # ids on the path from the thread root to the last comment
    discussion_array = []

    for cell in mwparserfromhell.parse(wikitext).split('\n'):
        if subheader_re.search(cell):
            subheader = cell
            continue
        if header_re.search(cell):
            current_header = cell
            # A new section invalidates the headings nested under the old one.
            subheader = ""
            continue
        if title_re.search(cell):
            current_title = cell
            current_header = ""
            subheader = ""
            continue

        pending.append(cell)
        match = signature_re.search(cell)
        if not match:
            continue

        user = match.group()
        split_comment = "".join(pending).split(user)
        pending = []

        comment_dict = {
            'id': str(uuid.uuid4()),
            'text': split_comment[0],
            'title': current_title,
            'header': current_header,
            'subheader': subheader,
            'author': match.group(1),
        }

        # Reply depth is the number of leading '*' / ':' indent markers.
        text = comment_dict['text']
        level = len(text) - len(text.lstrip('*:'))
        if level == 0:
            comment_dict['thread'] = []
            thread_array = [comment_dict['id']]
        else:
            thread_array = thread_array[:level]
            # Store a copy: appending this comment's own id below must not
            # leak into its recorded list of ancestors.
            comment_dict['thread'] = list(thread_array)
            thread_array.append(comment_dict['id'])

        # The timestamp is expected as the last five tokens after the
        # signature, e.g. "12:34, 5 January 2020 (UTC)". Trailing whitespace
        # is stripped first so it cannot push "(UTC)" out of last position.
        string_time = split_comment[-1].rstrip().split(" ")[-5:]
        # Keep only "HH:MM," in case the first token has text glued to it.
        string_time[0] = string_time[0][-6:]
        if string_time[-1] == "(UTC)":
            # Stored as text (not a datetime), which keeps the dict directly
            # serialisable by json.dumps.
            comment_dict['time'] = " ".join(string_time)

        discussion_array.append(comment_dict)

    return discussion_array
|
|
|
|
def parse_talkpage2(wikitext):
    """Parse talk-page wikitext with mwchatter.

    Args:
        wikitext: Raw wikitext of a talk page.

    Returns:
        Whatever ``mwc.parse`` returns for the given text, unmodified.
    """
    return mwc.parse(wikitext)
|
|
|
|
def json_it(array_of_dicts, filename):
    """Serialise *array_of_dicts* to JSON and write it to *filename*.

    The payload is encoded in full before the file is opened, so a
    serialisation error is raised before the target file is touched.

    Args:
        array_of_dicts: JSON-serialisable object to dump.
        filename: Path of the output file; it is overwritten if it exists.
    """
    payload = json.dumps(array_of_dicts)
    with open(filename, 'w') as sink:
        sink.write(payload)
|
|
|
|
def parse_tech_news(wikitext):
    """Split newsletter wikitext into one dict per message.

    Lines are accumulated until a line containing both ``<!--`` and ``-->``
    is seen, which closes the current message. Lines after the last such
    marker are not returned.

    Args:
        wikitext: Raw wikitext containing one or more messages.

    Returns:
        A list of dicts, in page order, each with the keys:
          * ``issue``: the last ``== ... ==`` heading line seen in the
            message (absent if the message has none).
          * ``raw``: all lines of the message concatenated without a
            separator.
          * ``structured text``: dict mapping a section name to its list of
            lines. Lines before the first bold heading go under
            ``"header"``; a line that is entirely ``'''Name'''`` starts
            section ``Name`` and is not itself stored.
    """
    heading_re = re.compile(r"^==.*?==$")
    bold_re = re.compile(r"^'''.*?'''$")

    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_parts = []

    for cell in mwparserfromhell.parse(wikitext).split('\n'):
        raw_parts.append(cell)

        if heading_re.search(cell):
            comment_dict['issue'] = cell

        if bold_re.search(cell):
            # Bold markup is three apostrophes on each side; strip all three
            # so the section key does not keep a stray leading quote.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            continue

        text_dict[current_section].append(cell)

        # A line containing an HTML comment terminates the message.
        if "<!--" in cell and "-->" in cell:
            comment_dict['raw'] = "".join(raw_parts)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)

            # Start a fresh accumulator for the next message.
            raw_parts = []
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}

    return message_array
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse one raw talk-page dump with mwchatter and
    # write the result to test.json in the working directory.

    # Disabled single-file / batch runs over the ve-rfcs directory:
    #dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    #ve-rfcs
    #files = os.listdir(dir_path)
    #print(files)
    #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    #json_discussion = parse_talkpage2(file_wikitext)
    '''
    for file in files:
        print(file)
        file_wikitext = read_file(dir_path + "/" + file)
        json_discussion = parse_talkpage2(file_wikitext)
        json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
    '''
    raw_talk_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt"
    talk_wikitext = read_file(raw_talk_path)
    parsed_discussion = parse_talkpage2(talk_wikitext)
    json_it(parsed_discussion, "test.json")

    # Disabled alternative using the hand-written parser:
    #json_discussion = parse_talkpage(file_wikitext)
    #json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")
|
|
|