# mw-convo-collections/src/lib/wiki_get.py

import os
import re
import datetime
import uuid
import json

import mwparserfromhell

import mwchatter as mwc

def read_file(filename, encoding="utf-8"):
    """Return the entire text content of *filename*.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in *encoding*.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

# Heading patterns. They are tested from most to least specific because the
# "==" pattern also matches "===" and "====" lines.
_SUBHEADER_RE = re.compile(r"^====.*?====$")
_HEADER_RE = re.compile(r"^===.*?===$")
_TITLE_RE = re.compile(r"^==.*?==$")
# A wiki link into the "User talk" or "WP" namespace; group 1 is the link
# target (the text between the namespace colon and the first pipe).
_SIGNATURE_RE = re.compile(r"\[\[(?:User talk|WP):([^\|]+)\|[^\]]+\]\]")


def _extract_timestamp(trailing_text):
    """Return the trailing "HH:MM, D Month YYYY (UTC)" string, or None.

    The last five space-separated tokens of *trailing_text* are taken as the
    timestamp. Trailing whitespace is stripped first so a line ending in a
    space is still recognised. None is returned when the final token is not
    "(UTC)".
    """
    tokens = trailing_text.rstrip().split(" ")[-5:]
    if tokens[-1] != "(UTC)":
        return None
    # Keep only the last six characters of the first token ("HH:MM,") to
    # drop anything glued to the front of the time, e.g. a closing paren.
    tokens[0] = tokens[0][-6:]
    return " ".join(tokens)


def parse_talkpage(wikitext):
    """Split talk-page wikitext into a list of comment dicts.

    The text is processed line by line. Heading lines update the current
    title (``==``), header (``===``) or subheader (``====``); a new title
    clears the header and subheader, and a new header clears the subheader.
    Every other line is appended (without a separator) to the comment being
    accumulated. A line containing a ``[[User talk:...|...]]`` or
    ``[[WP:...|...]]`` link is treated as the end of a comment.

    Args:
        wikitext: Raw wikitext of the talk page.

    Returns:
        A list of dicts, one per detected comment, with the keys:
        ``id`` (random UUID string), ``text`` (accumulated text before the
        signature link), ``title``, ``header``, ``subheader`` (raw heading
        lines, or ""), ``author`` (target of the signature link),
        ``thread`` (ids of the ancestor comments, outermost first; empty for
        a top-level comment) and, only when a trailing UTC timestamp was
        found, ``time`` (the timestamp as a string).
    """
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    comment = ""
    current_title = ""
    current_header = ""
    subheader = ""
    thread_array = []
    discussion_array = []
    for cell in arraytext:
        if _SUBHEADER_RE.search(cell):
            subheader = cell
            continue
        if _HEADER_RE.search(cell):
            current_header = cell
            subheader = ""
            continue
        if _TITLE_RE.search(cell):
            current_title = cell
            current_header = ""
            subheader = ""
            continue
        comment += cell
        match = _SIGNATURE_RE.search(cell)
        if not match:
            continue
        signature = match.group()
        split_comment = comment.split(signature)
        text = split_comment[0]
        comment_dict = {
            'id': str(uuid.uuid4()),
            'text': text,
            'title': current_title,
            'header': current_header,
            'subheader': subheader,
            'author': match.group(1),
        }
        # Reply depth is the number of leading "*" / ":" indent markers.
        level = len(text) - len(text.lstrip("*:"))
        if level == 0:
            # A top-level comment starts a new thread.
            comment_dict['thread'] = []
            thread_array = [comment_dict['id']]
        else:
            thread_array = thread_array[:level]
            # Store a copy so appending this comment's own id below does
            # not leak into its list of ancestors.
            comment_dict['thread'] = list(thread_array)
            thread_array.append(comment_dict['id'])
        # The timestamp is kept as a string, not parsed into a datetime.
        timestamp = _extract_timestamp(split_comment[-1])
        if timestamp is not None:
            comment_dict['time'] = timestamp
        discussion_array.append(comment_dict)
        comment = ""
    return discussion_array

def parse_talkpage2(wikitext):
    """Parse talk-page wikitext with mwchatter and return its result as-is."""
    return mwc.parse(wikitext)

def json_it(array_of_dicts, filename):
    """Serialise *array_of_dicts* to JSON and write it to *filename*.

    The data is serialised before the file is opened, so a serialisation
    error leaves any existing file untouched. An existing file is
    overwritten.
    """
    serialised = json.dumps(array_of_dicts)
    with open(filename, 'w') as out:
        out.write(serialised)

def parse_tech_news(wikitext):
    """Split a page of newsletter wikitext into structured messages.

    The text is processed line by line. A message ends at the first line
    that contains both ``<!--`` and ``-->``; lines after the last such
    marker are not returned.

    Within a message:
      * a ``== ... ==`` line is stored verbatim under ``issue`` (and is
        also kept in the current section's lines);
      * a line consisting solely of ``'''...'''`` starts a new section
        named by the text between the triple apostrophes;
      * every other line is appended to the current section, which is
        ``"header"`` until the first section line is seen.

    Args:
        wikitext: Raw wikitext containing one or more messages.

    Returns:
        A list of dicts, one per completed message, with the keys ``raw``
        (all lines of the message concatenated without separators),
        ``structured text`` (dict mapping section name to its list of
        lines) and, when a heading line was seen, ``issue``.
    """
    wikicode = mwparserfromhell.parse(wikitext)
    arraytext = wikicode.split('\n')
    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_parts = []
    for cell in arraytext:
        raw_parts.append(cell)
        if re.search(r"^==.*?==$", cell):
            comment_dict['issue'] = cell
        if re.search(r"^'''.*?'''$", cell):
            # Strip all three apostrophes on each side of the section name.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if "<!--" in cell and "-->" in cell:
            comment_dict['raw'] = "".join(raw_parts)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            # Reset the accumulators for the next message.
            raw_parts = []
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}
    return message_array

if __name__ == "__main__":
    # Ad-hoc driver: parse a single raw talk-page dump with mwchatter and
    # write the result to test.json in the current working directory.
    #
    # Previous runs, kept for reference:
    #
    #   Batch conversion of the ve-rfcs dumps:
    #     dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/ve-rfcs"
    #     files = os.listdir(dir_path)
    #     for file in files:
    #         print(file)
    #         file_wikitext = read_file(dir_path + "/" + file)
    #         json_discussion = parse_talkpage2(file_wikitext)
    #         json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/visualeditor/" + file.split(".")[0] + ".json")
    #
    #   Single file from that directory:
    #     file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
    #
    #   Regex-based parser instead of mwchatter:
    #     json_discussion = parse_talkpage(file_wikitext)
    #     json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")
    raw_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/https-talk-raw.txt"
    parsed_discussion = parse_talkpage2(read_file(raw_path))
    json_it(parsed_discussion, "test.json")