# Post-patch code from the commit "first parse of tech news files"
# (src/lib/wiki_get.py). It relies on names provided by the rest of that
# module: the `os`, `re` and `mwparserfromhell` imports and the `read_file`
# and `json_it` helpers.


def parse_tech_news(wikitext, end_marker=""):
    """Split a Tech News wikitext dump into a list of message dictionaries.

    The text is walked line by line. Within a message, a line of the form
    ``== ... ==`` is recorded as the message's ``'issue'``, and a line of the
    form ``'''Title'''`` opens a new section named ``Title``. All other lines
    (including the ``== ... ==`` line) are appended to the section that is
    currently open; lines seen before any section title go to ``"header"``.

    Args:
        wikitext: The wikitext to parse; passed to ``mwparserfromhell.parse``.
        end_marker: Substring identifying the last line of a message. A
            message is closed when a non-section-title line contains it.
            NOTE(review): the code as committed tests ``"" in cell``, which is
            true for every line, so with the default value each
            non-section-title line is emitted as its own message. The literal
            looks like it lost a markup delimiter somewhere; pass the intended
            delimiter here once it is confirmed.

    Returns:
        A list of dicts, one per closed message, each with:
            ``'issue'``            the ``== ... ==`` line, if the message had one,
            ``'raw'``              the message's lines joined with newlines,
            ``'structured text'``  a dict mapping section name to its lines.
        Lines after the last ``end_marker`` are not returned.
    """
    lines = mwparserfromhell.parse(wikitext).split('\n')
    # Compiled once instead of being looked up again for every line.
    issue_heading = re.compile(r"^==.*?==$")
    section_title = re.compile(r"^'''.*?'''$")

    messages = []
    message = {}
    sections = {"header": []}
    current_section = "header"
    raw_lines = []

    for line in lines:
        raw_lines.append(line)
        if issue_heading.search(line):
            message['issue'] = line
        if section_title.search(line):
            # Drop all three apostrophes on each side; the previous slice
            # [2:-3] left one leading apostrophe in the section name.
            current_section = line[3:-3]
            sections[current_section] = []
            continue
        sections[current_section].append(line)
        if end_marker in line:
            # Re-insert the newlines removed by split() so 'raw' does not
            # run the end of one line into the start of the next.
            message['raw'] = "\n".join(raw_lines)
            message['structured text'] = sections
            messages.append(message)
            # Start a fresh message.
            message = {}
            sections = {"header": []}
            current_section = "header"
            raw_lines = []
    return messages


if __name__ == "__main__":
    raw_dir = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
    out_dir = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core"

    files = os.listdir(raw_dir)
    print(files)
    for filename in files:
        source_path = os.path.join(raw_dir, filename)
        if not os.path.isfile(source_path):
            # Sub-directories cannot be read as a dump; skip them.
            continue
        # TODO(review): pass the real end-of-message delimiter as end_marker.
        json_discussion = parse_tech_news(read_file(source_path))
        # splitext keeps dotted stems ("a.b.txt" -> "a.b") so two inputs
        # cannot collapse onto the same output name.
        stem = os.path.splitext(filename)[0]
        json_it(json_discussion, os.path.join(out_dir, stem + ".json"))