From 6962ad3e45d86b4c3464fbec27dda3cae29e221f Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Mon, 27 Jan 2025 11:53:59 -0600 Subject: [PATCH] updating convo parser selection --- src/get_dumps.py | 3 --- src/helper_scripts/repo_collection.py | 6 +++--- src/lib/wiki_get.py | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 9 deletions(-) delete mode 100644 src/get_dumps.py diff --git a/src/get_dumps.py b/src/get_dumps.py deleted file mode 100644 index 20775e7..0000000 --- a/src/get_dumps.py +++ /dev/null @@ -1,3 +0,0 @@ -import wget -import os -import sys diff --git a/src/helper_scripts/repo_collection.py b/src/helper_scripts/repo_collection.py index e842c40..72a7e4d 100644 --- a/src/helper_scripts/repo_collection.py +++ b/src/helper_scripts/repo_collection.py @@ -24,9 +24,9 @@ repos = { ''' repos = { "visualeditor": { - "url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor", - "from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst), - "to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst) + "url": "https://gerrit.wikimedia.org/r/pywikibot/core", + "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst), + "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst) } } for repo in repos.keys(): diff --git a/src/lib/wiki_get.py b/src/lib/wiki_get.py index cf7f2ce..e6ddd9e 100644 --- a/src/lib/wiki_get.py +++ b/src/lib/wiki_get.py @@ -6,6 +6,8 @@ import json import mwparserfromhell +import mwchatter as mwc + def read_file(filename): with open(filename, 'r') as file: file_content = file.read() @@ -72,6 +74,10 @@ def parse_talkpage(wikitext): comment_dict = {} return discussion_array +def parse_talkpage2(wikitext): + parsed_text = mwc.parse(wikitext) + return parsed_text + def json_it(array_of_dicts, filename): json_ = json.dumps(array_of_dicts) with open(filename, 'w') as json_file: @@ -109,13 +115,16 @@ def parse_tech_news(wikitext): if __name__ == "__main__": - dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news" + dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard" files = os.listdir(dir_path) print(files) + #file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt') + #json_discussion = parse_talkpage2(file_wikitext) for file in files: + print(file) file_wikitext = read_file(dir_path + "/" + file) - json_discussion = parse_tech_news(file_wikitext) - json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + file.split(".")[0] + ".json") + json_discussion = parse_talkpage2(file_wikitext) + json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json") ''' file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt") json_discussion = parse_tech_news(file_wikitext)