1
0

updating convo parser selection

This commit is contained in:
Matthew Gaughan 2025-01-27 11:53:59 -06:00
parent e0874fe5c8
commit 6962ad3e45
3 changed files with 15 additions and 9 deletions

View File

@ -1,3 +0,0 @@
import wget
import os
import sys

View File

@ -24,9 +24,9 @@ repos = {
'''
repos = {
"visualeditor": {
"url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
"from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
"to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
"url": "https://gerrit.wikimedia.org/r/pywikibot/core",
"from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
"to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
}
}
for repo in repos.keys():

View File

@ -6,6 +6,8 @@ import json
import mwparserfromhell
import mwchatter as mwc
def read_file(filename):
with open(filename, 'r') as file:
file_content = file.read()
@ -72,6 +74,10 @@ def parse_talkpage(wikitext):
comment_dict = {}
return discussion_array
def parse_talkpage2(wikitext):
    """Parse talk-page wikitext with mwchatter.

    Passes ``wikitext`` straight to ``mwc.parse`` and returns whatever
    that call produces, without any post-processing.
    """
    return mwc.parse(wikitext)
def json_it(array_of_dicts, filename):
json_ = json.dumps(array_of_dicts)
with open(filename, 'w') as json_file:
@ -109,13 +115,16 @@ def parse_tech_news(wikitext):
if __name__ == "__main__":
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard"
files = os.listdir(dir_path)
print(files)
#file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
#json_discussion = parse_talkpage2(file_wikitext)
for file in files:
print(file)
file_wikitext = read_file(dir_path + "/" + file)
json_discussion = parse_tech_news(file_wikitext)
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + file.split(".")[0] + ".json")
json_discussion = parse_talkpage2(file_wikitext)
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json")
'''
file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
json_discussion = parse_tech_news(file_wikitext)