1
0

updating convo parser selection

This commit is contained in:
Matthew Gaughan 2025-01-27 11:53:59 -06:00
parent e0874fe5c8
commit 6962ad3e45
3 changed files with 15 additions and 9 deletions

View File

@@ -1,3 +0,0 @@
import wget
import os
import sys

View File

@@ -24,9 +24,9 @@ repos = {
''' '''
repos = { repos = {
"visualeditor": { "visualeditor": {
"url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor", "url": "https://gerrit.wikimedia.org/r/pywikibot/core",
"from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst), "from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
"to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst) "to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
} }
} }
for repo in repos.keys(): for repo in repos.keys():

View File

@@ -6,6 +6,8 @@ import json
import mwparserfromhell import mwparserfromhell
import mwchatter as mwc
def read_file(filename): def read_file(filename):
with open(filename, 'r') as file: with open(filename, 'r') as file:
file_content = file.read() file_content = file.read()
@@ -72,6 +74,10 @@ def parse_talkpage(wikitext):
comment_dict = {} comment_dict = {}
return discussion_array return discussion_array
def parse_talkpage2(wikitext):
parsed_text = mwc.parse(wikitext)
return parsed_text
def json_it(array_of_dicts, filename): def json_it(array_of_dicts, filename):
json_ = json.dumps(array_of_dicts) json_ = json.dumps(array_of_dicts)
with open(filename, 'w') as json_file: with open(filename, 'w') as json_file:
@@ -109,13 +115,16 @@ def parse_tech_news(wikitext):
if __name__ == "__main__": if __name__ == "__main__":
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news" dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard"
files = os.listdir(dir_path) files = os.listdir(dir_path)
print(files) print(files)
#file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
#json_discussion = parse_talkpage2(file_wikitext)
for file in files: for file in files:
print(file)
file_wikitext = read_file(dir_path + "/" + file) file_wikitext = read_file(dir_path + "/" + file)
json_discussion = parse_tech_news(file_wikitext) json_discussion = parse_talkpage2(file_wikitext)
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + file.split(".")[0] + ".json") json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json")
''' '''
file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt") file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
json_discussion = parse_tech_news(file_wikitext) json_discussion = parse_tech_news(file_wikitext)