updating convo parser selection
This commit is contained in:
parent
e0874fe5c8
commit
6962ad3e45
@ -1,3 +0,0 @@
|
||||
import wget
|
||||
import os
|
||||
import sys
|
@ -24,9 +24,9 @@ repos = {
|
||||
'''
|
||||
repos = {
|
||||
"visualeditor": {
|
||||
"url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
|
||||
"from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
|
||||
"to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
|
||||
"url": "https://gerrit.wikimedia.org/r/pywikibot/core",
|
||||
"from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
|
||||
"to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
|
||||
}
|
||||
}
|
||||
for repo in repos.keys():
|
||||
|
@ -6,6 +6,8 @@ import json
|
||||
|
||||
import mwparserfromhell
|
||||
|
||||
import mwchatter as mwc
|
||||
|
||||
def read_file(filename):
|
||||
with open(filename, 'r') as file:
|
||||
file_content = file.read()
|
||||
@ -72,6 +74,10 @@ def parse_talkpage(wikitext):
|
||||
comment_dict = {}
|
||||
return discussion_array
|
||||
|
||||
def parse_talkpage2(wikitext):
|
||||
parsed_text = mwc.parse(wikitext)
|
||||
return parsed_text
|
||||
|
||||
def json_it(array_of_dicts, filename):
|
||||
json_ = json.dumps(array_of_dicts)
|
||||
with open(filename, 'w') as json_file:
|
||||
@ -109,13 +115,16 @@ def parse_tech_news(wikitext):
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
|
||||
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard"
|
||||
files = os.listdir(dir_path)
|
||||
print(files)
|
||||
#file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
|
||||
#json_discussion = parse_talkpage2(file_wikitext)
|
||||
for file in files:
|
||||
print(file)
|
||||
file_wikitext = read_file(dir_path + "/" + file)
|
||||
json_discussion = parse_tech_news(file_wikitext)
|
||||
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + file.split(".")[0] + ".json")
|
||||
json_discussion = parse_talkpage2(file_wikitext)
|
||||
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json")
|
||||
'''
|
||||
file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
|
||||
json_discussion = parse_tech_news(file_wikitext)
|
||||
|
Loading…
Reference in New Issue
Block a user