updating convo parser selection
This commit is contained in:
parent
e0874fe5c8
commit
6962ad3e45
@@ -1,3 +0,0 @@
|
|||||||
import wget
|
|
||||||
import os
|
|
||||||
import sys
|
|
@@ -24,9 +24,9 @@ repos = {
|
|||||||
'''
|
'''
|
||||||
repos = {
|
repos = {
|
||||||
"visualeditor": {
|
"visualeditor": {
|
||||||
"url": "https://gerrit.wikimedia.org/r/VisualEditor/VisualEditor",
|
"url": "https://gerrit.wikimedia.org/r/pywikibot/core",
|
||||||
"from_date": datetime.datetime(2012, 1, 1, 00, 00, 00, tzinfo=cst),
|
"from_date": datetime.datetime(2010, 1, 1, 00, 00, 00, tzinfo=cst),
|
||||||
"to_date": datetime.datetime(2014, 12, 31, 00, 00, 00, tzinfo=cst)
|
"to_date": datetime.datetime(2024, 12, 31, 00, 00, 00, tzinfo=cst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for repo in repos.keys():
|
for repo in repos.keys():
|
||||||
|
@@ -6,6 +6,8 @@ import json
|
|||||||
|
|
||||||
import mwparserfromhell
|
import mwparserfromhell
|
||||||
|
|
||||||
|
import mwchatter as mwc
|
||||||
|
|
||||||
def read_file(filename):
|
def read_file(filename):
|
||||||
with open(filename, 'r') as file:
|
with open(filename, 'r') as file:
|
||||||
file_content = file.read()
|
file_content = file.read()
|
||||||
@@ -72,6 +74,10 @@ def parse_talkpage(wikitext):
|
|||||||
comment_dict = {}
|
comment_dict = {}
|
||||||
return discussion_array
|
return discussion_array
|
||||||
|
|
||||||
|
def parse_talkpage2(wikitext):
|
||||||
|
parsed_text = mwc.parse(wikitext)
|
||||||
|
return parsed_text
|
||||||
|
|
||||||
def json_it(array_of_dicts, filename):
|
def json_it(array_of_dicts, filename):
|
||||||
json_ = json.dumps(array_of_dicts)
|
json_ = json.dumps(array_of_dicts)
|
||||||
with open(filename, 'w') as json_file:
|
with open(filename, 'w') as json_file:
|
||||||
@@ -109,13 +115,16 @@ def parse_tech_news(wikitext):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
|
dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/noticeboard"
|
||||||
files = os.listdir(dir_path)
|
files = os.listdir(dir_path)
|
||||||
print(files)
|
print(files)
|
||||||
|
#file_wikitext = read_file(dir_path + "/" + 'bnb-archive-8-raw.txt')
|
||||||
|
#json_discussion = parse_talkpage2(file_wikitext)
|
||||||
for file in files:
|
for file in files:
|
||||||
|
print(file)
|
||||||
file_wikitext = read_file(dir_path + "/" + file)
|
file_wikitext = read_file(dir_path + "/" + file)
|
||||||
json_discussion = parse_tech_news(file_wikitext)
|
json_discussion = parse_talkpage2(file_wikitext)
|
||||||
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + file.split(".")[0] + ".json")
|
json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/bnb-archives/" + file.split(".")[0][:-4] + ".json")
|
||||||
'''
|
'''
|
||||||
file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
|
file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
|
||||||
json_discussion = parse_tech_news(file_wikitext)
|
json_discussion = parse_tech_news(file_wikitext)
|
||||||
|
Loading…
Reference in New Issue
Block a user