first parse of tech news files
This commit is contained in:
parent
9510f65255
commit
ab771e25b7
@ -77,11 +77,51 @@ def json_it(array_of_dicts, filename):
|
||||
with open(filename, 'w') as json_file:
|
||||
json_file.write(json_)
|
||||
|
||||
|
||||
def parse_tech_news(wikitext):
    """Split a Tech News wikitext dump into one record per message.

    The text is walked line by line. Lines are grouped into sections: a
    line consisting entirely of a bold run (``'''Title'''``) opens a new
    section named after the text between the apostrophes, and lines seen
    before the first such line go into the ``"header"`` section. A line
    containing both ``<!--`` and ``-->`` closes the current message.

    Args:
        wikitext: The wikitext to parse; it is passed to
            ``mwparserfromhell.parse``.

    Returns:
        A list with one dict per closed message, each holding:

        * ``'issue'`` -- the last ``== ... ==`` heading line seen in the
          message (the key is absent when the message had no heading);
        * ``'raw'`` -- every line of the message concatenated together;
        * ``'structured text'`` -- a dict mapping section name to the
          list of lines in that section.

        Lines after the last closing comment line are not returned.
    """
    # Compile once instead of re-resolving the patterns for every line.
    heading_pattern = re.compile(r"^==.*?==$")
    section_pattern = re.compile(r"^'''.*?'''$")

    wikicode = mwparserfromhell.parse(wikitext)
    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_parts = []
    for cell in wikicode.split('\n'):
        # NOTE(review): lines are joined without a separator, so 'raw'
        # carries no line breaks -- confirm that is what consumers expect.
        raw_parts.append(cell)
        if heading_pattern.search(cell):
            #issue = cell.split("Tech News: ")[1]
            comment_dict['issue'] = cell
        if section_pattern.search(cell):
            # Drop all three apostrophes on each side; slicing from index 2
            # left a stray leading "'" in every section name.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            continue
        text_dict[current_section].append(cell)
        if "<!--" in cell and "-->" in cell:
            # A one-line HTML comment ends the message: store it and start
            # a fresh record for whatever follows.
            comment_dict['raw'] = "".join(raw_parts)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)
            raw_parts = []
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}
    return message_array
|
||||
|
||||
if __name__ == "__main__":
    # Run from the script's own directory so the relative dump path below
    # resolves the same way regardless of where the script is launched.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(script_dir)

    wikitext = read_file('../../wikitext-dump/ve-rfc-dump.txt')
    json_discussion = parse_talkpage(wikitext)

    # Parse every file in the raw tech-news directory and write one JSON
    # file per input, named after the part of the file name before the
    # first dot.
    dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
    files = os.listdir(dir_path)
    print(files)
    for file in files:
        file_wikitext = read_file(f"{dir_path}/{file}")
        json_discussion = parse_tech_news(file_wikitext)
        stem = file.split(".")[0]
        json_it(
            json_discussion,
            "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/" + stem + ".json",
        )

    # Disabled one-off runs, kept as an inert string literal.
    '''
    file_wikitext = read_file("/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news/tech-news-2022.txt")
    json_discussion = parse_tech_news(file_wikitext)
    print(json_discussion)
    json_it(json_discussion, "test.json")
    #json_discussion = parse_talkpage(file_wikitext)
    #json_it(json_discussion, "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/parsoid/parsoid-talk-archive-2.json")
    '''
|
||||
|
Loading…
Reference in New Issue
Block a user