1
0

first parse of tech news files

This commit is contained in:
Matthew Gaughan 2024-12-27 13:52:24 -06:00
parent 9510f65255
commit ab771e25b7

View File

@ -77,11 +77,51 @@ def json_it(array_of_dicts, filename):
with open(filename, 'w') as json_file: with open(filename, 'w') as json_file:
json_file.write(json_) json_file.write(json_)
def parse_tech_news(wikitext):
    """Split a Tech News wikitext dump into one dict per message.

    The text is processed line by line. A message is closed by the first
    line that contains both ``<!--`` and ``-->``; lines that follow the last
    such line are not returned.

    Args:
        wikitext: The dump contents as ``str``. ``bytes`` input is decoded
            as UTF-8 before processing.

    Returns:
        A list of dicts, one per closed message, with the keys:

        * ``'issue'`` -- the most recent line matching ``==...==`` inside the
          message, stored verbatim (only present if such a line was seen).
        * ``'raw'`` -- every line of the message joined with newlines.
        * ``'structured text'`` -- dict mapping a section name to the list of
          lines in that section. Lines before the first bold line are filed
          under ``'header'``; a line that is entirely ``'''bold'''`` opens a
          new section named after the text between the apostrophes.
    """
    if isinstance(wikitext, bytes):
        wikitext = wikitext.decode("utf-8")

    # Compiled once here instead of being looked up for every line.
    heading_re = re.compile(r"^==.*?==$")
    bold_re = re.compile(r"^'''.*?'''$")

    message_array = []
    comment_dict = {}
    current_section = "header"
    text_dict = {current_section: []}
    raw_lines = []

    # Plain line splitting is all this function needs, so no wikitext parser
    # is involved.
    for cell in wikitext.split("\n"):
        raw_lines.append(cell)

        if heading_re.search(cell):
            # NOTE: the whole heading line is kept, '=' markup included.
            comment_dict['issue'] = cell

        if bold_re.search(cell):
            # Drop the three apostrophes on each side of the section title.
            current_section = cell[3:-3]
            text_dict[current_section] = []
            # The title line itself is not stored as section content.
            continue

        text_dict[current_section].append(cell)

        if "<!--" in cell and "-->" in cell:
            # An HTML comment on a single line closes the current message.
            # Lines were split on "\n", so put the separators back in 'raw'.
            comment_dict['raw'] = "\n".join(raw_lines)
            comment_dict['structured text'] = text_dict
            message_array.append(comment_dict)

            # Start the next message from a clean state.
            comment_dict = {}
            current_section = "header"
            text_dict = {current_section: []}
            raw_lines = []

    return message_array
if __name__ == "__main__":
    # Batch run: parse every raw Tech News dump in dir_path and write one
    # JSON file per input file into out_dir.
    dir_path = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/raw/tech-news"
    out_dir = "/data/users/mgaughan/mw-repo-lifecycles/discussion_data/core/"
    files = os.listdir(dir_path)
    print(files)
    for file in files:
        in_path = os.path.join(dir_path, file)
        # os.listdir also returns sub-directories; only regular files are read.
        if not os.path.isfile(in_path):
            continue
        file_wikitext = read_file(in_path)
        json_discussion = parse_tech_news(file_wikitext)
        # splitext removes only the final extension, so inputs whose names
        # differ after an earlier dot do not collapse onto one output file.
        stem = os.path.splitext(file)[0]
        json_it(json_discussion, os.path.join(out_dir, stem + ".json"))