# Reconstructed from git patch a1ae286073b2cfe6cc7f7ec05c4019b66a0e3c22
# ("poking at pre processing", Matthew Gaughan, Tue, 12 Dec 2023), which
# creates text_pp.py.
"""Exploratory pre-processing of downloaded issue/comment JSON files.

Each file in ``path`` is expected to hold a JSON object shaped like
``{'data': {'repository': {'issues': {'edges': [...]}}}}`` (this looks like
a GraphQL issue dump -- confirm against the collection script).  Files that
carry an ``errors`` key or contain no issues at all are deleted; the
remaining ones are walked and their authors/comments printed.
"""
import os
import csv
import json

path = '/data/users/mgaughan/kkex_comment_data_120523/'
empty_file_dict = {'data': {'repository': {'issues': {'edges': []}}}}


# pruning directory of bad data files/things that cannot be used
def check_files_for_content(filelist, directory=None, *, max_files=2):
    """Delete unusable data files and print the contents of the usable ones.

    Args:
        filelist: File names (relative to ``directory``) to inspect.
        directory: Directory containing the files.  Defaults to the
            module-level ``path``.
        max_files: Debugging cut-off.  Iteration stops right after a usable
            file whose 1-based position in ``filelist`` is at or beyond this
            value.  Positions count deleted files too, so with the default
            of 2 the loop ends at the first usable file that is not the very
            first entry.  Pass ``None`` to process every file.

    Returns:
        The number of files that were deleted.  The same number is printed.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
        OSError: If a file cannot be opened or removed.
    """
    if directory is None:
        directory = path
    bad_data_files = 0
    for position, filename in enumerate(filelist, start=1):
        filepath = os.path.join(directory, filename)
        # Read and close the file before deciding what to do with it, so the
        # handle is never leaked and is not open when the file is removed.
        with open(filepath, encoding='utf-8') as opened_file:
            file_contents = json.load(opened_file)
        bad_comment_data = (
            'errors' in file_contents or file_contents == empty_file_dict
        )
        if bad_comment_data:
            bad_data_files += 1
            os.remove(filepath)
            continue
        list_of_issues = file_contents['data']['repository']['issues']['edges']
        handle_repo_issues(list_of_issues)
        if max_files is not None and position >= max_files:
            break
        # A file may have an empty edge list without being exactly equal to
        # empty_file_dict (e.g. extra top-level keys); nothing to show then.
        if list_of_issues:
            print(list_of_issues[0]['node'].keys())
    print(bad_data_files)
    return bad_data_files


def handle_repo_issues(list_of_issues):
    """Print each issue author's URL, then print that issue's comments.

    Args:
        list_of_issues: Iterable of issue edges; each is a dict whose
            ``'node'`` holds ``'author'`` and ``'comments'['edges']``.
    """
    for issue in list_of_issues:
        node = issue['node']
        author = node['author']
        # The author can be null in the data (presumably for accounts that
        # no longer exist -- confirm); print None instead of crashing.
        print(author['url'] if author else None)
        handle_issue_comments(node['comments']['edges'])


def handle_issue_comments(list_of_comments):
    """Print the author and then the body of every comment.

    Args:
        list_of_comments: Iterable of comment edges; each is a dict whose
            ``'node'`` holds ``'author'`` and ``'body'``.
    """
    for comment in list_of_comments:
        node = comment['node']
        comment_body = node['body']
        comment_author = node['author']

        print(comment_author)
        print(comment_body)


if __name__ == "__main__":
    check_files_for_content(os.listdir(path))