From a1ae286073b2cfe6cc7f7ec05c4019b66a0e3c22 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <gaughan@u.northwestern.edu>
Date: Tue, 12 Dec 2023 13:23:04 -0600
Subject: [PATCH] poking at pre processing

---
 text_pp.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 text_pp.py

diff --git a/text_pp.py b/text_pp.py
new file mode 100644
index 0000000..64e866e
--- /dev/null
+++ b/text_pp.py
@@ -0,0 +1,49 @@
+import os 
+import csv 
+import json 
+
+path = '/data/users/mgaughan/kkex_comment_data_120523/'  # directory listed in __main__; file names are appended by plain concatenation, so the trailing slash is required
+empty_file_dict = {'data': {'repository': {'issues': {'edges': []}}}}  # exact payload treated as "no issues"; files equal to it are deleted by check_files_for_content
+
+
+
+#pruning directory of bad data files/things that cannot be used
+def check_files_for_content(filelist):
+    """Walk `filelist` (file names under `path`): delete files holding an 'errors' key or the
+    empty payload, print the issues/comments of the others, then print the delete count.
+    Exploratory cap: the loop ends at the first usable file that is not the first one examined."""
+    x = 0  # files examined so far, deleted ones included
+    bad_data_files = 0
+    for file in filelist:
+        filepath = path + file
+        with open(filepath) as opened_file:  # closed on every path, even if json.load raises
+            file_contents = json.load(opened_file)
+        x += 1
+        if 'errors' in file_contents or file_contents == empty_file_dict:
+            bad_data_files += 1
+            os.remove(filepath)  # handle is already closed at this point
+            continue
+        list_of_issues = file_contents['data']['repository']['issues']['edges']
+        handle_repo_issues(list_of_issues)
+        if x >= 2:
+            break
+        print(list_of_issues[0]['node'].keys())  # only reached when the first file examined is usable
+    print(bad_data_files)
+            
+def handle_repo_issues(list_of_issues):
+    """Print each issue's author URL (None when the issue has no author), then print its comments."""
+    for issue in list_of_issues:
+        print((issue['node']['author'] or {'url': None})['url'])  # author is null for deleted accounts (GitHub GraphQL) - TODO confirm against the collected data
+        handle_issue_comments(issue['node']['comments']['edges'])
+
+def handle_issue_comments(list_of_comments):
+    """Print the author, then the body, of every comment edge in `list_of_comments`."""
+    for edge in list_of_comments:
+        node = edge['node']
+        body, author = node['body'], node['author']  # body is looked up first, author second
+        print(author)
+        print(body)
+
+
+if __name__ == "__main__":  # script entry point: run the check over every entry in `path`
+    check_files_for_content(os.listdir(path))
\ No newline at end of file