From d794f1b50dbd50fbf98ebde8ac774dffccd0d3ad Mon Sep 17 00:00:00 2001
From: mjgaughan
Date: Mon, 6 Nov 2023 16:20:35 -0600
Subject: [PATCH] updates to main functions, data collection

---
 gh_gsql_req.py    | 24 ++++++++++++++----------
 github_api_req.py | 13 +++++++++----
 main.py           | 37 ++++++++++++++++++++++++++-----------
 perceval_tasks.py | 32 +++++++++++++++++++++++++-------
 4 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/gh_gsql_req.py b/gh_gsql_req.py
index 97118b2..b977c3a 100644
--- a/gh_gsql_req.py
+++ b/gh_gsql_req.py
@@ -11,7 +11,7 @@ def main(vcs, early_cutoff):
     repo_name = '"' + vcs_list[-1] + '"'
     repo_owner = '"' + vcs_list[-2] + '"'
     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
-    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff)
+    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
     return gsql_dict

 def get_discussion_gql(repo_owner, repo_name):
@@ -61,17 +61,21 @@ def get_discussion_gql(repo_owner, repo_name):
     headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
     r = requests.post(url=url, data=data_json, headers=headers)
     #print(r.content)
-    return r
+    return r.json()

 def within_time(comment_content, early_cutoff):
-    list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
-    valid_comments = []
-    for comment in list_of_comments:
-        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
-            break
-        else:
-            valid_comments.append(comment)
-    return valid_comments
+    try:
+        list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
+        valid_comments = []
+        for comment in list_of_comments:
+            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
+                break
+            else:
+                valid_comments.append(comment)
+        return valid_comments
+    except TypeError:
+        print("no discussions found")
+        return []


 if __name__ == "__main__":
diff --git a/github_api_req.py b/github_api_req.py
index 623b941..738cac5 100644
--- a/github_api_req.py
+++ b/github_api_req.py
@@ -21,7 +21,8 @@ def get_milestone_information(repo_uri):
     repo_uri_list = repo_uri.split('/')
     print(repo_uri_list)
     api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
-    response = requests.get(api_url)
+    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
+    response = requests.get(url = api_url, headers=headers)
     response_dict = response.json()
     return response_dict

@@ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date):
         # TODO: decide whether to use created_at or updated_at or closed_at
         # problem is that no one closes their milestones?! hardly seems representative?!
         # making this a note here, as both Tamburri and van Meijel use 'closed_at'
-        if entry['updated_at'] != None:
-            if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
-                count_of_milestones += 1
+        print(entry)
+        try:
+            if entry['updated_at'] != None:
+                if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
+                    count_of_milestones += 1
+        except TypeError:
+            print("string indices error? or I think maybe they just don't use milestones")
     return count_of_milestones


diff --git a/main.py b/main.py
index 0c30785..d17e214 100644
--- a/main.py
+++ b/main.py
@@ -2,18 +2,20 @@ import perceval
 import os
 import yaml
 import datetime as dt
-
+import json
+#functions from other files
 import perceval_tasks as pt
 import github_api_req as gha
 import gh_gsql_req as ghs

 def main():
     # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
-    early_cutoff = dt.datetime(2023,10, 11)
+    early_cutoff = dt.datetime(2013,11, 6)
     print("Earliest date examined: " + str(early_cutoff))
     largest_object = {}
     #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
     directory='../kaylea_dissertation/lifecycle/package_metadata/'
+    count_of_dir = 0
     for filename in os.listdir(directory):
         f = os.path.join(directory, filename)
         # checking if it is a file
@@ -21,8 +23,15 @@ def main():
             print(f)
             get_everything(f, largest_object, early_cutoff)
             #remove this and it should just run? for the most part at least I think
-            break
+            count_of_dir += 1
+            if count_of_dir > 2:
+                break
     print(largest_object.keys())
+    print(len(largest_object.keys()))
+    for repo in largest_object:
+        print(largest_object[repo]['new_formality'])
+    with open('result.json', 'w') as results_path:
+        json.dump(largest_object, results_path)

 def get_everything(manifest_path, largest_object, early_cutoff):
     with open(manifest_path, 'r') as stream:
@@ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff):
         config = yaml.safe_load(stream)
         #below lines will probably need to be refactored as tasks expand
         vcs_path = config['Upstream_VCS']
-        #print("------------------")
+        print("------------------")
         #print(vcs_path)
         repo_path = vcs_path[0]
         largest_object[repo_path] = {}
         largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff)
+        if len(largest_object[repo_path]["perceval_obj"]) == 0:
+            print("PERCEVAL ERROR")
+            del largest_object[repo_path]
+            return
         largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff)
-        #these are the two variables in the denominator of the formality measure
-        #print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project']))
-        #print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors'])))
-        #print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators'])))
-        #print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count']))
-        largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+        '''
+        if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
+            #del largest_object[repo_path]
+            #return
+            #this is to ensure that projects which don't use milestones are counted
+            largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
+        largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
         #print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
-        largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+        largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
         #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
         #new mmt formality score
         largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
         print(largest_object[repo_path]['new_formality'])
+        '''
         # testing out beneath:
         largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
         #print(ghs_obj["time_cleaned_comm"])
diff --git a/perceval_tasks.py b/perceval_tasks.py
index 05d3b99..a2153eb 100644
--- a/perceval_tasks.py
+++ b/perceval_tasks.py
@@ -3,26 +3,43 @@ from perceval.backends.core.git import Git
 import argparse

 #globals
-repo_dir = '/tmp/'
+#repo_dir = '/tmp/'

 #main function for all subsequent tasks using perceval
 def main(vcs_path, begin_date):
     perceval_info = {}
     perceval_info['list_of_commits'] = get_perceval_log(vcs_path, begin_date)
-    perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
-    perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
-    return perceval_info
+    if len(perceval_info['list_of_commits']) > 0:
+        perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
+        perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
+        del perceval_info['list_of_commits']
+        return perceval_info
+    else:
+        print('error, no commits found?')
+        return {}
+

 # this is the primary function for getting the list of commits from perceval
 def get_perceval_log(vcs_path, begin_date):
     print(vcs_path)
-    repo_dir = '/tmp/' + str(vcs_path[0].split('/')[-1])
-    #gitpath=repo_dir
+    repo_dir = '/Users/mgone/Desktop/tmp/' + str(vcs_path[0].split('/')[-1])
+    try:
+        #gitpath=repo_dir
+        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
+        # this is a temporary date_from, will need to be more inclusive in the future
+        fetched_commits = repo.fetch(from_date=begin_date)
+        return list(fetched_commits)
+    except:
+        print("error, cannot fetch repo data?")
+        return {}
+    '''
+    #gitpath=repo_dir
     repo = Git(uri=vcs_path[0], gitpath=repo_dir)
     # this is a temporary date_from, will need to be more inclusive in the future
     fetched_commits = repo.fetch(from_date=begin_date)
     return list(fetched_commits)
+    '''


 #this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel
 def get_repo_age(all_commits):
@@ -34,6 +51,7 @@ def get_repo_age(all_commits):
     project_life = last_date - first_date
     return project_life.total_seconds() / 86400

+
 #attempt at getting the rosters, though need to make sure that we can get the MR
 def get_all_actors(all_commits):
     #collaborators are more senior than contributors, doing it by author/commit
@@ -50,5 +68,5 @@ def get_all_actors(all_commits):
     for committer in committers:
         if committer in authors:
             authors.remove(committer)
-    return authors, committers
+    return len(authors), len(committers)
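
A side effect of the gh_gsql_req.py change: get_discussion_gql() now returns r.json(), an already-parsed dict, so the json.loads() call kept inside within_time() raises the TypeError that the new except branch catches, and the function returns [] even when discussions exist. The snippet below is only a sketch of a within_time() variant that reads the parsed response directly; it assumes the data.repository.discussions.edges[*].node.createdAt shape used in the GraphQL query and is not part of the patch.

import datetime as dt

# Sketch: a within_time() that accepts the dict returned by r.json() rather than a JSON string.
def within_time(comment_content, early_cutoff):
    try:
        edges = comment_content["data"]["repository"]["discussions"]["edges"]
    except (TypeError, KeyError):
        # no discussions block in the response (or the request failed entirely)
        print("no discussions found")
        return []
    valid_comments = []
    for comment in edges:
        # createdAt is ISO 8601 with a trailing 'Z', e.g. '2023-11-06T16:20:35Z'
        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
            break
        valid_comments.append(comment)
    return valid_comments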
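
On the github_api_req.py side, the "string indices" TypeError that parse_milestones() now swallows is typically what iterating over an error payload looks like: when the milestones request fails (bad token, rate limit, missing repo), response.json() is a dict rather than a list, so each entry in the loop is a string key and entry['updated_at'] raises TypeError. Below is a sketch of a guard at the request stage; passing key as a parameter and the exact messages are illustrative assumptions, not code from the module.

import requests

# Sketch: return an empty list instead of an error dict when the milestones request fails.
def get_milestone_information(repo_uri, key):
    repo_uri_list = repo_uri.strip('/').split('/')
    api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
    headers = {'Accept': 'application/vnd.github+json', 'Authorization': 'bearer ' + key}
    response = requests.get(url=api_url, headers=headers)
    if response.status_code != 200:
        # an error body is a dict like {"message": ...}, which is what triggers
        # the "string indices must be integers" TypeError downstream
        print("milestone request failed: " + str(response.status_code))
        return []
    return response.json()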
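
The comment in main() about the 93-day window is still an open question; if that window were adopted, early_cutoff could be derived from a collection date rather than hard-coded as dt.datetime(2013,11, 6). The collection date below is only a placeholder, not a value from the patch.

import datetime as dt

# Sketch: derive early_cutoff from a collection date and a 93-day lookback window.
collection_date = dt.datetime(2023, 11, 6)  # placeholder collection date
early_cutoff = collection_date - dt.timedelta(days=93)
print("Earliest date examined: " + str(early_cutoff))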
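
In perceval_tasks.py, the new get_perceval_log() clones into a hard-coded /Users/mgone/Desktop/tmp/ directory, and its bare except returns {} while the success path returns a list. The sketch below shows a variant with a configurable clone root and a list return on failure, so callers that check len() or iterate always see the same type; the clone_root parameter and the error message are assumptions, not part of the patch.

import os
from perceval.backends.core.git import Git

# Sketch: same fetch as the patch, with a configurable clone directory and a
# consistent empty-list return when the repository cannot be fetched.
def get_perceval_log(vcs_path, begin_date, clone_root='/tmp'):
    repo_dir = os.path.join(clone_root, str(vcs_path[0].split('/')[-1]))
    try:
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        return list(repo.fetch(from_date=begin_date))
    except Exception as err:
        print("error, cannot fetch repo data? " + str(err))
        return []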