updates to main functions, data collection

2023-11-06 16:20:35 -06:00 · 2023-11-06 16:20:35 -06:00 · d794f1b50d
commit d794f1b50d
parent fd10bdfa33
4 changed files with 74 additions and 32 deletions
--- a/gh_gsql_req.py
+++ b/gh_gsql_req.py
@@ -11,7 +11,7 @@ def main(vcs, early_cutoff):
    repo_name = '"' + vcs_list[-1] + '"'
    repo_owner = '"' + vcs_list[-2] + '"'
    gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
-    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff)
+    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
    return gsql_dict
 def get_discussion_gql(repo_owner, repo_name):
@@ -61,17 +61,21 @@ def get_discussion_gql(repo_owner, repo_name):
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
    r = requests.post(url=url, data=data_json, headers=headers)
    #print(r.content)
-    return r
+    return r.json()
 def within_time(comment_content, early_cutoff):
-    list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
+    try: 
-    valid_comments = []
+        list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
-    for comment in list_of_comments:
+        valid_comments = []
-        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
+        for comment in list_of_comments:
-            break
+            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
-        else:
+                break
-            valid_comments.append(comment)
+            else:
-    return valid_comments
+                valid_comments.append(comment)
        return valid_comments
    except TypeError:
        print("no discussions found")
        return []
 if __name__ == "__main__":
--- a/github_api_req.py
+++ b/github_api_req.py
@@ -21,7 +21,8 @@ def get_milestone_information(repo_uri):
    repo_uri_list = repo_uri.split('/')
    print(repo_uri_list)
    api_url = "https://api.github.com/repos/" + repo_uri_list[-2]  + "/" + repo_uri_list[-1] + "/milestones"
-    response = requests.get(api_url)
+    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
    response = requests.get(url = api_url, headers=headers)
    response_dict = response.json()
    return response_dict
@@ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date):
        # TODO: decide whether to use created_at or updated_at or closed_at
        # problem is that no one closes their milestones?! hardly seems representative?!
        # making this a note here, as both Tamburri and van Meijel use 'closed_at'
-        if entry['updated_at'] != None:
+        print(entry)
-            if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
+        try:
-                count_of_milestones += 1
+            if entry['updated_at'] != None:
                if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
                    count_of_milestones += 1
        except TypeError:
            print("string indices error? or I think maybe they just don't use milestones")
    return count_of_milestones
--- a/main.py
+++ b/main.py
@@ -2,18 +2,20 @@ import perceval
 import os
 import yaml
 import datetime as dt
-
+import json
 #functions from other files
 import perceval_tasks as pt
 import github_api_req as gha
 import gh_gsql_req as ghs
 def main():
    # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
-    early_cutoff = dt.datetime(2023,10, 11)
+    early_cutoff = dt.datetime(2013,11, 6)
    print("Earliest date examined: " + str(early_cutoff))
    largest_object = {}
    #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
    directory='../kaylea_dissertation/lifecycle/package_metadata/'
    count_of_dir = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
@@ -21,8 +23,15 @@ def main():
            print(f)   
        get_everything(f, largest_object, early_cutoff)
        #remove this and it should just run? for the most part at least I think
-        break
+        count_of_dir += 1
        if count_of_dir > 2:
            break
    print(largest_object.keys())
    print(len(largest_object.keys()))
    for repo in largest_object:
        print(largest_object[repo]['new_formality'])
    with open('result.json', 'w') as results_path:
        json.dump(largest_object, results_path)
 def get_everything(manifest_path, largest_object, early_cutoff):
    with open(manifest_path, 'r') as stream:
@@ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff):
            config = yaml.safe_load(stream)
            #below lines will probably need to be refactored as tasks expand
            vcs_path = config['Upstream_VCS']
-            #print("------------------")
+            print("------------------")
            #print(vcs_path)
            repo_path = vcs_path[0]
            largest_object[repo_path] = {}
            largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff)
            if len(largest_object[repo_path]["perceval_obj"]) == 0:
                print("PERCEVAL ERROR")
                del largest_object[repo_path]
                return
            largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff)
-            #these are the two variables in the denominator of the formality measure
+            '''
-            #print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project']))
+            if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
-            #print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors'])))
+                #del largest_object[repo_path]
-            #print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators'])))
+                #return
-            #print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count']))
+                #this is to ensure that projects which don't use milestones are counted
-            largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+                largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
            largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
            #print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
-            largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+            largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
            #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
            #new mmt formality score
            largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
            print(largest_object[repo_path]['new_formality'])
            '''
            # testing out beneath:
            largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
            #print(ghs_obj["time_cleaned_comm"])
--- a/perceval_tasks.py
+++ b/perceval_tasks.py
@@ -3,26 +3,43 @@ from perceval.backends.core.git import Git
 import argparse
 #globals
-repo_dir = '/tmp/'
+#repo_dir = '/tmp/'
 #main function for all subsequent tasks using perceval 
 def main(vcs_path, begin_date):
    perceval_info = {}
    perceval_info['list_of_commits'] = get_perceval_log(vcs_path, begin_date)
-    perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
+    if len(perceval_info['list_of_commits']) > 0:
-    perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
+        perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
-    return perceval_info
+        perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
        del perceval_info['list_of_commits']
        return perceval_info
    else:
        print('error, no commits found?')
        return {}
 # this is the primary function for getting the list of commits from perceval
 def get_perceval_log(vcs_path, begin_date):
    print(vcs_path)
-    repo_dir = '/tmp/' + str(vcs_path[0].split('/')[-1])
+    repo_dir = '/Users/mgone/Desktop/tmp/' + str(vcs_path[0].split('/')[-1])
-    #gitpath=repo_dir
+    try:
        #gitpath=repo_dir
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        # this is a temporary date_from, will need to be more inclusive in the future
        fetched_commits = repo.fetch(from_date=begin_date)
        return list(fetched_commits)
    except: 
        print("error, cannot fetch repo data?")
        return {}
    '''
     #gitpath=repo_dir
    repo = Git(uri=vcs_path[0], gitpath=repo_dir)
    # this is a temporary date_from, will need to be more inclusive in the future
    fetched_commits = repo.fetch(from_date=begin_date)
    return list(fetched_commits)
    '''
 #this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel
 def get_repo_age(all_commits):
@@ -34,6 +51,7 @@ def get_repo_age(all_commits):
    project_life = last_date - first_date
    return project_life.total_seconds() / 86400
 #attempt at getting the rosters, though need to make sure that we can get the MR
 def get_all_actors(all_commits):
    #collaborators are more senior than contributors, doing it by author/commit
@@ -50,5 +68,5 @@ def get_all_actors(all_commits):
    for committer in committers:
        if committer in authors:
            authors.remove(committer)
-    return authors, committers
+    return len(authors), len(committers)