From d794f1b50dbd50fbf98ebde8ac774dffccd0d3ad Mon Sep 17 00:00:00 2001
From: mjgaughan
Date: Mon, 6 Nov 2023 16:20:35 -0600
Subject: [PATCH] updates to main functions, data collection

---
 gh_gsql_req.py    | 24 ++++++++++++++----------
 github_api_req.py | 13 +++++++++----
 main.py           | 37 ++++++++++++++++++++++++++-----------
 perceval_tasks.py | 32 +++++++++++++++++++++++++-------
 4 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/gh_gsql_req.py b/gh_gsql_req.py
index 97118b2..b977c3a 100644
--- a/gh_gsql_req.py
+++ b/gh_gsql_req.py
@@ -11,7 +11,7 @@ def main(vcs, early_cutoff):
     repo_name = '"' + vcs_list[-1] + '"'
     repo_owner = '"' + vcs_list[-2] + '"'
     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
-    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff)
+    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
     return gsql_dict

 def get_discussion_gql(repo_owner, repo_name):
@@ -61,17 +61,21 @@ def get_discussion_gql(repo_owner, repo_name):
     headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
     r = requests.post(url=url, data=data_json, headers=headers)
     #print(r.content)
-    return r
+    return r.json()

 def within_time(comment_content, early_cutoff):
-    list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
-    valid_comments = []
-    for comment in list_of_comments:
-        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
-            break
-        else:
-            valid_comments.append(comment)
-    return valid_comments
+    try:
+        list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
+        valid_comments = []
+        for comment in list_of_comments:
+            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
+                break
+            else:
+                valid_comments.append(comment)
+        return valid_comments
+    except TypeError:
+        print("no discussions found")
+        return []


 if __name__ == "__main__":
diff --git a/github_api_req.py b/github_api_req.py
index 623b941..738cac5 100644
--- a/github_api_req.py
+++ b/github_api_req.py
@@ -21,7 +21,8 @@ def get_milestone_information(repo_uri):
     repo_uri_list = repo_uri.split('/')
     print(repo_uri_list)
     api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
-    response = requests.get(api_url)
+    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
+    response = requests.get(url = api_url, headers=headers)
     response_dict = response.json()
     return response_dict

@@ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date):
         # TODO: decide whether to use created_at or updated_at or closed_at
         # problem is that no one closes their milestones?! hardly seems representative?!
         # making this a note here, as both Tamburri and van Meijel use 'closed_at'
-        if entry['updated_at'] != None:
-            if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
-                count_of_milestones += 1
+        print(entry)
+        try:
+            if entry['updated_at'] != None:
+                if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
+                    count_of_milestones += 1
+        except TypeError:
+            print("string indices error? or I think maybe they just don't use milestones")
     return count_of_milestones


diff --git a/main.py b/main.py
index 0c30785..d17e214 100644
--- a/main.py
+++ b/main.py
@@ -2,18 +2,20 @@ import perceval
 import os
 import yaml
 import datetime as dt
-
+import json
+#functions from other files
 import perceval_tasks as pt
 import github_api_req as gha
 import gh_gsql_req as ghs

 def main():
     # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
-    early_cutoff = dt.datetime(2023,10, 11)
+    early_cutoff = dt.datetime(2013,11, 6)
     print("Earliest date examined: " + str(early_cutoff))
     largest_object = {}
     #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
     directory='../kaylea_dissertation/lifecycle/package_metadata/'
+    count_of_dir = 0
     for filename in os.listdir(directory):
         f = os.path.join(directory, filename)
         # checking if it is a file
@@ -21,8 +23,15 @@ def main():
             print(f)
             get_everything(f, largest_object, early_cutoff)
             #remove this and it should just run? for the most part at least I think
-            break
+            count_of_dir += 1
+            if count_of_dir > 2:
+                break
     print(largest_object.keys())
+    print(len(largest_object.keys()))
+    for repo in largest_object:
+        print(largest_object[repo]['new_formality'])
+    with open('result.json', 'w') as results_path:
+        json.dump(largest_object, results_path)

 def get_everything(manifest_path, largest_object, early_cutoff):
     with open(manifest_path, 'r') as stream:
@@ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff):
         config = yaml.safe_load(stream)
         #below lines will probably need to be refactored as tasks expand
         vcs_path = config['Upstream_VCS']
-        #print("------------------")
+        print("------------------")
         #print(vcs_path)
         repo_path = vcs_path[0]
         largest_object[repo_path] = {}
         largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff)
+        if len(largest_object[repo_path]["perceval_obj"]) == 0:
+            print("PERCEVAL ERROR")
+            del largest_object[repo_path]
+            return
         largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff)
-        #these are the two variables in the denominator of the formality measure
-        #print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project']))
-        #print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors'])))
-        #print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators'])))
-        #print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count']))
-        largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+        '''
+        if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
+            #del largest_object[repo_path]
+            #return
+            #this is to ensure that projects which don't use milestones are counted
+            largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
+        largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
         #print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
-        largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
+        largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
         #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
         #new mmt formality score
         largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
         print(largest_object[repo_path]['new_formality'])
+        '''
         # testing out beneath:
         largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
         #print(ghs_obj["time_cleaned_comm"])
diff --git a/perceval_tasks.py b/perceval_tasks.py
index 05d3b99..a2153eb 100644
--- a/perceval_tasks.py
+++ b/perceval_tasks.py
@@ -3,26 +3,43 @@ from perceval.backends.core.git import Git
 import argparse

 #globals
-repo_dir = '/tmp/'
+#repo_dir = '/tmp/'

 #main function for all subsequent tasks using perceval
 def main(vcs_path, begin_date):
     perceval_info = {}
     perceval_info['list_of_commits'] = get_perceval_log(vcs_path, begin_date)
-    perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
-    perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
-    return perceval_info
+    if len(perceval_info['list_of_commits']) > 0:
+        perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
+        perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
+        del perceval_info['list_of_commits']
+        return perceval_info
+    else:
+        print('error, no commits found?')
+        return {}
+

 # this is the primary function for getting the list of commits from perceval
 def get_perceval_log(vcs_path, begin_date):
     print(vcs_path)
-    repo_dir = '/tmp/' + str(vcs_path[0].split('/')[-1])
-    #gitpath=repo_dir
+    repo_dir = '/Users/mgone/Desktop/tmp/' + str(vcs_path[0].split('/')[-1])
+    try:
+        #gitpath=repo_dir
+        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
+        # this is a temporary date_from, will need to be more inclusive in the future
+        fetched_commits = repo.fetch(from_date=begin_date)
+        return list(fetched_commits)
+    except:
+        print("error, cannot fetch repo data?")
+        return {}
+    '''
+    #gitpath=repo_dir
     repo = Git(uri=vcs_path[0], gitpath=repo_dir)
     # this is a temporary date_from, will need to be more inclusive in the future
     fetched_commits = repo.fetch(from_date=begin_date)
     return list(fetched_commits)
+    '''


 #this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel
 def get_repo_age(all_commits):
@@ -34,6 +51,7 @@ def get_repo_age(all_commits):
     project_life = last_date - first_date
     return project_life.total_seconds() / 86400

+
 #attempt at getting the rosters, though need to make sure that we can get the MR
 def get_all_actors(all_commits):
     #collaborators are more senior than contributors, doing it by author/commit
@@ -50,5 +68,5 @@ def get_all_actors(all_commits):
     for committer in committers:
         if committer in authors:
             authors.remove(committer)
-    return authors, committers
+    return len(authors), len(committers)
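
A side effect of the gh_gsql_req.py change: get_discussion_gql() now returns r.json(), an already-parsed dict, so the json.loads() call kept inside within_time() raises the TypeError that the new except branch catches, and the function returns [] even when discussions exist. The snippet below is only a sketch of a within_time() variant that reads the parsed response directly; it assumes the data.repository.discussions.edges[*].node.createdAt shape used in the GraphQL query and is not part of the patch.

import datetime as dt

# Sketch: a within_time() that accepts the dict returned by r.json() rather than a JSON string.
def within_time(comment_content, early_cutoff):
    try:
        edges = comment_content["data"]["repository"]["discussions"]["edges"]
    except (TypeError, KeyError):
        # no discussions block in the response (or the request failed entirely)
        print("no discussions found")
        return []
    valid_comments = []
    for comment in edges:
        # createdAt is ISO 8601 with a trailing 'Z', e.g. '2023-11-06T16:20:35Z'
        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
            break
        valid_comments.append(comment)
    return valid_comments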
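
On the github_api_req.py side, the "string indices" TypeError that parse_milestones() now swallows is typically what iterating over an error payload looks like: when the milestones request fails (bad token, rate limit, missing repo), response.json() is a dict rather than a list, so each entry in the loop is a string key and entry['updated_at'] raises TypeError. Below is a sketch of a guard at the request stage; passing key as a parameter and the exact messages are illustrative assumptions, not code from the module.

import requests

# Sketch: return an empty list instead of an error dict when the milestones request fails.
def get_milestone_information(repo_uri, key):
    repo_uri_list = repo_uri.strip('/').split('/')
    api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
    headers = {'Accept': 'application/vnd.github+json', 'Authorization': 'bearer ' + key}
    response = requests.get(url=api_url, headers=headers)
    if response.status_code != 200:
        # an error body is a dict like {"message": ...}, which is what triggers
        # the "string indices must be integers" TypeError downstream
        print("milestone request failed: " + str(response.status_code))
        return []
    return response.json()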
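
The comment in main() about the 93-day window is still an open question; if that window were adopted, early_cutoff could be derived from a collection date rather than hard-coded as dt.datetime(2013,11, 6). The collection date below is only a placeholder, not a value from the patch.

import datetime as dt

# Sketch: derive early_cutoff from a collection date and a 93-day lookback window.
collection_date = dt.datetime(2023, 11, 6)  # placeholder collection date
early_cutoff = collection_date - dt.timedelta(days=93)
print("Earliest date examined: " + str(early_cutoff))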
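
In perceval_tasks.py, the new get_perceval_log() clones into a hard-coded /Users/mgone/Desktop/tmp/ directory, and its bare except returns {} while the success path returns a list. The sketch below shows a variant with a configurable clone root and a list return on failure, so callers that check len() or iterate always see the same type; the clone_root parameter and the error message are assumptions, not part of the patch.

import os
from perceval.backends.core.git import Git

# Sketch: same fetch as the patch, with a configurable clone directory and a
# consistent empty-list return when the repository cannot be fetched.
def get_perceval_log(vcs_path, begin_date, clone_root='/tmp'):
    repo_dir = os.path.join(clone_root, str(vcs_path[0].split('/')[-1]))
    try:
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        return list(repo.fetch(from_date=begin_date))
    except Exception as err:
        print("error, cannot fetch repo data? " + str(err))
        return []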