updates to main functions, data collection

This commit is contained in:
mjgaughan 2023-11-06 16:20:35 -06:00
parent fd10bdfa33
commit d794f1b50d
4 changed files with 74 additions and 32 deletions

View File

@ -11,7 +11,7 @@ def main(vcs, early_cutoff):
repo_name = '"' + vcs_list[-1] + '"'
repo_owner = '"' + vcs_list[-2] + '"'
gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff)
gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
return gsql_dict
def get_discussion_gql(repo_owner, repo_name):
@ -61,9 +61,10 @@ def get_discussion_gql(repo_owner, repo_name):
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
r = requests.post(url=url, data=data_json, headers=headers)
#print(r.content)
return r
return r.json()
def within_time(comment_content, early_cutoff):
try:
list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
valid_comments = []
for comment in list_of_comments:
@ -72,6 +73,9 @@ def within_time(comment_content, early_cutoff):
else:
valid_comments.append(comment)
return valid_comments
except TypeError:
print("no discussions found")
return []
if __name__ == "__main__":

View File

@ -21,7 +21,8 @@ def get_milestone_information(repo_uri):
repo_uri_list = repo_uri.split('/')
print(repo_uri_list)
api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
response = requests.get(api_url)
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
response = requests.get(url = api_url, headers=headers)
response_dict = response.json()
return response_dict
@ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date):
# TODO: decide whether to use created_at or updated_at or closed_at
# problem is that no one closes their milestones?! hardly seems representative?!
# making this a note here, as both Tamburri and van Meijel use 'closed_at'
print(entry)
try:
if entry['updated_at'] != None:
if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
count_of_milestones += 1
except TypeError:
print("string indices error? or I think maybe they just don't use milestones")
return count_of_milestones

35
main.py
View File

@ -2,18 +2,20 @@ import perceval
import os
import yaml
import datetime as dt
import json
#functions from other files
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs
def main():
# we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
early_cutoff = dt.datetime(2023,10, 11)
early_cutoff = dt.datetime(2013,11, 6)
print("Earliest date examined: " + str(early_cutoff))
largest_object = {}
#manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
directory='../kaylea_dissertation/lifecycle/package_metadata/'
count_of_dir = 0
for filename in os.listdir(directory):
f = os.path.join(directory, filename)
# checking if it is a file
@ -21,8 +23,15 @@ def main():
print(f)
get_everything(f, largest_object, early_cutoff)
#remove this and it should just run? for the most part at least I think
count_of_dir += 1
if count_of_dir > 2:
break
print(largest_object.keys())
print(len(largest_object.keys()))
for repo in largest_object:
print(largest_object[repo]['new_formality'])
with open('result.json', 'w') as results_path:
json.dump(largest_object, results_path)
def get_everything(manifest_path, largest_object, early_cutoff):
with open(manifest_path, 'r') as stream:
@ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff):
config = yaml.safe_load(stream)
#below lines will probably need to be refactored as tasks expand
vcs_path = config['Upstream_VCS']
#print("------------------")
print("------------------")
#print(vcs_path)
repo_path = vcs_path[0]
largest_object[repo_path] = {}
largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff)
if len(largest_object[repo_path]["perceval_obj"]) == 0:
print("PERCEVAL ERROR")
del largest_object[repo_path]
return
largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff)
#these are the two variables in the denominator of the formality measure
#print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project']))
#print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors'])))
#print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators'])))
#print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count']))
largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
'''
if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
#del largest_object[repo_path]
#return
#this is to ensure that projects which don't use milestones are counted
largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
#print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
#print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
#new mmt formality score
largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
print(largest_object[repo_path]['new_formality'])
'''
# testing out beneath:
largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
#print(ghs_obj["time_cleaned_comm"])

View File

@ -3,26 +3,43 @@ from perceval.backends.core.git import Git
import argparse
#globals
repo_dir = '/tmp/'
#repo_dir = '/tmp/'
#main function for all subsequent tasks using perceval
def main(vcs_path, begin_date):
    """Collect perceval-derived repository metrics.

    Fetches the commit log via get_perceval_log(), then derives the
    project age and the contributor/collaborator rosters from it.

    Returns a dict with keys 'age_of_project', 'contributors' and
    'collaborators' on success, or an empty dict when no commits
    could be fetched.
    """
    commits = get_perceval_log(vcs_path, begin_date)
    # Guard clause: nothing to measure when the fetch came back empty.
    if not commits:
        print('error, no commits found?')
        return {}
    age = get_repo_age(commits)
    contributors, collaborators = get_all_actors(commits)
    # The raw commit list is only an intermediate — it is not returned.
    return {
        'age_of_project': age,
        'contributors': contributors,
        'collaborators': collaborators,
    }
# this is the primary function for getting the list of commits from perceval
def get_perceval_log(vcs_path, begin_date, cache_root='/Users/mgone/Desktop/tmp/'):
    """Fetch a repository's commit log with perceval's Git backend.

    Parameters:
        vcs_path: sequence whose first element is the repository URI.
        begin_date: datetime lower bound; only commits from this date
            on are fetched (temporary cutoff — may need to be more
            inclusive in the future).
        cache_root: directory under which the local clone is cached.
            Defaults to the previously hard-coded machine-specific path
            for backward compatibility.
            TODO(review): make this machine-independent (e.g. derive
            from tempfile.gettempdir()).

    Returns:
        list of commit records on success, [] when the fetch fails.
    """
    print(vcs_path)
    # Clone/cache directory is derived from the last path segment of the URI.
    repo_dir = cache_root + str(vcs_path[0].split('/')[-1])
    try:
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        fetched_commits = repo.fetch(from_date=begin_date)
        return list(fetched_commits)
    except Exception:
        # Was a bare `except:` returning {}; narrowed so KeyboardInterrupt /
        # SystemExit still propagate, and the failure value is now an empty
        # *list* to match the success-path type (callers only test emptiness).
        print("error, cannot fetch repo data?")
        return []
#this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel
def get_repo_age(all_commits):
@ -34,6 +51,7 @@ def get_repo_age(all_commits):
project_life = last_date - first_date
return project_life.total_seconds() / 86400
#attempt at getting the rosters, though need to make sure that we can get the MR
def get_all_actors(all_commits):
#collaborators are more senior than contributors, doing it by author/commit
@ -50,5 +68,5 @@ def get_all_actors(all_commits):
for committer in committers:
if committer in authors:
authors.remove(committer)
return authors, committers
return len(authors), len(committers)