updates to main functions, data collection
parent fd10bdfa33
commit d794f1b50d
@@ -11,7 +11,7 @@ def main(vcs, early_cutoff):
    repo_name = '"' + vcs_list[-1] + '"'
    repo_owner = '"' + vcs_list[-2] + '"'
    gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"].content, early_cutoff)
    gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
    return gsql_dict

def get_discussion_gql(repo_owner, repo_name):
@@ -61,17 +61,21 @@ def get_discussion_gql(repo_owner, repo_name):
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
    r = requests.post(url=url, data=data_json, headers=headers)
    #print(r.content)
    return r
    return r.json()

def within_time(comment_content, early_cutoff):
    list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
    valid_comments = []
    for comment in list_of_comments:
        if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
            break
        else:
            valid_comments.append(comment)
    return valid_comments
    try:
        list_of_comments = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
        valid_comments = []
        for comment in list_of_comments:
            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
                break
            else:
                valid_comments.append(comment)
        return valid_comments
    except TypeError:
        print("no discussions found")
        return []


if __name__ == "__main__":
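
For reference, the discussion filter can be exercised on its own. A minimal sketch, assuming the response body is the raw JSON text with the ["data"]["repository"]["discussions"]["edges"] shape read above, that the edges arrive newest-first (which is what the early break relies on), and that createdAt is ISO-8601 with a trailing 'Z' (hence the [:-1]); the payload below is made up purely to exercise the function.

import datetime as dt
import json

def within_time(comment_content, early_cutoff):
    # comment_content is the raw JSON text of the GraphQL response
    try:
        edges = json.loads(comment_content)["data"]["repository"]["discussions"]["edges"]
        valid_comments = []
        for comment in edges:
            # createdAt looks like "2024-01-05T12:00:00Z"; [:-1] drops the Z so fromisoformat() accepts it
            if dt.datetime.fromisoformat(comment['node']['createdAt'][:-1]) < early_cutoff:
                break    # edges assumed newest-first, so everything after this is older still
            valid_comments.append(comment)
        return valid_comments
    except TypeError:
        print("no discussions found")
        return []

# made-up response, only to exercise the function
fake_response = json.dumps({"data": {"repository": {"discussions": {"edges": [
    {"node": {"createdAt": "2024-01-05T12:00:00Z"}},
    {"node": {"createdAt": "2013-01-05T12:00:00Z"}},
]}}}})
print(len(within_time(fake_response, dt.datetime(2013, 11, 6))))   # -> 1
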
@@ -21,7 +21,8 @@ def get_milestone_information(repo_uri):
    repo_uri_list = repo_uri.split('/')
    print(repo_uri_list)
    api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
    response = requests.get(api_url)
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8', 'Authorization': 'bearer ' + key}
    response = requests.get(url = api_url, headers=headers)
    response_dict = response.json()
    return response_dict
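
A minimal sketch of the authenticated milestones call, assuming the token comes from a GITHUB_TOKEN environment variable (the real key is loaded elsewhere in this file) and a repo_uri of the usual https form:

import os
import requests

def get_milestone_information(repo_uri):
    # repo_uri like "https://github.com/jupyter/notebook" (illustrative)
    repo_uri_list = repo_uri.rstrip('/').split('/')
    api_url = "https://api.github.com/repos/" + repo_uri_list[-2] + "/" + repo_uri_list[-1] + "/milestones"
    key = os.environ.get("GITHUB_TOKEN", "")   # assumed source of the token
    headers = {'content-type': 'application/json',
               'Accept-Charset': 'UTF-8',
               'Authorization': 'bearer ' + key}
    response = requests.get(url=api_url, headers=headers)
    return response.json()

One caveat: without a state=all query parameter this endpoint only lists open milestones, which interacts with the created_at/updated_at/closed_at question raised in the next hunk.
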
@@ -32,9 +33,13 @@ def parse_milestones(milestones, earliest_date):
        # TODO: decide whether to use created_at or updated_at or closed_at
        # problem is that no one closes their milestones?! hardly seems representative?!
        # making this a note here, as both Tamburri and van Meijel use 'closed_at'
        if entry['updated_at'] != None:
            if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
                count_of_milestones += 1
        print(entry)
        try:
            if entry['updated_at'] != None:
                if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
                    count_of_milestones += 1
        except TypeError:
            print("string indices error? or I think maybe they just don't use milestones")
    return count_of_milestones
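
The counting logic boils down to a date-window check on updated_at. A rough standalone sketch, with a made-up milestone list just to show the shapes involved:

import datetime as dt

def parse_milestones(milestones, earliest_date):
    count_of_milestones = 0
    for entry in milestones:
        try:
            # updated_at looks like "2024-02-01T09:30:00Z"; strip the Z for fromisoformat()
            if entry['updated_at'] is not None:
                if dt.datetime.fromisoformat(entry['updated_at'][:-1]) > earliest_date:
                    count_of_milestones += 1
        except TypeError:
            # e.g. the API handed back an error message (strings) instead of milestone dicts
            print("skipping malformed milestone entry")
    return count_of_milestones

# made-up milestones, only to exercise the function
sample = [{'updated_at': "2024-02-01T09:30:00Z"}, {'updated_at': None}]
print(parse_milestones(sample, dt.datetime(2013, 11, 6)))   # -> 1
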
main.py (37 changed lines)
@@ -2,18 +2,20 @@ import perceval
import os
import yaml
import datetime as dt

import json
#functions from other files
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs

def main():
    # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
    early_cutoff = dt.datetime(2023,10, 11)
    early_cutoff = dt.datetime(2013,11, 6)
    print("Earliest date examined: " + str(early_cutoff))
    largest_object = {}
    #manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
    directory='../kaylea_dissertation/lifecycle/package_metadata/'
    count_of_dir = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
@@ -21,8 +23,15 @@ def main():
            print(f)
            get_everything(f, largest_object, early_cutoff)
            #remove this and it should just run? for the most part at least I think
            break
            count_of_dir += 1
            if count_of_dir > 2:
                break
    print(largest_object.keys())
    print(len(largest_object.keys()))
    for repo in largest_object:
        print(largest_object[repo]['new_formality'])
    with open('result.json', 'w') as results_path:
        json.dump(largest_object, results_path)

def get_everything(manifest_path, largest_object, early_cutoff):
    with open(manifest_path, 'r') as stream:
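
The driver now caps how many manifests it processes and writes the accumulated results out at the end. A sketch of that loop, where the process callable stands in for get_everything and the cap of three mirrors the count_of_dir check above:

import os
import json

def collect(directory, process, early_cutoff, cap=3):
    # process is a callable with the shape of get_everything(manifest_path, largest_object, early_cutoff)
    largest_object = {}
    count_of_dir = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if not os.path.isfile(f):
            continue
        process(f, largest_object, early_cutoff)
        count_of_dir += 1
        if count_of_dir >= cap:
            # stop after a handful of manifests while testing
            break
    with open('result.json', 'w') as results_path:
        json.dump(largest_object, results_path)
    return largest_object
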
@@ -30,24 +39,30 @@ def get_everything(manifest_path, largest_object, early_cutoff):
        config = yaml.safe_load(stream)
        #below lines will probably need to be refactored as tasks expand
        vcs_path = config['Upstream_VCS']
        #print("------------------")
        print("------------------")
        #print(vcs_path)
        repo_path = vcs_path[0]
        largest_object[repo_path] = {}
        largest_object[repo_path]["perceval_obj"] = pt.main(vcs_path, early_cutoff)
        if len(largest_object[repo_path]["perceval_obj"]) == 0:
            print("PERCEVAL ERROR")
            del largest_object[repo_path]
            return
        largest_object[repo_path]["gha_obj"] = gha.main(vcs_path, early_cutoff)
        #these are the two variables in the denominator of the formality measure
        #print("Age of Project: " + str(largest_object[repo_path]["perceval_obj"]['age_of_project']))
        #print('Contributor Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['contributors'])))
        #print('Collaborator Count: ' + str(len(largest_object[repo_path]["perceval_obj"]['collaborators'])))
        #print('Number of Milestones: ' + str(largest_object[repo_path]["gha_obj"]['milestone_count']))
        largest_object[repo_path]['new_mmt'] = compute_new_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
        '''
        if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
            #del largest_object[repo_path]
            #return
            #this is to ensure that projects which don't use milestones are counted
            largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
        largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
        #print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
        largest_object[repo_path]['old_mmt'] = compute_old_mmt(len(largest_object[repo_path]["perceval_obj"]['contributors']), len(largest_object[repo_path]["perceval_obj"]['collaborators']))
        largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
        #print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
        #new mmt formality score
        largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
        print(largest_object[repo_path]['new_formality'])
        '''
        # testing out beneath:
        largest_object[repo_path]['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
        #print(ghs_obj["time_cleaned_comm"])
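
For context, the manifest consumption assumed here looks roughly like this; the YAML body and the jupyter/notebook URL are illustrative (the real manifests live under package_metadata/):

import yaml

# hypothetical manifest body
manifest_text = """
Upstream_VCS:
  - https://github.com/jupyter/notebook
"""

config = yaml.safe_load(manifest_text)
vcs_path = config['Upstream_VCS']        # a list; the first entry is the repo URL
repo_path = vcs_path[0]

vcs_list = repo_path.split('/')
repo_owner = '"' + vcs_list[-2] + '"'    # quoted for interpolation into the GraphQL query
repo_name = '"' + vcs_list[-1] + '"'
print(repo_path, repo_owner, repo_name)
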
@@ -3,26 +3,43 @@ from perceval.backends.core.git import Git
import argparse

#globals
repo_dir = '/tmp/'
#repo_dir = '/tmp/'

#main function for all subsequent tasks using perceval
def main(vcs_path, begin_date):
    perceval_info = {}
    perceval_info['list_of_commits'] = get_perceval_log(vcs_path, begin_date)
    perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
    perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
    return perceval_info
    if len(perceval_info['list_of_commits']) > 0:
        perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
        perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
        del perceval_info['list_of_commits']
        return perceval_info
    else:
        print('error, no commits found?')
        return {}



# this is the primary function for getting the list of commits from perceval
def get_perceval_log(vcs_path, begin_date):
    print(vcs_path)
    repo_dir = '/tmp/' + str(vcs_path[0].split('/')[-1])
    #gitpath=repo_dir
    repo_dir = '/Users/mgone/Desktop/tmp/' + str(vcs_path[0].split('/')[-1])
    try:
        #gitpath=repo_dir
        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
        # this is a temporary date_from, will need to be more inclusive in the future
        fetched_commits = repo.fetch(from_date=begin_date)
        return list(fetched_commits)
    except:
        print("error, cannot fetch repo data?")
        return {}
    '''
    #gitpath=repo_dir
    repo = Git(uri=vcs_path[0], gitpath=repo_dir)
    # this is a temporary date_from, will need to be more inclusive in the future
    fetched_commits = repo.fetch(from_date=begin_date)
    return list(fetched_commits)
    '''

#this function is just to evaluate the repository age, as defined by Tamburri and used by van Meijel
def get_repo_age(all_commits):
@@ -34,6 +51,7 @@ def get_repo_age(all_commits):
    project_life = last_date - first_date
    return project_life.total_seconds() / 86400


#attempt at getting the rosters, though need to make sure that we can get the MR
def get_all_actors(all_commits):
    #collaborators are more senior than contributors, doing it by author/commit
@@ -50,5 +68,5 @@ def get_all_actors(all_commits):
    for committer in committers:
        if committer in authors:
            authors.remove(committer)
    return authors, committers
    return len(authors), len(committers)
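
Putting the perceval pieces together, the flow is: fetch the commit log since the cutoff, then split the actors into authors-only (contributors) and committers (collaborators). A rough sketch, with the URL and clone path as placeholders and field names following perceval's Git item layout:

import datetime as dt
from perceval.backends.core.git import Git

def fetch_commits(repo_url, begin_date, gitpath):
    # perceval clones (or updates) the repo at gitpath and yields one dict per commit
    repo = Git(uri=repo_url, gitpath=gitpath)
    return list(repo.fetch(from_date=begin_date))

def rosters(all_commits):
    authors = set()
    committers = set()
    for commit in all_commits:
        authors.add(commit['data']['Author'])     # "Name <email>" strings
        committers.add(commit['data']['Commit'])
    # anyone who has committed counts as a collaborator, not a contributor
    authors -= committers
    return len(authors), len(committers)

# usage (placeholder URL and path):
# commits = fetch_commits('https://github.com/jupyter/notebook', dt.datetime(2013, 11, 6), '/tmp/notebook')
# n_contributors, n_collaborators = rosters(commits)
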