24_deb_pkg_gov/main.py

106 lines
3.7 KiB
Python
Raw Permalink Normal View History

2023-10-19 16:46:00 +00:00
import perceval
2023-10-23 20:40:24 +00:00
import os
import yaml
2023-10-26 15:57:56 +00:00
import datetime as dt
import json
#functions from other files
2023-10-23 20:40:24 +00:00
import perceval_tasks as pt
import github_api_req as gha
2023-10-26 15:57:56 +00:00
import gh_gsql_req as ghs
2023-10-23 20:40:24 +00:00
2023-11-06 22:39:07 +00:00
# In total, the data collected for each project will look like:
# - repository VCS url
#     - perceval object
#         - list of all commits to the project
#         - count of contributors and collaborators to the project
#         - age of the project
#     - github api object
#         - object of milestones from the project
#         - count of milestones from the project
#     - github gsql object
#         - list of discussion comments from repo
#           (list left blank if none)
2023-10-23 20:40:24 +00:00
def main(directory='../kaylea_dissertation/lifecycle/package_metadata/', max_manifests=None):
    """Run the data collection for every manifest file found in ``directory``.

    Each regular file in ``directory`` is handed to ``get_everything``
    together with the earliest date of interest; sub-directories are skipped.

    Args:
        directory: Folder holding the package manifest files. Defaults to the
            location that used to be hard-coded here.
        max_manifests: Optional cap on how many manifest files are processed,
            handy for short trial runs. ``None`` (the default) processes
            every file.
    """
    # Open question: should we use the 93 day window that seems to be widely
    # used, or do we want a longer window?
    early_cutoff = dt.datetime(2008, 2, 8)
    print("Earliest date examined: " + str(early_cutoff))

    processed = 0
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if max_manifests is not None and processed >= max_manifests:
            break
        manifest_path = os.path.join(directory, filename)
        # Only plain files are treated as manifests.
        if not os.path.isfile(manifest_path):
            continue
        print(manifest_path)
        get_everything(manifest_path, early_cutoff)
        processed += 1
2023-11-05 17:11:03 +00:00
2023-11-07 23:25:22 +00:00
def get_everything(manifest_path, early_cutoff, output_dir='/data/users/mgaughan/kkex_data_111023/'):
    """Collect all data for the repository named in one manifest and save it as JSON.

    The manifest is parsed as YAML and its ``Upstream_VCS`` entry is read.
    That entry is passed as-is to the perceval, GitHub API and GitHub gsql
    collectors (``pt.main``, ``gha.main``, ``ghs.main``); its first element
    is used to name the output file. Results are written to
    ``<output_dir>/<second-to-last URL part>_<last URL part>_result.json``.

    Any problem with the manifest (unparsable YAML, missing ``Upstream_VCS``
    key, empty VCS entry) or an empty perceval result is reported on stdout
    and the manifest is skipped; nothing is written in that case.

    Args:
        manifest_path: Path of the YAML manifest to process.
        early_cutoff: Earliest date of interest, forwarded to each collector.
        output_dir: Directory the result file is written to. Defaults to the
            location that used to be hard-coded here.

    Returns:
        None.
    """
    # Parse first and close the manifest right away, so the file is not held
    # open while the (potentially long) collection calls run.
    with open(manifest_path, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            print(err)
            # NOTE(review): the indentation of this separator was lost in the
            # original; it is kept with the YAML-error path - confirm.
            print("----------------------")
            return

    # TypeError covers manifests that are empty or not a mapping.
    try:
        vcs_path = config['Upstream_VCS']
    except (KeyError, TypeError):
        print('error with the keys, i guess')
        return
    print("------------------")

    try:
        repo_path = vcs_path[0]
    except (IndexError, KeyError, TypeError):
        print('vcs error')
        return

    perceval_obj = pt.main(vcs_path, early_cutoff)
    if not perceval_obj:
        print("PERCEVAL ERROR")
        return

    # The remaining collectors only run once perceval has produced data.
    result = {
        "perceval_obj": perceval_obj,
        "gha_obj": gha.main(vcs_path, early_cutoff),
        'ghs_obj': ghs.main(vcs_path, early_cutoff),
    }

    repo_uri_list = repo_path.split('/')
    result_name = repo_uri_list[-2] + '_' + repo_uri_list[-1] + '_result.json'
    with open(os.path.join(output_dir, result_name), 'w') as data_path:
        json.dump(result, data_path)
2023-10-23 20:40:24 +00:00
2023-10-24 01:11:51 +00:00
#this is Yoshi 2 MMT per van Meijel
def compute_new_mmt(contrib_count, collab_count):
    """Return the Yoshi 2 MMT value for the given head counts.

    Computed as ``(contrib_count + 2 * collab_count) / (contrib_count + collab_count)``.

    Args:
        contrib_count: Number of contributors.
        collab_count: Number of collaborators.

    Raises:
        ZeroDivisionError: If ``contrib_count + collab_count`` is zero.
    """
    try:
        return (contrib_count + collab_count * 2) / (contrib_count + collab_count)
    except ZeroDivisionError as err:
        # Same exception type as before, but say which inputs caused it.
        raise ZeroDivisionError(
            "compute_new_mmt: contrib_count + collab_count is zero "
            f"(contrib_count={contrib_count!r}, collab_count={collab_count!r})"
        ) from err
#this is Yoshi 1 mmt per Tamburri
def compute_old_mmt(contrib_count, collab_count):
    """Return the Yoshi 1 MMT value for the given head counts.

    Computed as ``contrib_count / (contrib_count + collab_count)``.

    Args:
        contrib_count: Number of contributors.
        collab_count: Number of collaborators.

    Raises:
        ZeroDivisionError: If ``contrib_count + collab_count`` is zero.
    """
    try:
        return (contrib_count) / (contrib_count + collab_count)
    except ZeroDivisionError as err:
        # Same exception type as before, but say which inputs caused it.
        raise ZeroDivisionError(
            "compute_old_mmt: contrib_count + collab_count is zero "
            f"(contrib_count={contrib_count!r}, collab_count={collab_count!r})"
        ) from err
#formality score
def compute_formality_score(mmt, milestones, lifetime):
    """Return the formality score: ``mmt`` divided by milestones per unit of lifetime.

    Computed as ``mmt / (milestones / lifetime)``.

    Args:
        mmt: MMT value, e.g. from ``compute_new_mmt`` or ``compute_old_mmt``.
        milestones: Number of milestones of the project.
        lifetime: Age of the project; the unit is whatever the caller uses.

    Raises:
        ZeroDivisionError: If ``milestones`` or ``lifetime`` is zero.
    """
    try:
        return mmt / (milestones / lifetime)
    except ZeroDivisionError as err:
        # Same exception type as before, but say which inputs caused it.
        raise ZeroDivisionError(
            "compute_formality_score: milestones and lifetime must both be non-zero "
            f"(milestones={milestones!r}, lifetime={lifetime!r})"
        ) from err
2023-10-23 20:40:24 +00:00
# Start the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()