"""Walk a directory of package manifests and dump per-repository data to JSON."""

import perceval
import os
import yaml
import datetime as dt
import json

# functions from other files
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs

# In total, the data will look like:
#   - repository VCS url
#   - perceval object
#       list of all commits to the project
#       count of contributors and collaborators to the project
#       age of the project
#   - github api object
#       object of milestones from the project
#       count of milestones from the project
#   - github gsql object
#       - list of discussion comments from repo
#       list left blank if none

# Directory holding one YAML manifest per package.
MANIFEST_DIR = '../kaylea_dissertation/lifecycle/package_metadata/'
# Directory receiving both the per-repository and the combined JSON results.
OUTPUT_DIR = '/data/users/mgaughan/kkex_data_110723/'
# Debugging cap on the number of manifests processed per run.
# Set to None to process every manifest in MANIFEST_DIR.
MAX_MANIFESTS = 5


def main():
    """Process the manifests in MANIFEST_DIR and write the combined result.

    Each regular file in MANIFEST_DIR is handed to ``get_everything``, which
    fills a shared dict keyed by repository URL. Processing stops once
    MAX_MANIFESTS files have been handled (unless the cap is None). The
    combined dict is written to ``result.json`` in OUTPUT_DIR.
    """
    # we should discuss whether we're using the 93 day window that seems to be
    # widely used or if we want a longer window
    early_cutoff = dt.datetime(2013, 11, 6)
    print(f"Earliest date examined: {early_cutoff}")
    largest_object = {}
    # manifest = '../kaylea_dissertation/lifecycle/package_metadata/jupyter-notebook_manifest.yaml'
    processed = 0
    for filename in os.listdir(MANIFEST_DIR):
        manifest_path = os.path.join(MANIFEST_DIR, filename)
        # Skip sub-directories and anything else that is not a regular file.
        if not os.path.isfile(manifest_path):
            continue
        print(manifest_path)
        get_everything(manifest_path, largest_object, early_cutoff)
        processed += 1
        if MAX_MANIFESTS is not None and processed >= MAX_MANIFESTS:
            break
    print(largest_object.keys())
    print(len(largest_object))
    # Disabled: per-repository formality report.
    # for repo in largest_object:
    #     print(largest_object[repo]['new_formality'])
    with open(os.path.join(OUTPUT_DIR, 'result.json'), 'w') as results_path:
        json.dump(largest_object, results_path)


def get_everything(manifest_path, largest_object, early_cutoff):
    """Gather the data for the repository named in one manifest.

    The manifest is parsed as YAML; its ``Upstream_VCS`` entry is passed to
    the perceval, github api and github gsql helpers, and its first element
    is used as the key under which results are stored in ``largest_object``.
    The per-repository result is also written to its own JSON file in
    OUTPUT_DIR, named from the last two ``/``-separated parts of that key.

    Args:
        manifest_path: Path of the YAML manifest to read.
        largest_object: Dict of results keyed by repository URL; mutated in
            place.
        early_cutoff: Earliest date examined, forwarded unchanged to each
            helper.

    Returns:
        None. A manifest that is not valid YAML is reported and skipped; a
        repository whose perceval result is empty is reported and removed
        from ``largest_object``.
    """
    with open(manifest_path, 'r') as stream:
        # Only the parse can raise a YAML error, so only the parse is guarded.
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            print(err)
            print("----------------------")
            return

    # below lines will probably need to be refactored as tasks expand
    vcs_path = config['Upstream_VCS']
    print("------------------")
    # print(vcs_path)
    repo_path = vcs_path[0]
    repo_data = {}
    # Registered up front so any existing entry for this key is replaced.
    largest_object[repo_path] = repo_data

    repo_data["perceval_obj"] = pt.main(vcs_path, early_cutoff)
    if len(repo_data["perceval_obj"]) == 0:
        print("PERCEVAL ERROR")
        del largest_object[repo_path]
        return

    repo_data["gha_obj"] = gha.main(vcs_path, early_cutoff)

    # Disabled: MMT and formality scoring.
    # if repo_data["gha_obj"]['milestone_count'] == 0:
    #     # del largest_object[repo_path]
    #     # return
    #     # this is to ensure that projects which don't use milestones are counted
    #     repo_data["gha_obj"]['milestone_count'] = 0.1
    # repo_data['new_mmt'] = compute_new_mmt(
    #     repo_data["perceval_obj"]['contributors'],
    #     repo_data["perceval_obj"]['collaborators'])
    # # print('New MMT: ' + str(repo_data['new_mmt']))
    # repo_data['old_mmt'] = compute_old_mmt(
    #     repo_data["perceval_obj"]['contributors'],
    #     repo_data["perceval_obj"]['collaborators'])
    # # print('Old MMT: ' + str(repo_data['old_mmt']))
    # # new mmt formality score
    # repo_data['new_formality'] = compute_formality_score(
    #     repo_data['new_mmt'],
    #     repo_data["gha_obj"]['milestone_count'],
    #     repo_data["perceval_obj"]['age_of_project'])
    # print(repo_data['new_formality'])

    # testing out beneath:
    repo_data['ghs_obj'] = ghs.main(vcs_path, early_cutoff)
    # print(ghs_obj["time_cleaned_comm"])

    repo_uri_list = repo_path.split('/')
    result_name = repo_uri_list[-2] + '_' + repo_uri_list[-1] + '_result.json'
    with open(os.path.join(OUTPUT_DIR, result_name), 'w') as data_path:
        json.dump(repo_data, data_path)
    print("----------------------")


# this is Yoshi 2 MMT per van Meijel
def compute_new_mmt(contrib_count, collab_count):
    """Return (contributors + 2 * collaborators) / (contributors + collaborators).

    Raises:
        ZeroDivisionError: If both counts are zero.
    """
    return (contrib_count + collab_count * 2) / (contrib_count + collab_count)


# this is Yoshi 1 mmt per Tamburri
def compute_old_mmt(contrib_count, collab_count):
    """Return contributors / (contributors + collaborators).

    Raises:
        ZeroDivisionError: If both counts are zero.
    """
    return (contrib_count) / (contrib_count + collab_count)


# formality score
def compute_formality_score(mmt, milestones, lifetime):
    """Return ``mmt`` divided by the milestone rate (milestones / lifetime).

    Raises:
        ZeroDivisionError: If ``milestones`` or ``lifetime`` is zero.
    """
    return mmt / (milestones / lifetime)


if __name__ == "__main__":
    main()