# 24_deb_pkg_gov/main.py

import perceval
import os
import yaml
import datetime as dt
import json
#functions from other files
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs

# In total, the data will look like:
# - repository VCS url
#   - perceval object
#      list of all commits to the project
#      count of contributors and collaborators to the project
#      age of the project
#   - github api object
#       object of milestones from the project
#       count of milestones from the project
#   - github gsql object
#       - list of discussion comments from repo
#               list left blank if none

def main():
    """Collect data for every package manifest in the metadata directory.

    Lists the hard-coded metadata directory, prints the path of each regular
    file found there and passes it, together with the earliest date to
    examine, to ``get_everything``. Takes no arguments and returns nothing.
    """
    # we should discuss whether we're using the 93 day window that seems to be widely used or if we want a longer window
    early_cutoff = dt.datetime(2008, 2, 8)
    print("Earliest date examined: " + str(early_cutoff))
    directory = '../kaylea_dissertation/lifecycle/package_metadata/'
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # Only regular files can be manifests. Previously get_everything() was
        # called for every directory entry, so a sub-directory reached open()
        # and aborted the whole run.
        if not os.path.isfile(f):
            continue
        print(f)
        # For a quick trial run, break out of this loop after a few manifests.
        get_everything(f, early_cutoff)

def get_everything(manifest_path, early_cutoff):
    """Collect all data for one package manifest and write it to a JSON file.

    The manifest is parsed as YAML and its ``Upstream_VCS`` entry is passed to
    ``pt.main``, ``gha.main`` and ``ghs.main``. Their results are stored under
    the keys ``perceval_obj``, ``gha_obj`` and ``ghs_obj`` and dumped to
    ``<second-to-last URL segment>_<last URL segment>_result.json`` in the
    hard-coded output directory.

    Args:
        manifest_path: path of the YAML manifest file to read.
        early_cutoff: earliest date to examine; forwarded unchanged to the
            three collection modules.

    Returns:
        None. The function returns early, after printing a short message,
        when the manifest cannot be parsed, has no usable ``Upstream_VCS``
        entry, or the perceval result is empty.
    """
    with open(manifest_path, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as err:
            print(err)
            print("----------------------")
            return

    # below lines will probably need to be refactored as tasks expand
    try:
        vcs_path = config['Upstream_VCS']
    except (KeyError, TypeError):
        # KeyError: key missing; TypeError: manifest is empty or not a mapping.
        print('error with the keys, i guess')
        return
    print("------------------")

    try:
        repo_path = vcs_path[0]
    except (IndexError, KeyError, TypeError):
        # Empty or non-indexable Upstream_VCS value.
        print('vcs error')
        return

    repo_data = {}
    repo_data["perceval_obj"] = pt.main(vcs_path, early_cutoff)
    if len(repo_data["perceval_obj"]) == 0:
        print("PERCEVAL ERROR")
        return
    repo_data["gha_obj"] = gha.main(vcs_path, early_cutoff)
    repo_data['ghs_obj'] = ghs.main(vcs_path, early_cutoff)

    # The output file is named after the last two segments of the repo URL.
    repo_uri_list = repo_path.split('/')
    with open('/data/users/mgaughan/kkex_data_111023/' + repo_uri_list[-2]  + '_' + repo_uri_list[-1] + '_result.json', 'w') as data_path:
        json.dump(repo_data, data_path)
    print("----------------------")


def compute_new_mmt(contrib_count, collab_count):
    """Return the Yoshi 2 MMT (per van Meijel).

    Collaborators are weighted twice as heavily as contributors, and the
    weighted sum is divided by the plain head count. Raises
    ZeroDivisionError when both counts are zero.
    """
    weighted_members = contrib_count + collab_count * 2
    total_members = contrib_count + collab_count
    return weighted_members / total_members

def compute_old_mmt(contrib_count, collab_count):
    """Return the Yoshi 1 MMT (per Tamburri).

    This is the share of contributors among all members (contributors plus
    collaborators). Raises ZeroDivisionError when both counts are zero.
    """
    total_members = contrib_count + collab_count
    return contrib_count / total_members

def compute_formality_score(mmt, milestones, lifetime):
    """Return the formality score: ``mmt`` divided by milestones per unit of lifetime.

    Raises ZeroDivisionError when ``lifetime`` is zero or when the milestone
    rate works out to zero.
    """
    milestone_rate = milestones / lifetime
    return mmt / milestone_rate

# Run the collection only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


# NOTE: everything between the triple quotes below is a bare string literal, so
# none of it executes. It holds a disabled formality-score computation that
# calls the compute_* helpers and refers to largest_object / repo_path, which
# are not defined at module level.
'''
if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0:
    #del largest_object[repo_path]
    #return
    #this is to ensure that projects which don't use milestones are counted
    largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1
largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
#print('New MMT: ' + str(largest_object[repo_path]['new_mmt']))
largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators'])
#print('Old MMT: ' + str(largest_object[repo_path]['old_mmt']))
#new mmt formality score
largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project'])
print(largest_object[repo_path]['new_formality'])
'''