diff --git a/expanded_data_collection.py b/expanded_data_collection.py
index a3d23be..9cc255e 100644
--- a/expanded_data_collection.py
+++ b/expanded_data_collection.py
@@ -3,7 +3,7 @@ import requests
 import os
 import datetime as dt
 import wget
-
+import json
 import perceval_tasks as pt
 import github_api_req as gha
 import gh_gsql_req as ghs
@@ -11,47 +11,60 @@ import debian_queries as dqs
 
 key = os.environ.get('KKEXKEY')
 
-test_csv_path = "120523_data_test.csv"
-
+test_csv_path = "120523_expanded_data_test.csv"
 
 def main():
     early_cutoff = dt.datetime(2008,2, 8)
-    with open('inst_all_packages_full_results.csv', newline='') as csvfile:
-        array_of_projects =[]
-        spamreader = csv.reader(csvfile)
-        index = 0
-        successful_count = 0
-        for row in spamreader:
-            index += 1
-            if index > 20:
-                break
-            project_dict = {}
-            project_dict["project_name"] = row[0]
-            project_dict["underproduction_mean"] = row[16]
-            project_dict["underproduction_low"] = row[17]
-            project_dict["underproduction_high"] = row[18]
-            project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"])
-            if project_dict["debian_vcs_link"] == "":
-                continue
-            if "github" in project_dict["debian_vcs_link"]:
-                project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"])
-            else:
-                project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
-            if project_dict["upstream_vcs_link"] == "":
-                continue
-            perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
-            if perceval_data == {}:
-                continue
-            project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
-            successful_count += 1
-            if index > 1:
-                array_of_projects.append(project_dict)
-        print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
-    keys = array_of_projects[0].keys()
+    meta_dict = {}
     with open(test_csv_path, 'w', newline='') as output_file:
+        keys = ["project_name", "underproduction_mean", "underproduction_low", "underproduction_high", "debian_vcs_link", "upstream_vcs_link", "age_of_project", "contributors", "collaborators", "milestone_count"]
         dict_writer = csv.DictWriter(output_file, keys)
         dict_writer.writeheader()
-        dict_writer.writerows(array_of_projects)
+        with open('inst_all_packages_full_results.csv', newline='') as csvfile:
+            array_of_projects =[]
+            spamreader = csv.reader(csvfile)
+            index = 0
+            successful_count = 0
+            no_upstream = 0
+            for row in spamreader:
+                index += 1
+                if index > 10:
+                    break
+                project_dict = {}
+                project_dict["project_name"] = row[0]
+                project_dict["underproduction_mean"] = row[16]
+                project_dict["underproduction_low"] = row[17]
+                project_dict["underproduction_high"] = row[18]
+                project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"])
+                if project_dict["debian_vcs_link"] == "":
+                    continue
+                if "github" in project_dict["debian_vcs_link"]:
+                    project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"])
+                else:
+                    project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
+                if project_dict["upstream_vcs_link"] == "":
+                    no_upstream += 1
+                    continue
+                perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
+                if perceval_data == {}:
+                    continue
+                project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
+                if "github" in project_dict["upstream_vcs_link"]:
+                    project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
+                    with open('/data/users/mgaughan/kkex_comment_data_120523/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
+                        json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
+                else:
+                    project_dict['milestone_count'] = 0
+                successful_count += 1
+                if index > 1:
+                    dict_writer.writerow(project_dict)
+            meta_dict['success_rate'] = successful_count/index
+            meta_dict['final_index'] = index
+            meta_dict['total_success'] = successful_count
+            meta_dict['no_upstream_info'] = no_upstream
+            #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
+    with open('120523_metadata_expanded.json', 'w') as data_path:
+        json.dump(meta_dict, data_path)
 
 
 def clean_gh_vcs_link(debian_vcs_link):
diff --git a/gh_gsql_req.py b/gh_gsql_req.py
index b708a1b..faa52ed 100644
--- a/gh_gsql_req.py
+++ b/gh_gsql_req.py
@@ -7,12 +7,12 @@ key = os.environ.get('KKEXKEY')
 
 def main(vcs, early_cutoff):
     gsql_dict = {}
-    vcs_list = vcs[0].split('/')
+    vcs_list = vcs.split('/')
     repo_name = '"' + vcs_list[-1] + '"'
     repo_owner = '"' + vcs_list[-2] + '"'
     gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
     gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
-    return gsql_dict
+    return gsql_dict['original_returned_content']
 
 def get_discussion_gql(repo_owner, repo_name):
     url = "https://api.github.com/graphql"
diff --git a/github_api_req.py b/github_api_req.py
index 7c8737a..ea3bb8d 100644
--- a/github_api_req.py
+++ b/github_api_req.py
@@ -6,14 +6,14 @@ import os
 key = os.environ.get('KKEXKEY')
 
 def main(vcs, begin_date):
-    repo_uri=vcs[0]
+    repo_uri=vcs
     gha_info = {}
     #this is the entire list of Github 'milestones' grabbed from the API
     gha_info['milestones'] = get_milestone_information(repo_uri)
     #this is the count of milestones that occur after the cutoff date
     gha_info['milestone_count'] = parse_milestones(gha_info['milestones'], begin_date)
     #split_actors(repo_uri, actors_list)
-    return gha_info
+    return gha_info['milestone_count']
 
 
 #this simple API call has been working for now but may need to be updated as more information is desired