"""Build an expanded per-project CSV plus roster/comment JSON side files.

Reads 'inst_all_packages_full_results.csv', resolves a VCS link for each row
through the project-local query modules, and writes the collected fields to
``test_csv_path`` together with a small run-metadata JSON file.
"""
import csv
import requests
import os
import datetime as dt
import wget
import json

import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs
import debian_queries as dqs

key = os.environ.get('KKEXKEY')

test_csv_path = "121123_expanded_data.csv"


def main():
    """Process every row of the input CSV and write the expanded outputs.

    For each input row this:

    * copies the project name (column 0) and the three underproduction
      values (columns 16-18) into the output record;
    * looks up the Debian VCS link and derives an upstream VCS link,
      skipping the row when either comes back as an empty string;
    * calls ``pt.main`` and skips the row when it returns an empty dict;
    * dumps the contributor/collaborator rosters to a per-project JSON file;
    * for GitHub upstreams, records the value returned by ``gha.main`` as
      ``milestone_count`` and dumps the result of ``ghs.main`` to a
      per-project JSON file; other upstreams get ``milestone_count`` 0.

    After the loop, run counters are written to
    '121123_metadata_expanded.json'.

    Raises:
        IndexError: if an input row has fewer than 19 columns.
        OSError: if any input or output file cannot be opened.
    """
    early_cutoff = dt.datetime(2008, 2, 8)
    meta_dict = {}
    index = 0
    successful_count = 0
    no_upstream = 0
    with open(test_csv_path, 'w', newline='') as output_file:
        keys = [
            "project_name",
            "underproduction_mean",
            "underproduction_low",
            "underproduction_high",
            "debian_vcs_link",
            "upstream_vcs_link",
            "age_of_project",
            "contributors",
            "collaborators",
            "milestone_count",
        ]
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        with open('inst_all_packages_full_results.csv', newline='') as csvfile:
            spamreader = csv.reader(csvfile)
            for row in spamreader:
                index += 1
                project_dict = {}
                project_dict["project_name"] = row[0]
                project_dict["underproduction_mean"] = row[16]
                project_dict["underproduction_low"] = row[17]
                project_dict["underproduction_high"] = row[18]
                project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"])
                # An empty string is treated as "no Debian VCS link"; skip the row.
                if project_dict["debian_vcs_link"] == "":
                    continue
                if "github" in project_dict["debian_vcs_link"]:
                    # The Debian link already points at GitHub: trim it to the repo root.
                    project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"])
                else:
                    project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
                if project_dict["upstream_vcs_link"] == "":
                    no_upstream += 1
                    continue
                perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
                # An empty dict is treated as "nothing collected"; skip the row.
                if perceval_data == {}:
                    continue
                project_dict["age_of_project"] = perceval_data['age_of_project']
                project_dict["contributors"] = perceval_data['contributors']
                project_dict["collaborators"] = perceval_data['collaborators']
                project_rosters = {
                    'contributors_list': perceval_data['contributors_list'],
                    'collaborators_list': perceval_data['collaborators_list'],
                }
                # Fixed: the original `with` statement was missing its trailing
                # colon, which made the whole file a syntax error.
                with open("/data/users/mgaughan/kkex_roster_data_121123/" + "rosters_" + project_dict["project_name"] + '.json', 'w') as roster_path:
                    json.dump(project_rosters, roster_path)
                if "github" in project_dict["upstream_vcs_link"]:
                    project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
                    with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
                        json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
                else:
                    project_dict['milestone_count'] = 0
                successful_count += 1
                # NOTE(review): presumably row 1 is the input file's header row,
                # which is why it is never written out - confirm. It still goes
                # through the lookups above and is included in the counters.
                if index > 1:
                    dict_writer.writerow(project_dict)
    # Guard against an empty input file, which would otherwise raise
    # ZeroDivisionError here; the rate is reported as 0 in that case.
    meta_dict['success_rate'] = successful_count / index if index else 0
    meta_dict['final_index'] = index
    meta_dict['total_success'] = successful_count
    meta_dict['no_upstream_info'] = no_upstream
    #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
    with open('121123_metadata_expanded.json', 'w') as data_path:
        json.dump(meta_dict, data_path)


def clean_gh_vcs_link(debian_vcs_link):
    """Return the link truncated to its first five '/'-separated segments.

    For a URL of the form 'https://github.com/<owner>/<repo>/...' the five
    segments are 'https:', '', 'github.com', '<owner>' and '<repo>', so
    everything after the repository name is dropped.

    >>> clean_gh_vcs_link("https://github.com/kilobyte/3270font/tree/debian/")
    'https://github.com/kilobyte/3270font'
    """
    url_array = debian_vcs_link.split("/")
    new_url = "/".join(url_array[:5])
    return new_url


if __name__ == "__main__":
    main()
    #clean_gh_vcs_link("https://github.com/kilobyte/3270font/tree/debian/")
    #debian_vcs_query("https://salsa.debian.org/debian/0xffff/")