From d00a6169a2962c130cba88be61c398e7f21a621c Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Tue, 5 Dec 2023 12:36:07 -0600
Subject: [PATCH] initial work to expand data set

---
Review notes (this section is ignored when the patch is applied):
- The stored patch had lost its line breaks and indentation; both were
  reconstructed here assuming 4-space indentation.
- expanded_data_collection.py was revised during review (request timeouts,
  narrowed exception handling, guards for empty results and short rows), so
  the blob id on its index line predates these revisions.

 expanded_data_collection.py | 112 +++++++++++++++++++++++++++++++++++++
 perceval_tasks.py           |   4 +-
 2 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 expanded_data_collection.py

diff --git a/expanded_data_collection.py b/expanded_data_collection.py
new file mode 100644
index 0000000..20b2137
--- /dev/null
+++ b/expanded_data_collection.py
@@ -0,0 +1,112 @@
+import csv
+import requests
+import os
+import datetime as dt
+
+import perceval_tasks as pt
+import github_api_req as gha
+import gh_gsql_req as ghs
+
+
+key = os.environ.get('KKEXKEY')
+test_csv_path = "120523_data_test.csv"
+# Only the first MAX_ROWS rows of the input file are processed.
+MAX_ROWS = 10
+# Seconds to wait on each Debian API request, so one stalled call cannot hang the run.
+REQUEST_TIMEOUT = 30
+
+
+def main():
+    """Build a test CSV of VCS and perceval data for the first input rows.
+
+    Reads up to MAX_ROWS rows of 'inst_all_packages_full_results.csv',
+    resolves each package's VCS link with debian_query(), collects project
+    age, contributors and collaborators through perceval_tasks, and writes
+    the rows that succeeded to test_csv_path.
+    """
+    early_cutoff = dt.datetime(2008, 2, 8)
+    array_of_projects = []
+    rows_attempted = 0
+    successful_count = 0
+    with open('inst_all_packages_full_results.csv', newline='') as csvfile:
+        spamreader = csv.reader(csvfile)
+        for row in spamreader:
+            if rows_attempted >= MAX_ROWS:
+                break
+            rows_attempted += 1
+            if len(row) < 19:
+                # the underproduction columns (16-18) are missing
+                continue
+            project_dict = {}
+            project_dict["project_name"] = row[0]
+            project_dict["underproduction_mean"] = row[16]
+            project_dict["underproduction_low"] = row[17]
+            project_dict["underproduction_high"] = row[18]
+            project_dict["vcs_link"] = debian_query(project_dict["project_name"])
+            if project_dict["vcs_link"] == "":
+                continue
+            perceval_data = pt.main(project_dict["vcs_link"], early_cutoff)
+            if not perceval_data:
+                continue
+            project_dict['age_of_project'] = perceval_data['age_of_project']
+            project_dict['contributors'] = perceval_data['contributors']
+            project_dict['collaborators'] = perceval_data['collaborators']
+            successful_count += 1
+            # the first row is counted but never stored (presumably a header row)
+            if rows_attempted > 1:
+                array_of_projects.append(project_dict)
+    if rows_attempted:
+        print("success rate: " + str(successful_count/rows_attempted) + "; total success count: " + str(successful_count))
+    if not array_of_projects:
+        # nothing succeeded, so there is no first row to take the header from
+        print('no projects collected; nothing written')
+        return
+    keys = array_of_projects[0].keys()
+    with open(test_csv_path, 'w', newline='') as output_file:
+        dict_writer = csv.DictWriter(output_file, keys)
+        dict_writer.writeheader()
+        dict_writer.writerows(array_of_projects)
+
+
+def debian_query(package_name):
+    """Look up the VCS browser link of a package on sources.debian.org.
+
+    Returns the link, or "" when a request fails, the package is not found,
+    no version is listed, or the package info carries no 'vcs_browser' entry.
+    """
+    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
+    first_api_url = "https://sources.debian.org/api/src/" + package_name
+    try:
+        first_response = requests.get(url=first_api_url, headers=headers, timeout=REQUEST_TIMEOUT)
+        first_response_dict = first_response.json()
+    except (requests.RequestException, ValueError):
+        # network failure, timeout, or a body that is not valid JSON
+        print('error with the first debian request')
+        return ""
+    if first_response_dict == {'error': 404}:
+        print('not found in debian system')
+        return ""
+    versions = first_response_dict.get('versions')
+    if not versions:
+        print('no versions listed in debian system')
+        return ""
+    # the first listed version is the one queried for package info
+    most_recent_package_version = versions[0]['version']
+    second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/"
+    try:
+        second_response = requests.get(url=second_api_url, headers=headers, timeout=REQUEST_TIMEOUT)
+        second_response_dict = second_response.json()
+    except (requests.RequestException, ValueError):
+        print('error with the second debian request')
+        return ""
+    pkg_infos = second_response_dict.get('pkg_infos', {})
+    if 'vcs_browser' not in pkg_infos:
+        print('no vcs link')
+        return ""
+    print(second_response_dict)
+    print(pkg_infos['vcs_browser'])
+    return pkg_infos['vcs_browser']
+
+if __name__ == "__main__":
+    main()
+    #debian_query("zurl")
diff --git a/perceval_tasks.py b/perceval_tasks.py
index d398a36..821740e 100644
--- a/perceval_tasks.py
+++ b/perceval_tasks.py
@@ -24,9 +24,9 @@ def main(vcs_path, begin_date):
 def get_perceval_log(vcs_path, begin_date):
     print(vcs_path)
     try:
-        repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path[0].split('/')[-1])
+        repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path.split('/')[-1])
         #gitpath=repo_dir
-        repo = Git(uri=vcs_path[0], gitpath=repo_dir)
+        repo = Git(uri=vcs_path, gitpath=repo_dir)
         # this is a temporary date_from, will need to be more inclusive in the future
         fetched_commits = repo.fetch(from_date=begin_date)
         return list(fetched_commits)