From ab763bcc13d228634df8ab633acf971652e251dd Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 5 Dec 2023 18:39:10 -0600 Subject: [PATCH] initial steps in expanded data collection --- debian_queries.py | 68 +++++++++++++++++++++++++++++++++++++ expanded_data_collection.py | 53 ++++++++++------------------- perceval_tasks.py | 4 +++ 3 files changed, 90 insertions(+), 35 deletions(-) create mode 100644 debian_queries.py diff --git a/debian_queries.py b/debian_queries.py new file mode 100644 index 0000000..dbb3533 --- /dev/null +++ b/debian_queries.py @@ -0,0 +1,68 @@ +import os +import requests +import wget +from perceval.backends.core.git import Git +from git import Repo +import shutil + + +def debian_vcs_query(debian_vcs_link): + upstream_repo_link = "" + project_name = debian_vcs_link.split("/")[-1] + output_directory = "/data/users/mgaughan/tmp1/" + project_name + print(output_directory) + #url = debian_vcs_link + '/-/blob/master/debian/upstream/metadata' + try: + Repo.clone_from(debian_vcs_link, output_directory) + except: + print("error cloning") + return upstream_repo_link + try: + upstream_metadata = open(output_directory + "/debian/upstream/metadata", "r").read().split("\n") + upstream_repo_array = upstream_metadata[3].split(":")[-2:] + upstream_repo_link = ":".join(upstream_repo_array) + print(upstream_repo_link) + except: + print("no file!") + upstream_repo_link = "" + #filename = wget.download(url, out=output_directory) + #print(type(filename)) + shutil.rmtree(output_directory, ignore_errors=True) + return upstream_repo_link + + +def debian_query(package_name): + headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'} + first_api_url = "https://sources.debian.org/api/src/" + package_name + try: + first_response = requests.get(url = first_api_url, headers=headers) + first_response_dict = first_response.json() + except: + print('error with the first debian request') + first_response_dict = {} + return ("") + if first_response_dict 
== {'error': 404}: + print('not found in debian system') + return ("") + #print(first_response_dict) + most_recent_package_version = first_response_dict['versions'][0]['version'] + #print(first_response_dict['versions'][0]['version']) + second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/" + try: + second_response = requests.get(url = second_api_url, headers=headers) + second_response_dict = second_response.json() + except: + print('error with the second debian request') + second_response_dict = {} + return ("") + if 'vcs_browser' not in second_response_dict['pkg_infos'].keys(): + print('no vcs link') + return ("") + print(second_response_dict) + print(second_response_dict['pkg_infos']['vcs_browser']) + return second_response_dict['pkg_infos']['vcs_browser'] + + +if __name__ == "__main__": + #main() + debian_vcs_query("https://salsa.debian.org/debian/0xffff") \ No newline at end of file diff --git a/expanded_data_collection.py b/expanded_data_collection.py index 20b2137..a3d23be 100644 --- a/expanded_data_collection.py +++ b/expanded_data_collection.py @@ -2,10 +2,12 @@ import csv import requests import os import datetime as dt +import wget import perceval_tasks as pt import github_api_req as gha import gh_gsql_req as ghs +import debian_queries as dqs key = os.environ.get('KKEXKEY') @@ -21,17 +23,23 @@ def main(): successful_count = 0 for row in spamreader: index += 1 - if index > 10: + if index > 20: break project_dict = {} project_dict["project_name"] = row[0] project_dict["underproduction_mean"] = row[16] project_dict["underproduction_low"] = row[17] project_dict["underproduction_high"] = row[18] - project_dict["vcs_link"] = debian_query(project_dict["project_name"]) - if project_dict["vcs_link"] == "": + project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"]) + if project_dict["debian_vcs_link"] == "": continue - perceval_data = pt.main(project_dict["vcs_link"], 
early_cutoff) + if "github" in project_dict["debian_vcs_link"]: + project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"]) + else: + project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"]) + if project_dict["upstream_vcs_link"] == "": + continue + perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff) if perceval_data == {}: continue project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'] @@ -46,37 +54,12 @@ def main(): dict_writer.writerows(array_of_projects) -def debian_query(package_name): - headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'} - first_api_url = "https://sources.debian.org/api/src/" + package_name - try: - first_response = requests.get(url = first_api_url, headers=headers) - first_response_dict = first_response.json() - except: - print('error with the first debian request') - first_response_dict = {} - return ("") - if first_response_dict == {'error': 404}: - print('not found in debian system') - return ("") - #print(first_response_dict) - most_recent_package_version = first_response_dict['versions'][0]['version'] - #print(first_response_dict['versions'][0]['version']) - second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/" - try: - second_response = requests.get(url = second_api_url, headers=headers) - second_response_dict = second_response.json() - except: - print('error with the second debian request') - second_response_dict = {} - return ("") - if 'vcs_browser' not in second_response_dict['pkg_infos'].keys(): - print('no vcs link') - return ("") - print(second_response_dict) - print(second_response_dict['pkg_infos']['vcs_browser']) - return second_response_dict['pkg_infos']['vcs_browser'] +def clean_gh_vcs_link(debian_vcs_link): + url_array = 
debian_vcs_link.split("/") + new_url = "/".join(url_array[:5]) + return new_url if __name__ == "__main__": main() - #debian_query("zurl") + #clean_gh_vcs_link("https://github.com/kilobyte/3270font/tree/debian/") + #debian_vcs_query("https://salsa.debian.org/debian/0xffff/") diff --git a/perceval_tasks.py b/perceval_tasks.py index 821740e..7f08ed4 100644 --- a/perceval_tasks.py +++ b/perceval_tasks.py @@ -13,6 +13,7 @@ def main(vcs_path, begin_date): perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits']) perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits']) del perceval_info['list_of_commits'] + print(perceval_info) return perceval_info else: print('error, no commits found?') @@ -22,6 +23,7 @@ def main(vcs_path, begin_date): # this is the primary function for getting the list of commits from perceval def get_perceval_log(vcs_path, begin_date): + vcs_path = vcs_path.strip() print(vcs_path) try: repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path.split('/')[-1]) @@ -63,3 +65,5 @@ def get_all_actors(all_commits): authors.remove(committer) return len(authors), len(committers) +if __name__ == "__main__": + main(" https://github.com/pali/0xFFFF".strip(),dt.datetime(2008,2, 8)) \ No newline at end of file