# 24_deb_pkg_gov/expanded_data_collection.py
#
# 83 lines
# 3.2 KiB
# Python
# Raw Normal View History
#
# 2023-12-05 18:36:07 +00:00
import csv
import requests
import os
import datetime as dt
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs
# Credential read from the KKEXKEY environment variable; None when it is unset.
key = os.getenv('KKEXKEY')

# Destination CSV written by main().
test_csv_path = "120523_data_test.csv"
def main():
    """Collect VCS-derived metrics for a sample of packages and write a CSV.

    Reads ``inst_all_packages_full_results.csv``, resolves each package's VCS
    browser link with ``debian_query``, hands that link and a cutoff date to
    ``pt.main``, and writes every successful row to ``test_csv_path``.

    Only the first 10 input rows are examined. A row is dropped when no VCS
    link is found or when ``pt.main`` returns an empty dict. When no row
    succeeds, a message is printed and no output file is written (previously
    this case raised ``IndexError``).
    """
    max_rows = 10  # this run only samples the head of the input file
    early_cutoff = dt.datetime(2008, 2, 8)
    array_of_projects = []
    successful_count = 0

    with open('inst_all_packages_full_results.csv', newline='') as csvfile:
        for index, row in enumerate(csv.reader(csvfile), start=1):
            if index > max_rows:
                break
            project_dict = {
                "project_name": row[0],
                "underproduction_mean": row[16],
                "underproduction_low": row[17],
                "underproduction_high": row[18],
            }
            project_dict["vcs_link"] = debian_query(project_dict["project_name"])
            if project_dict["vcs_link"] == "":
                continue
            perceval_data = pt.main(project_dict["vcs_link"], early_cutoff)
            if perceval_data == {}:
                continue
            for field in ('age_of_project', 'contributors', 'collaborators'):
                project_dict[field] = perceval_data[field]
            successful_count += 1
            # Row 1 is never stored -- presumably it is the CSV header row;
            # verify against the input file.
            if index > 1:
                array_of_projects.append(project_dict)
            # NOTE(review): the file's original indentation was lost; this
            # progress line is assumed to run once per successful row.
            print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))

    # Without at least one collected row there is no header to derive.
    if not array_of_projects:
        print("no projects collected; nothing written to " + test_csv_path)
        return

    keys = array_of_projects[0].keys()
    with open(test_csv_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(array_of_projects)
def debian_query(package_name, timeout=30):
    """Return the VCS browser URL that sources.debian.org lists for a package.

    Two requests are made: one to ``/api/src/<package>`` to pick a version,
    and one to ``/api/info/package/<package>/<version>/`` to read
    ``pkg_infos['vcs_browser']``.

    Args:
        package_name: Source package name appended to the API URLs.
        timeout: Seconds passed to ``requests.get`` for each request, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The ``vcs_browser`` value, or ``""`` when a request fails, the
        package is not found, or the response lacks the expected fields.
    """
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
    first_api_url = "https://sources.debian.org/api/src/" + package_name
    try:
        first_response = requests.get(url=first_api_url, headers=headers, timeout=timeout)
        first_response_dict = first_response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        print('error with the first debian request')
        return ""
    if first_response_dict == {'error': 404}:
        print('not found in debian system')
        return ""
    try:
        # NOTE(review): assumes the first listed version is the most recent
        # one -- confirm against the API's ordering.
        most_recent_package_version = first_response_dict['versions'][0]['version']
    except (KeyError, IndexError, TypeError):
        print('no version listed in the first debian response')
        return ""

    second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/"
    try:
        second_response = requests.get(url=second_api_url, headers=headers, timeout=timeout)
        second_response_dict = second_response.json()
    except (requests.exceptions.RequestException, ValueError):
        print('error with the second debian request')
        return ""

    pkg_infos = None
    if isinstance(second_response_dict, dict):
        pkg_infos = second_response_dict.get('pkg_infos')
    if not isinstance(pkg_infos, dict) or 'vcs_browser' not in pkg_infos:
        print('no vcs link')
        return ""
    print(second_response_dict)
    print(pkg_infos['vcs_browser'])
    return pkg_infos['vcs_browser']
# Run the collection only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # Disabled single-package call to debian_query, kept for manual runs:
    #debian_query("zurl")