24_deb_pkg_gov/expanded_data_collection.py

77 lines
3.6 KiB
Python
Raw Normal View History

2023-12-05 18:36:07 +00:00
import csv
import requests
import os
import datetime as dt
import wget
import json
2023-12-05 18:36:07 +00:00
import perceval_tasks as pt
import github_api_req as gha
import gh_gsql_req as ghs
import debian_queries as dqs
2023-12-05 18:36:07 +00:00
# Value of the KKEXKEY environment variable (None when the variable is unset).
# NOTE(review): not referenced anywhere else in this file -- confirm it is still needed.
key = os.environ.get('KKEXKEY')
2023-12-11 20:34:13 +00:00
# Path of the CSV file that main() opens for writing the expanded per-project rows.
test_csv_path = "121123_expanded_data.csv"
2023-12-05 18:36:07 +00:00
def main():
    """Collect expanded per-project data into a CSV file plus a run summary.

    Reads ``inst_all_packages_full_results.csv`` and, for every data row:

    1. resolves the Debian VCS link with ``dqs.debian_query``;
    2. derives the upstream VCS link (trimmed directly when the Debian link
       contains "github", otherwise looked up with ``dqs.debian_vcs_query``);
    3. gathers ``age_of_project``, ``contributors`` and ``collaborators``
       from ``pt.main``;
    4. for upstream links containing "github", fetches the milestone count
       with ``gha.main`` and dumps the result of ``ghs.main`` to a
       per-project JSON file; other projects get a milestone count of 0.

    Every project that passes all steps becomes one row in ``test_csv_path``.
    Run counters (success rate, final row index, success total, number of
    projects without upstream info) are written to
    ``121123_metadata_expanded.json``.

    The first input row is treated as a header: it is counted in the row
    index but is never queried, counted as a success, or written.
    """
    early_cutoff = dt.datetime(2008, 2, 8)
    keys = [
        "project_name",
        "underproduction_mean",
        "underproduction_low",
        "underproduction_high",
        "debian_vcs_link",
        "upstream_vcs_link",
        "age_of_project",
        "contributors",
        "collaborators",
        "milestone_count",
    ]
    index = 0
    successful_count = 0
    no_upstream = 0

    with open(test_csv_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        with open('inst_all_packages_full_results.csv', newline='') as csvfile:
            for row in csv.reader(csvfile):
                index += 1
                if index == 1:
                    # Header row: skip it before doing any remote lookups so
                    # it can neither count as a success nor produce output.
                    continue

                project_dict = {
                    "project_name": row[0],
                    "underproduction_mean": row[16],
                    "underproduction_low": row[17],
                    "underproduction_high": row[18],
                }

                project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"])
                if project_dict["debian_vcs_link"] == "":
                    continue

                if "github" in project_dict["debian_vcs_link"]:
                    # The Debian packaging repo is itself on GitHub; trim the
                    # link down to its repository root.
                    project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"])
                else:
                    project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
                if project_dict["upstream_vcs_link"] == "":
                    no_upstream += 1
                    continue

                perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
                if perceval_data == {}:
                    continue
                project_dict["age_of_project"] = perceval_data['age_of_project']
                project_dict["contributors"] = perceval_data['contributors']
                project_dict["collaborators"] = perceval_data['collaborators']

                if "github" in project_dict["upstream_vcs_link"]:
                    project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
                    comment_path = (
                        '/data/users/mgaughan/kkex_comment_data_121123/'
                        + 'gh_comments_' + project_dict["project_name"] + '.json'
                    )
                    with open(comment_path, 'w') as data_path:
                        json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
                else:
                    project_dict['milestone_count'] = 0

                successful_count += 1
                dict_writer.writerow(project_dict)

    meta_dict = {
        # Guard against an empty input file, where no row was ever counted.
        'success_rate': successful_count / index if index else 0,
        'final_index': index,
        'total_success': successful_count,
        'no_upstream_info': no_upstream,
    }
    with open('121123_metadata_expanded.json', 'w') as data_path:
        json.dump(meta_dict, data_path)
2023-12-05 18:36:07 +00:00
def clean_gh_vcs_link(debian_vcs_link):
    """Return *debian_vcs_link* cut down to its first five "/"-separated parts.

    For a link such as ``https://github.com/owner/repo/tree/branch/`` this
    yields ``https://github.com/owner/repo``. Links with five or fewer parts
    are returned unchanged.
    """
    # Splitting at most five times leaves any remainder in a sixth element,
    # which the slice then discards.
    leading_parts = debian_vcs_link.split("/", 5)[:5]
    return "/".join(leading_parts)
2023-12-05 18:36:07 +00:00
# Run the data collection only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # Disabled manual spot-checks, kept for debugging.
    # NOTE(review): main() calls the second helper as dqs.debian_vcs_query;
    # the bare name below would need that prefix if re-enabled -- confirm.
    #clean_gh_vcs_link("https://github.com/kilobyte/3270font/tree/debian/")
    #debian_vcs_query("https://salsa.debian.org/debian/0xffff/")