initial work to expand data set
This commit is contained in:
parent
5dc0406c82
commit
d00a6169a2
82
expanded_data_collection.py
Normal file
82
expanded_data_collection.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
import csv
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
import datetime as dt
|
||||||
|
|
||||||
|
import perceval_tasks as pt
|
||||||
|
import github_api_req as gha
|
||||||
|
import gh_gsql_req as ghs
|
||||||
|
|
||||||
|
|
||||||
|
# Credential read from the KKEXKEY environment variable; None when unset.
# NOTE(review): nothing in this file uses `key` -- presumably it is consumed
# by the API helper modules; confirm before removing.
key = os.getenv('KKEXKEY')

# Output CSV written by main().
test_csv_path = "120523_data_test.csv"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Collect a small trial data set and write it to ``test_csv_path``.

    Reads ``inst_all_packages_full_results.csv`` and looks at no more than
    its first ten rows, the first of which is treated as the header. For
    each data row it:

    1. copies the package name (column 0) and the three underproduction
       values (columns 16-18),
    2. resolves the package's VCS link with ``debian_query``,
    3. passes that link to ``perceval_tasks.main`` to obtain the project
       age, contributors and collaborators.

    Rows for which either lookup yields nothing are skipped. The surviving
    rows are written as CSV with a header. When no row survives, a message
    is printed and no output file is created.
    """
    early_cutoff = dt.datetime(2008, 2, 8)
    array_of_projects = []
    attempted_count = 0
    successful_count = 0

    with open('inst_all_packages_full_results.csv', newline='') as csvfile:
        for index, row in enumerate(csv.reader(csvfile), start=1):
            if index > 10:
                # Trial run: only the first ten rows of the file are used.
                break
            if index == 1:
                # Header row: skip it before issuing any network requests.
                continue
            if len(row) < 19:
                # Blank or truncated line; columns 16-18 are required below.
                print("skipping malformed row " + str(index))
                continue
            attempted_count += 1

            project_dict = {
                "project_name": row[0],
                "underproduction_mean": row[16],
                "underproduction_low": row[17],
                "underproduction_high": row[18],
            }
            project_dict["vcs_link"] = debian_query(project_dict["project_name"])
            if project_dict["vcs_link"] == "":
                continue

            perceval_data = pt.main(project_dict["vcs_link"], early_cutoff)
            if not perceval_data:
                continue
            for field in ('age_of_project', 'contributors', 'collaborators'):
                project_dict[field] = perceval_data[field]

            successful_count += 1
            array_of_projects.append(project_dict)

    # Guard the denominator so an empty or header-only file cannot divide by zero.
    success_rate = successful_count / attempted_count if attempted_count else 0.0
    print("success rate: " + str(success_rate) + "; total success count: " + str(successful_count))

    if not array_of_projects:
        # Nothing to derive the CSV header from; avoid indexing an empty list.
        print("no projects collected; nothing written to " + test_csv_path)
        return

    keys = list(array_of_projects[0].keys())
    with open(test_csv_path, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(array_of_projects)
|
||||||
|
|
||||||
|
|
||||||
|
def debian_query(package_name):
    """Return the VCS browser link that sources.debian.org lists for a package.

    Two requests are made: the first fetches the package's version list, the
    second fetches the package info for the first listed version (which the
    code treats as the most recent one -- TODO confirm the API's ordering).

    Args:
        package_name: Debian source package name, appended to the API URL.

    Returns:
        The ``vcs_browser`` value from the package info, or ``""`` when a
        request fails, the response is not valid JSON, the package is
        unknown, no version is listed, or no VCS link is recorded.
    """
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
    # Without a timeout a stalled connection would block the whole run.
    timeout_seconds = 30

    first_api_url = "https://sources.debian.org/api/src/" + package_name
    try:
        first_response = requests.get(
            url=first_api_url, headers=headers, timeout=timeout_seconds)
        first_response_dict = first_response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers network failures; ValueError covers
        # a body that cannot be decoded as JSON.
        print('error with the first debian request')
        return ""
    if first_response_dict == {'error': 404}:
        print('not found in debian system')
        return ""

    # Any other error payload, or an empty version list, has nothing to index.
    versions = None
    if isinstance(first_response_dict, dict):
        versions = first_response_dict.get('versions')
    if not versions:
        print('no versions listed in debian system')
        return ""
    most_recent_package_version = versions[0]['version']

    second_api_url = ("https://sources.debian.org/api/info/package/"
                      + package_name + "/" + most_recent_package_version + "/")
    try:
        second_response = requests.get(
            url=second_api_url, headers=headers, timeout=timeout_seconds)
        second_response_dict = second_response.json()
    except (requests.RequestException, ValueError):
        print('error with the second debian request')
        return ""

    pkg_infos = None
    if isinstance(second_response_dict, dict):
        pkg_infos = second_response_dict.get('pkg_infos')
    if not isinstance(pkg_infos, dict) or 'vcs_browser' not in pkg_infos:
        print('no vcs link')
        return ""

    print(second_response_dict)
    print(pkg_infos['vcs_browser'])
    return pkg_infos['vcs_browser']
|
||||||
|
|
||||||
|
# Run the collection only when this file is executed as a script.
if __name__ == "__main__":
    main()
    # Manual check of a single package lookup, left disabled:
    #debian_query("zurl")
|
@ -24,9 +24,9 @@ def main(vcs_path, begin_date):
|
|||||||
def get_perceval_log(vcs_path, begin_date):
|
def get_perceval_log(vcs_path, begin_date):
|
||||||
print(vcs_path)
|
print(vcs_path)
|
||||||
try:
|
try:
|
||||||
repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path[0].split('/')[-1])
|
repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path.split('/')[-1])
|
||||||
#gitpath=repo_dir
|
#gitpath=repo_dir
|
||||||
repo = Git(uri=vcs_path[0], gitpath=repo_dir)
|
repo = Git(uri=vcs_path, gitpath=repo_dir)
|
||||||
# this is a temporary date_from, will need to be more inclusive in the future
|
# this is a temporary date_from, will need to be more inclusive in the future
|
||||||
fetched_commits = repo.fetch(from_date=begin_date)
|
fetched_commits = repo.fetch(from_date=begin_date)
|
||||||
return list(fetched_commits)
|
return list(fetched_commits)
|
||||||
|
Loading…
Reference in New Issue
Block a user