initial steps in expanded data collection
This commit is contained in:
parent
d00a6169a2
commit
ab763bcc13
68
debian_queries.py
Normal file
68
debian_queries.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import wget
|
||||||
|
from perceval.backends.core.git import Git
|
||||||
|
from git import Repo
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
def debian_vcs_query(debian_vcs_link):
    """Clone a Debian packaging repository and return its upstream repository URL.

    The repository at ``debian_vcs_link`` is cloned into a scratch directory,
    ``debian/upstream/metadata`` is read from the checkout, and the value of
    its ``Repository`` field is returned with surrounding whitespace and
    quotes removed. The scratch directory is deleted before returning,
    whether or not the lookup succeeded.

    Args:
        debian_vcs_link: URL of the Debian packaging repository to clone.

    Returns:
        The upstream repository URL, or "" when no project name can be
        derived from the link, the clone fails, the metadata file is missing
        or unreadable, or the file has no ``Repository`` field.
    """
    upstream_repo_link = ""
    # Drop trailing slashes before taking the last path segment. Without this
    # a link such as ".../0xffff/" gives an empty name, the clone lands in the
    # shared scratch root, and the cleanup below would delete that root.
    project_name = debian_vcs_link.strip().rstrip("/").split("/")[-1]
    if not project_name:
        print("no project name in vcs link")
        return upstream_repo_link
    output_directory = "/data/users/mgaughan/tmp1/" + project_name
    print(output_directory)
    try:
        try:
            Repo.clone_from(debian_vcs_link, output_directory)
        except Exception:
            # Best effort: any clone failure just means "no upstream link".
            print("error cloning")
            return upstream_repo_link
        try:
            with open(output_directory + "/debian/upstream/metadata", "r", encoding="utf-8") as metadata_file:
                metadata_lines = metadata_file.read().split("\n")
        except (OSError, UnicodeDecodeError):
            print("no file!")
            return upstream_repo_link
        # Look the field up by name rather than by line position; the exact
        # key match keeps "Repository-Browse" from being picked up instead.
        for line in metadata_lines:
            key, separator, value = line.partition(":")
            if separator and key.strip() == "Repository":
                upstream_repo_link = value.strip().strip("'\"")
                break
        if upstream_repo_link:
            print(upstream_repo_link)
        else:
            print("no upstream repository listed")
        return upstream_repo_link
    finally:
        # Always remove the checkout, including a partial one left behind by
        # a failed clone.
        shutil.rmtree(output_directory, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
def debian_query(package_name):
    """Return the VCS browser URL that sources.debian.org lists for a package.

    Two requests are made: the first lists the versions known for
    ``package_name``; the second fetches the package info for the first
    listed version (treated here as the most recent one) and reads
    ``pkg_infos['vcs_browser']`` from it.

    Args:
        package_name: Debian source package name, appended to the API URLs.

    Returns:
        The ``vcs_browser`` URL, or "" when either request fails, the
        package is not found, or no VCS browser link is recorded.
    """
    headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
    # requests never times out on its own; bound each call so a stalled
    # connection cannot hang the whole collection run.
    request_timeout = 30
    first_api_url = "https://sources.debian.org/api/src/" + package_name
    try:
        first_response = requests.get(url=first_api_url, headers=headers, timeout=request_timeout)
        first_response_dict = first_response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        print('error with the first debian request')
        return ""
    # Treat any error payload, or a payload without a usable version entry,
    # as "not found" instead of only the exact {'error': 404} response.
    if not isinstance(first_response_dict, dict) or 'error' in first_response_dict:
        print('not found in debian system')
        return ""
    try:
        most_recent_package_version = first_response_dict['versions'][0]['version']
    except (KeyError, IndexError, TypeError):
        print('not found in debian system')
        return ""
    second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/"
    try:
        second_response = requests.get(url=second_api_url, headers=headers, timeout=request_timeout)
        second_response_dict = second_response.json()
    except (requests.RequestException, ValueError):
        print('error with the second debian request')
        return ""
    pkg_infos = None
    if isinstance(second_response_dict, dict):
        pkg_infos = second_response_dict.get('pkg_infos')
    # A response without pkg_infos (e.g. an error payload) is handled the
    # same way as one that simply has no vcs_browser entry.
    if not isinstance(pkg_infos, dict) or 'vcs_browser' not in pkg_infos:
        print('no vcs link')
        return ""
    print(second_response_dict)
    print(pkg_infos['vcs_browser'])
    return pkg_infos['vcs_browser']
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual check when run as a script: resolve the upstream repository for
    # a single sample Debian packaging repository.
    sample_vcs_link = "https://salsa.debian.org/debian/0xffff"
    debian_vcs_query(sample_vcs_link)
|
@ -2,10 +2,12 @@ import csv
|
|||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
import datetime as dt
|
import datetime as dt
|
||||||
|
import wget
|
||||||
|
|
||||||
import perceval_tasks as pt
|
import perceval_tasks as pt
|
||||||
import github_api_req as gha
|
import github_api_req as gha
|
||||||
import gh_gsql_req as ghs
|
import gh_gsql_req as ghs
|
||||||
|
import debian_queries as dqs
|
||||||
|
|
||||||
|
|
||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
@ -21,17 +23,23 @@ def main():
|
|||||||
successful_count = 0
|
successful_count = 0
|
||||||
for row in spamreader:
|
for row in spamreader:
|
||||||
index += 1
|
index += 1
|
||||||
if index > 10:
|
if index > 20:
|
||||||
break
|
break
|
||||||
project_dict = {}
|
project_dict = {}
|
||||||
project_dict["project_name"] = row[0]
|
project_dict["project_name"] = row[0]
|
||||||
project_dict["underproduction_mean"] = row[16]
|
project_dict["underproduction_mean"] = row[16]
|
||||||
project_dict["underproduction_low"] = row[17]
|
project_dict["underproduction_low"] = row[17]
|
||||||
project_dict["underproduction_high"] = row[18]
|
project_dict["underproduction_high"] = row[18]
|
||||||
project_dict["vcs_link"] = debian_query(project_dict["project_name"])
|
project_dict["debian_vcs_link"] = dqs.debian_query(project_dict["project_name"])
|
||||||
if project_dict["vcs_link"] == "":
|
if project_dict["debian_vcs_link"] == "":
|
||||||
continue
|
continue
|
||||||
perceval_data = pt.main(project_dict["vcs_link"], early_cutoff)
|
if "github" in project_dict["debian_vcs_link"]:
|
||||||
|
project_dict["upstream_vcs_link"] = clean_gh_vcs_link(project_dict["debian_vcs_link"])
|
||||||
|
else:
|
||||||
|
project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
|
||||||
|
if project_dict["upstream_vcs_link"] == "":
|
||||||
|
continue
|
||||||
|
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
|
||||||
if perceval_data == {}:
|
if perceval_data == {}:
|
||||||
continue
|
continue
|
||||||
project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
|
project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
|
||||||
@ -46,37 +54,12 @@ def main():
|
|||||||
dict_writer.writerows(array_of_projects)
|
dict_writer.writerows(array_of_projects)
|
||||||
|
|
||||||
|
|
||||||
def debian_query(package_name):
|
def clean_gh_vcs_link(debian_vcs_link):
|
||||||
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
|
url_array = debian_vcs_link.split("/")
|
||||||
first_api_url = "https://sources.debian.org/api/src/" + package_name
|
new_url = "/".join(url_array[:5])
|
||||||
try:
|
return new_url
|
||||||
first_response = requests.get(url = first_api_url, headers=headers)
|
|
||||||
first_response_dict = first_response.json()
|
|
||||||
except:
|
|
||||||
print('error with the first debian request')
|
|
||||||
first_response_dict = {}
|
|
||||||
return ("")
|
|
||||||
if first_response_dict == {'error': 404}:
|
|
||||||
print('not found in debian system')
|
|
||||||
return ("")
|
|
||||||
#print(first_response_dict)
|
|
||||||
most_recent_package_version = first_response_dict['versions'][0]['version']
|
|
||||||
#print(first_response_dict['versions'][0]['version'])
|
|
||||||
second_api_url = "https://sources.debian.org/api/info/package/" + package_name + "/" + most_recent_package_version + "/"
|
|
||||||
try:
|
|
||||||
second_response = requests.get(url = second_api_url, headers=headers)
|
|
||||||
second_response_dict = second_response.json()
|
|
||||||
except:
|
|
||||||
print('error with the second debian request')
|
|
||||||
second_response_dict = {}
|
|
||||||
return ("")
|
|
||||||
if 'vcs_browser' not in second_response_dict['pkg_infos'].keys():
|
|
||||||
print('no vcs link')
|
|
||||||
return ("")
|
|
||||||
print(second_response_dict)
|
|
||||||
print(second_response_dict['pkg_infos']['vcs_browser'])
|
|
||||||
return second_response_dict['pkg_infos']['vcs_browser']
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
#debian_query("zurl")
|
#clean_gh_vcs_link("https://github.com/kilobyte/3270font/tree/debian/")
|
||||||
|
#debian_vcs_query("https://salsa.debian.org/debian/0xffff/")
|
||||||
|
@ -13,6 +13,7 @@ def main(vcs_path, begin_date):
|
|||||||
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
|
perceval_info['age_of_project'] = get_repo_age(perceval_info['list_of_commits'])
|
||||||
perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
|
perceval_info['contributors'], perceval_info['collaborators'] = get_all_actors(perceval_info['list_of_commits'])
|
||||||
del perceval_info['list_of_commits']
|
del perceval_info['list_of_commits']
|
||||||
|
print(perceval_info)
|
||||||
return perceval_info
|
return perceval_info
|
||||||
else:
|
else:
|
||||||
print('error, no commits found?')
|
print('error, no commits found?')
|
||||||
@ -22,6 +23,7 @@ def main(vcs_path, begin_date):
|
|||||||
|
|
||||||
# this is the primary function for getting the list of commits from perceval
|
# this is the primary function for getting the list of commits from perceval
|
||||||
def get_perceval_log(vcs_path, begin_date):
|
def get_perceval_log(vcs_path, begin_date):
|
||||||
|
vcs_path = vcs_path.strip()
|
||||||
print(vcs_path)
|
print(vcs_path)
|
||||||
try:
|
try:
|
||||||
repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path.split('/')[-1])
|
repo_dir = '/data/users/mgaughan/tmp/' + str(vcs_path.split('/')[-1])
|
||||||
@ -63,3 +65,5 @@ def get_all_actors(all_commits):
|
|||||||
authors.remove(committer)
|
authors.remove(committer)
|
||||||
return len(authors), len(committers)
|
return len(authors), len(committers)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual check when run as a script: run main() against one sample
    # repository URL (whitespace-padded on purpose, then stripped) with a
    # fixed begin date.
    sample_vcs_path = " https://github.com/pali/0xFFFF".strip()
    sample_begin_date = dt.datetime(2008, 2, 8)
    main(sample_vcs_path, sample_begin_date)
|
Loading…
Reference in New Issue
Block a user