final prep for expanded data collection a1
This commit is contained in:
parent
ab763bcc13
commit
49a02dfca4
@ -3,7 +3,7 @@ import requests
|
|||||||
import os
|
import os
|
||||||
import datetime as dt
|
import datetime as dt
|
||||||
import wget
|
import wget
|
||||||
|
import json
|
||||||
import perceval_tasks as pt
|
import perceval_tasks as pt
|
||||||
import github_api_req as gha
|
import github_api_req as gha
|
||||||
import gh_gsql_req as ghs
|
import gh_gsql_req as ghs
|
||||||
@ -11,19 +11,24 @@ import debian_queries as dqs
|
|||||||
|
|
||||||
|
|
||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
test_csv_path = "120523_data_test.csv"
|
test_csv_path = "120523_expanded_data_test.csv"
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
early_cutoff = dt.datetime(2008,2, 8)
|
early_cutoff = dt.datetime(2008,2, 8)
|
||||||
|
meta_dict = {}
|
||||||
|
with open(test_csv_path, 'w', newline='') as output_file:
|
||||||
|
keys = ["project_name", "underproduction_mean", "underproduction_low", "underproduction_high", "debian_vcs_link", "upstream_vcs_link", "age_of_project", "contributors", "collaborators", "milestone_count"]
|
||||||
|
dict_writer = csv.DictWriter(output_file, keys)
|
||||||
|
dict_writer.writeheader()
|
||||||
with open('inst_all_packages_full_results.csv', newline='') as csvfile:
|
with open('inst_all_packages_full_results.csv', newline='') as csvfile:
|
||||||
array_of_projects =[]
|
array_of_projects =[]
|
||||||
spamreader = csv.reader(csvfile)
|
spamreader = csv.reader(csvfile)
|
||||||
index = 0
|
index = 0
|
||||||
successful_count = 0
|
successful_count = 0
|
||||||
|
no_upstream = 0
|
||||||
for row in spamreader:
|
for row in spamreader:
|
||||||
index += 1
|
index += 1
|
||||||
if index > 20:
|
if index > 10:
|
||||||
break
|
break
|
||||||
project_dict = {}
|
project_dict = {}
|
||||||
project_dict["project_name"] = row[0]
|
project_dict["project_name"] = row[0]
|
||||||
@ -38,20 +43,28 @@ def main():
|
|||||||
else:
|
else:
|
||||||
project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
|
project_dict["upstream_vcs_link"] = dqs.debian_vcs_query(project_dict["debian_vcs_link"])
|
||||||
if project_dict["upstream_vcs_link"] == "":
|
if project_dict["upstream_vcs_link"] == "":
|
||||||
|
no_upstream += 1
|
||||||
continue
|
continue
|
||||||
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
|
perceval_data = pt.main(project_dict["upstream_vcs_link"], early_cutoff)
|
||||||
if perceval_data == {}:
|
if perceval_data == {}:
|
||||||
continue
|
continue
|
||||||
project_dict['age_of_project'], project_dict['contributors'], project_dict['collaborators'] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
|
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
|
||||||
|
if "github" in project_dict["upstream_vcs_link"]:
|
||||||
|
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
|
||||||
|
with open('/data/users/mgaughan/kkex_comment_data_120523/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
|
||||||
|
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
|
||||||
|
else:
|
||||||
|
project_dict['milestone_count'] = 0
|
||||||
successful_count += 1
|
successful_count += 1
|
||||||
if index > 1:
|
if index > 1:
|
||||||
array_of_projects.append(project_dict)
|
dict_writer.writerow(project_dict)
|
||||||
print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
|
meta_dict['success_rate'] = successful_count/index
|
||||||
keys = array_of_projects[0].keys()
|
meta_dict['final_index'] = index
|
||||||
with open(test_csv_path, 'w', newline='') as output_file:
|
meta_dict['total_success'] = successful_count
|
||||||
dict_writer = csv.DictWriter(output_file, keys)
|
meta_dict['no_upstream_info'] = no_upstream
|
||||||
dict_writer.writeheader()
|
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
|
||||||
dict_writer.writerows(array_of_projects)
|
with open('120523_metadata_expanded.json', 'w') as data_path:
|
||||||
|
json.dump(meta_dict, data_path)
|
||||||
|
|
||||||
|
|
||||||
def clean_gh_vcs_link(debian_vcs_link):
|
def clean_gh_vcs_link(debian_vcs_link):
|
||||||
|
@ -7,12 +7,12 @@ key = os.environ.get('KKEXKEY')
|
|||||||
|
|
||||||
def main(vcs, early_cutoff):
|
def main(vcs, early_cutoff):
|
||||||
gsql_dict = {}
|
gsql_dict = {}
|
||||||
vcs_list = vcs[0].split('/')
|
vcs_list = vcs.split('/')
|
||||||
repo_name = '"' + vcs_list[-1] + '"'
|
repo_name = '"' + vcs_list[-1] + '"'
|
||||||
repo_owner = '"' + vcs_list[-2] + '"'
|
repo_owner = '"' + vcs_list[-2] + '"'
|
||||||
gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
|
gsql_dict["original_returned_content"] = get_discussion_gql(repo_owner, repo_name)
|
||||||
gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
|
gsql_dict["time_cleaned_comm"] = within_time(gsql_dict["original_returned_content"], early_cutoff)
|
||||||
return gsql_dict
|
return gsql_dict['original_returned_content']
|
||||||
|
|
||||||
def get_discussion_gql(repo_owner, repo_name):
|
def get_discussion_gql(repo_owner, repo_name):
|
||||||
url = "https://api.github.com/graphql"
|
url = "https://api.github.com/graphql"
|
||||||
|
@ -6,14 +6,14 @@ import os
|
|||||||
key = os.environ.get('KKEXKEY')
|
key = os.environ.get('KKEXKEY')
|
||||||
|
|
||||||
def main(vcs, begin_date):
|
def main(vcs, begin_date):
|
||||||
repo_uri=vcs[0]
|
repo_uri=vcs
|
||||||
gha_info = {}
|
gha_info = {}
|
||||||
#this is the entire list of Github 'milestones' grabbed from the API
|
#this is the entire list of Github 'milestones' grabbed from the API
|
||||||
gha_info['milestones'] = get_milestone_information(repo_uri)
|
gha_info['milestones'] = get_milestone_information(repo_uri)
|
||||||
#this is the count of milestones that occur after the cutoff date
|
#this is the count of milestones that occur after the cutoff date
|
||||||
gha_info['milestone_count'] = parse_milestones(gha_info['milestones'], begin_date)
|
gha_info['milestone_count'] = parse_milestones(gha_info['milestones'], begin_date)
|
||||||
#split_actors(repo_uri, actors_list)
|
#split_actors(repo_uri, actors_list)
|
||||||
return gha_info
|
return gha_info['milestone_count']
|
||||||
|
|
||||||
|
|
||||||
#this simple API call has been working for now but may need to be updated as more information is desired
|
#this simple API call has been working for now but may need to be updated as more information is desired
|
||||||
|
Loading…
Reference in New Issue
Block a user