expanded data collection script edits

This commit is contained in:
Matthew Gaughan 2023-12-11 14:34:13 -06:00
parent d5863e5735
commit 29a6ef7074
2 changed files with 22 additions and 9 deletions

View File

@ -1,10 +1,11 @@
import os
import requests
import wget
from perceval.backends.core.git import Git
#from perceval.backends.core.git import Git
from git import Repo
import shutil
import pexpect
import subprocess
def debian_vcs_query(debian_vcs_link):
upstream_repo_link = ""
@ -12,11 +13,21 @@ def debian_vcs_query(debian_vcs_link):
output_directory = "/data/users/mgaughan/tmp1/" + project_name
print(output_directory)
#url = debian_vcs_link + '/-/blob/master/debian/upstream/metadata'
# the try block below handles git's username/password prompts by answering each with an empty line
try:
Repo.clone_from(debian_vcs_link, output_directory)
#Repo.clone_from(debian_vcs_link, output_directory, env={"Username for 'https://salsa.debian.org": "", "Password for 'https://salsa.debian.org":""})
command = "git clone " + debian_vcs_link + " " + output_directory
p = pexpect.spawn(command)
p.expect("Username for 'https://salsa.debian.org':")
p.sendline ("")
p.expect ("Password for 'https://salsa.debian.org':")
p.sendline ("")
p.expect(pexpect.EOF, timeout=None)
#p.expect([pexpect.TIMEOUT, 'pattern', pexpect.EOF])
p.close()
except:
print("error cloning")
return upstream_repo_link
print("handling misc pexpect issues")
#return upstream_repo_link
try:
upstream_metadata = open(output_directory + "/debian/upstream/metadata", "r").read().split("\n")
upstream_repo_array = upstream_metadata[3].split(":")[-2:]
@ -28,6 +39,7 @@ def debian_vcs_query(debian_vcs_link):
#filename = wget.download(url, out=output_directory)
#print(type(filename))
shutil.rmtree(output_directory, ignore_errors=True)
print('success')
return upstream_repo_link
@ -65,4 +77,5 @@ def debian_query(package_name):
if __name__ == "__main__":
#main()
debian_vcs_query("https://salsa.debian.org/debian/0xffff")
#debian_query("broccoli-ruby")
debian_vcs_query(debian_query("brotli"))

View File

@ -11,7 +11,7 @@ import debian_queries as dqs
key = os.environ.get('KKEXKEY')
test_csv_path = "120523_expanded_data.csv"
test_csv_path = "121123_expanded_data.csv"
def main():
early_cutoff = dt.datetime(2008,2, 8)
@ -49,7 +49,7 @@ def main():
project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators']
if "github" in project_dict["upstream_vcs_link"]:
project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff)
with open('/data/users/mgaughan/kkex_comment_data_120523/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path:
json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path)
else:
project_dict['milestone_count'] = 0
@ -61,7 +61,7 @@ def main():
meta_dict['total_success'] = successful_count
meta_dict['no_upstream_info'] = no_upstream
#print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count))
with open('120523_metadata_expanded.json', 'w') as data_path:
with open('121123_metadata_expanded.json', 'w') as data_path:
json.dump(meta_dict, data_path)