From 29a6ef7074973b1475b7c3487722f4d5ebdeb935 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Mon, 11 Dec 2023 14:34:13 -0600 Subject: [PATCH] expanded data collection script edits --- debian_queries.py | 25 +++++++++++++++++++------ expanded_data_collection.py | 6 +++--- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/debian_queries.py b/debian_queries.py index dbb3533..952eeac 100644 --- a/debian_queries.py +++ b/debian_queries.py @@ -1,10 +1,11 @@ import os import requests import wget -from perceval.backends.core.git import Git +#from perceval.backends.core.git import Git from git import Repo import shutil - +import pexpect +import subprocess def debian_vcs_query(debian_vcs_link): upstream_repo_link = "" @@ -12,11 +13,21 @@ def debian_vcs_query(debian_vcs_link): output_directory = "/data/users/mgaughan/tmp1/" + project_name print(output_directory) #url = debian_vcs_link + '/-/blob/master/debian/upstream/metadata' + # the below try is the password handling try: - Repo.clone_from(debian_vcs_link, output_directory) + #Repo.clone_from(debian_vcs_link, output_directory, env={"Username for 'https://salsa.debian.org": "", "Password for 'https://salsa.debian.org":""}) + command = "git clone " + debian_vcs_link + " " + output_directory + p = pexpect.spawn(command) + p.expect("Username for 'https://salsa.debian.org':") + p.sendline ("") + p.expect ("Password for 'https://salsa.debian.org':") + p.sendline ("") + p.expect(pexpect.EOF, timeout=None) + #p.expect([pexpect.TIMEOUT, 'pattern', pexpect.EOF]) + p.close() except: - print("error cloning") - return upstream_repo_link + print("handling misc pexpect issues") + #return upstream_repo_link try: upstream_metadata = open(output_directory + "/debian/upstream/metadata", "r").read().split("\n") upstream_repo_array = upstream_metadata[3].split(":")[-2:] @@ -28,6 +39,7 @@ def debian_vcs_query(debian_vcs_link): #filename = wget.download(url, out=output_directory) #print(type(filename)) shutil.rmtree(output_directory, ignore_errors=True) + print('success') return upstream_repo_link @@ -65,4 +77,5 @@ def debian_query(package_name): if __name__ == "__main__": #main() - debian_vcs_query("https://salsa.debian.org/debian/0xffff") \ No newline at end of file + #debian_query("broccoli-ruby") + debian_vcs_query(debian_query("brotli")) \ No newline at end of file diff --git a/expanded_data_collection.py b/expanded_data_collection.py index 140ecd7..bcfdc1e 100644 --- a/expanded_data_collection.py +++ b/expanded_data_collection.py @@ -11,7 +11,7 @@ import debian_queries as dqs key = os.environ.get('KKEXKEY') -test_csv_path = "120523_expanded_data.csv" +test_csv_path = "121123_expanded_data.csv" def main(): early_cutoff = dt.datetime(2008,2, 8) @@ -49,7 +49,7 @@ def main(): project_dict["age_of_project"], project_dict["contributors"], project_dict["collaborators"] = perceval_data['age_of_project'], perceval_data['contributors'], perceval_data['collaborators'] if "github" in project_dict["upstream_vcs_link"]: project_dict['milestone_count'] = gha.main(project_dict["upstream_vcs_link"], early_cutoff) - with open('/data/users/mgaughan/kkex_comment_data_120523/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: + with open('/data/users/mgaughan/kkex_comment_data_121123/' + 'gh_comments_' + project_dict["project_name"] + '.json', 'w') as data_path: json.dump(ghs.main(project_dict["upstream_vcs_link"], early_cutoff), data_path) else: project_dict['milestone_count'] = 0 @@ -61,7 +61,7 @@ def main(): meta_dict['total_success'] = successful_count meta_dict['no_upstream_info'] = no_upstream #print("success rate: " + str(successful_count/index) + "; total success count: " + str(successful_count)) - with open('120523_metadata_expanded.json', 'w') as data_path: + with open('121123_metadata_expanded.json', 'w') as data_path: json.dump(meta_dict, data_path)