"""Collect the README/CONTRIBUTING file of upstream repositories at a given commit.

For every row of an input CSV the upstream repository is cloned into a
scratch directory, hard-reset to the listed commit, and the first file whose
name starts with ``README`` (or ``CONTRIBUTING``) is copied into ``dest_dir``.
A manifest CSV describing the copied files is written to the current working
directory.

Provenance: this module was added as
``12825_revision/spec_file/updated_get_spec_file.py`` by commit
da8e1c0e457fb45e9b13d475358d2e254d651c56 (Matthew Gaughan, Thu, 30 Jan 2025
23:37:13 -0600, "updates to getting first version file"). The same commit
renamed ``12825_revision/get_spec_file.py`` to
``12825_revision/spec_file/get_spec_file.py``.
"""
import csv
import os
import shutil
import subprocess
import time
from urllib.parse import urlsplit

import pandas as pd

#destination DIR:
dest_dir = "/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/013025_readme/"
#temp DIR:
temp_dir = "/data/users/mgaughan/tmp3/"

# Hosts that are always cloned over SSH from github.com.
_GITHUB_HOSTS = frozenset({"github.com", "www.github.com"})

# Column order of the manifest written by for_all_files().
_MANIFEST_COLUMNS = [
    "commit_hash",
    "upstream_vcs_link",
    "repo_id",
    "project_handle",
    "new_filepath",
]


def temp_clone(vcs_link, temp_location):
    """
    ARGS
    vcs_link : url link to upstream repo vcs
    temp_location : filepath to where the repo should be cloned to

    RETURNS
    repo : the GitPython Repo object of the cloned repo
    repo_path : the filepath to the cloned repository

    RAISES
    whatever git.Repo.clone_from raises when the clone fails (for example
    when temp_location already exists and is not empty)
    """
    # Imported here rather than at module level so the rest of the module can
    # be imported on machines without GitPython installed.
    import git

    vcs_link = vcs_link.strip()
    # exist_ok: an empty directory left behind by an earlier run must not
    # abort the clone before it even starts.
    os.makedirs(temp_location, exist_ok=True)
    repo_path = temp_location
    repo = git.Repo.clone_from(vcs_link, repo_path)
    print(f"Successfully Cloned {vcs_link}")
    return repo, repo_path


def delete_clone(temp_location):
    """
    ARGS
    temp_location : filepath to the cloned repository

    RETURNS
    0 if the directory existed and was removed, 1 if there was nothing to remove
    """
    if os.path.exists(temp_location):
        shutil.rmtree(temp_location)
        print(f"{temp_location} has been deleted.")
        return 0
    print("No clone at location")
    return 1


def _split_vcs_link(vcs_link):
    """Return (host, repo_id) parsed from an upstream URL; raise ValueError if either is missing."""
    parts = urlsplit(vcs_link.strip().rstrip("/"))
    host = (parts.hostname or "").lower()
    repo_id = parts.path.strip("/")
    if not host or not repo_id:
        raise ValueError(f"cannot parse upstream link: {vcs_link!r}")
    return host, repo_id


def _ssh_url(host, repo_id):
    """Build the scp-style SSH clone URL for repo_id on host, ending in exactly one '.git'."""
    suffix = "" if repo_id.endswith(".git") else ".git"
    return f"git@{host}:{repo_id}{suffix}"


def _clone_upstream(vcs_link, host, repo_id):
    """Clone the upstream repo into temp_dir: SSH for GitHub, otherwise the given link with an SSH retry."""
    if host in _GITHUB_HOSTS:
        return temp_clone(_ssh_url("github.com", repo_id), temp_dir)
    try:
        return temp_clone(vcs_link, temp_dir)
    except Exception as e:
        print(f'non-Github cloning error, assuming HTTPS issue: {e}')
        # The failed attempt may have left a partial checkout behind.
        delete_clone(temp_dir)
        return temp_clone(_ssh_url(host, repo_id), temp_dir)


def _find_doc(repo_path, doc_name):
    """Return the path of the first file under repo_path whose name starts with doc_name, else None."""
    for root, dirs, files in os.walk(repo_path):
        # Never pick a file out of git's own metadata; sorting makes the
        # choice independent of directory listing order.
        dirs[:] = sorted(d for d in dirs if d != ".git")
        for name in sorted(files):
            if name.startswith(doc_name):
                return os.path.join(root, name)
    return None


# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """
    ARGS
    vcs_link : url link to upstream repo vcs
    commit_hash : commit the clone is hard-reset to before the file is copied
    is_readme : True to fetch the README, False to fetch CONTRIBUTING

    RETURNS
    (True, dest_path) when the file was copied into dest_dir
    (False, "") when cloning, the reset, the lookup or the copy failed; the
    reason is printed

    The scratch clone in temp_dir is removed before returning in every case.
    """
    # Keep git non-interactive: never prompt for host keys or credentials.
    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    os.environ['GIT_ASKPASS'] = 'false'
    os.environ['GIT_TERMINAL_PROMPT'] = '0'

    doc_name = "README" if is_readme else "CONTRIBUTING"
    try:
        vcs_link = vcs_link.strip()
        host, repo_id = _split_vcs_link(vcs_link)
        try:
            _repo, temp_repo_path = _clone_upstream(vcs_link, host, repo_id)
        except Exception as e:
            print(f'cloning error at {vcs_link}')
            print(f'inside cloning error: {e}')
            raise ValueError(e) from e

        # Argument list (no shell) and check=True: a commit that cannot be
        # checked out must fail loudly instead of silently leaving HEAD's
        # version of the document in place.
        subprocess.run(
            ["git", "reset", "--hard", commit_hash.strip()],
            cwd=temp_repo_path,
            check=True,
        )

        doc_path = _find_doc(temp_repo_path, doc_name)
        if doc_path is None:
            raise FileNotFoundError(
                f"no {doc_name} file found in {vcs_link} at {commit_hash}"
            )

        und_repo_id = '_'.join(repo_id.split("/"))
        doc_filename = os.path.basename(doc_path)
        os.makedirs(dest_dir, exist_ok=True)
        dest_path = os.path.join(dest_dir, f"{und_repo_id}_hullabaloo_{doc_filename}")
        shutil.copy(doc_path, dest_path)
    except Exception as e:
        print(f"outside cloning error: {e}")
        return False, ""
    finally:
        delete_clone(temp_dir)

    return True, dest_path


def _progress(rows):
    """Wrap rows in a tqdm progress bar when tqdm is installed, else return rows unchanged."""
    try:
        from tqdm import tqdm
    except ImportError:
        return rows
    return tqdm(rows, total=len(rows))


def for_all_files(csv_path, is_readme):
    """
    ARGS
    csv_path : filepath to the input CSV; the first row is skipped as a
        header, and columns 0, 12, 13 and 14 are read as commit hash, repo id,
        project handle and upstream vcs link
    is_readme : True to fetch READMEs, False to fetch CONTRIBUTING files

    RETURNS
    None; writes 013025_<README|CONTRIBUTING>_manifest.csv to the current
    working directory

    Processing stops at the first repository that get_file() fails on, or on
    Ctrl-C. The manifest is written in every case and lists the rows that
    completed before the stop, so it may contain only the header line.
    """
    doc = "README" if is_readme else "CONTRIBUTING"
    # Bound before the try so the finally block can always write a manifest,
    # even when the input file cannot be opened.
    records = []
    try:
        # newline="" lets the csv module handle quoted fields that contain
        # line breaks.
        with open(csv_path, 'r', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            rows = [row for row in reader if row]  # ignore blank lines

        for row in _progress(rows):
            # Pause between clones so upstream hosts are not hit back to back.
            time.sleep(4)
            ok, new_filepath = get_file(row[14], row[0], is_readme)
            if not ok:
                break
            records.append({
                'commit_hash': row[0],
                'upstream_vcs_link': row[14],
                'repo_id': row[12],
                'project_handle': row[13],
                'new_filepath': new_filepath,
            })
    except KeyboardInterrupt:
        print("KeyBoardInterrrupt")
    finally:
        manifest_df = pd.DataFrame(records, columns=_MANIFEST_COLUMNS)
        manifest_df.to_csv(f"013025_{doc}_manifest.csv", index=False)


if __name__ == "__main__":
    for_all_files("../misc_data_files/README_for_download.csv", True)
    # Single-repository calls kept for manual spot checks:
    #get_file("https://github.com/breakfastquay/rubberband", "a94f3f33577bf9d71166392febbfdf3cace6f1c8", True)
    #get_file("https://gitlab.freedesktop.org/gstreamer/gstreamer", "1762dfbf982a75d895676b0063379e33b4f9b96a", True)
    #get_file("https://github.com/ranger/ranger.git", "ea355f491fb10d5ce054c7813d9abdfd3fc68991" ,False)
    #get_file("https://gitlab.gnome.org/Archive/glade", "7e5dfa8ccd211945e624b0fab7fe2b19fb1b9907" ,False)