updates to getting first version file

This commit is contained in:
Matthew Gaughan 2025-01-30 23:37:13 -06:00
parent 4dae764bf8
commit da8e1c0e45
2 changed files with 154 additions and 0 deletions

View File

@@ -0,0 +1,154 @@
import git
from tqdm import tqdm
import csv
import os
import shutil
import pandas as pd
import time
# Destination directory: extracted first-version documents are copied here.
dest_dir = "/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/013025_readme/"
# Temporary directory: each repository is cloned here and removed afterwards.
temp_dir = "/data/users/mgaughan/tmp3/"
def temp_clone(vcs_link, temp_location):
    """
    Clone a repository into a scratch directory.

    ARGS
    vcs_link : url link to upstream repo vcs (surrounding whitespace is stripped)
    temp_location : filepath to where the repo should be cloned to
    RETURNS
    repo : the git.Repo object of the cloned repo
    repo_path : the filepath to the cloned repository (identical to temp_location)
    RAISES
    Any exception raised by git.Repo.clone_from when the clone fails; the
    (possibly partially filled) directory is left for the caller to remove.
    """
    vcs_link = vcs_link.strip()
    # exist_ok=True: an empty directory left behind by an interrupted run must
    # not abort the clone before git is even tried. A non-empty directory is
    # still rejected, by git itself, when the clone is attempted.
    os.makedirs(temp_location, exist_ok=True)
    repo = git.Repo.clone_from(vcs_link, temp_location)
    print(f"Successfully Cloned {vcs_link}")
    return repo, temp_location
def delete_clone(temp_location):
    """
    Remove a cloned repository from disk.

    ARGS
    temp_location : filepath to the cloned repository
    RETURNS
    0 if something existed at temp_location and was removed,
    1 if nothing existed at temp_location
    """
    clone_present = os.path.exists(temp_location)
    if not clone_present:
        print("No clone at location")
        return 1
    # Recursively remove the whole working tree.
    shutil.rmtree(temp_location)
    print(f"{temp_location} has been deleted.")
    return 0
def _find_document(repo_path, doc_name):
    """Return the path of the first file under repo_path whose name starts with doc_name, or None."""
    for root, dirs, files in os.walk(repo_path):
        # Prune git's own metadata so the walk only sees the working tree.
        if ".git" in dirs:
            dirs.remove(".git")
        for file in files:
            if file.startswith(doc_name):
                return os.path.join(root, file)
    return None


def get_file(vcs_link, commit_hash, is_readme):
    """
    Copy the README or CONTRIBUTING file of a repository, as of a given commit,
    into dest_dir.

    The repository is cloned into temp_dir, hard-reset to commit_hash, searched
    for the first file whose name starts with "README" (is_readme truthy) or
    "CONTRIBUTING" (otherwise), and the clone is always deleted afterwards.

    ARGS
    vcs_link : upstream vcs link of the repository
    commit_hash : hash of the commit whose version of the file is wanted
    is_readme : truthy for the README, falsy for the CONTRIBUTING file
    RETURNS
    (True, dest_path) on success, where dest_path is the copied file;
    (False, "") if cloning, resetting, locating or copying the file failed
    """
    # Keep git non-interactive so a batch run never blocks on a host-key or
    # credential prompt.
    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    os.environ['GIT_ASKPASS'] = 'false'
    os.environ['GIT_TERMINAL_PROMPT'] = '0'
    doc_name = "README" if is_readme else "CONTRIBUTING"
    try:
        try:
            if "github" in vcs_link:
                # GitHub repositories are cloned over SSH.
                repo_id = vcs_link[len('https://github.com/'):]
                ssh_url = f'git@github.com:{repo_id}.git'
                if ssh_url.endswith('.git.git'):
                    ssh_url = ssh_url[:-4]
                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
            else:
                parts = vcs_link.split('/')
                domain = parts[2]
                repo_id = '/'.join(parts[3:])
                try:
                    temp_repo, temp_repo_path = temp_clone(vcs_link, temp_dir)
                except Exception as e:
                    # Other hosts: try the link as given first, then fall back to SSH.
                    print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                    delete_clone(temp_dir)
                    ssh_url = f'git@{domain}:{repo_id}.git'
                    if ssh_url.endswith('.git.git'):
                        ssh_url = ssh_url[:-4]
                    temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
        except Exception as e:
            print(f'cloning error at {vcs_link}')
            print(f'inside cloning error: {e}')
            raise ValueError(e) from e
        # Run the reset through the clone's own git wrapper: no shell string is
        # built from CSV data, no chdir is needed, and a failed reset raises
        # instead of silently leaving the wrong revision checked out.
        temp_repo.git.reset("--hard", commit_hash)
        doc_path = _find_document(temp_repo_path, doc_name)
        if doc_path is None:
            raise FileNotFoundError(f"no {doc_name} file found in {vcs_link} at {commit_hash}")
        und_repo_id = '_'.join(repo_id.split("/"))
        os.makedirs(dest_dir, exist_ok=True)
        dest_path = os.path.join(dest_dir, f"{und_repo_id}_hullabaloo_{os.path.basename(doc_path)}")
        shutil.copy(doc_path, dest_path)
    except Exception as e:
        print(f"outside cloning error: {e}")
        return False, ""
    finally:
        # The scratch clone is removed whether or not the extraction succeeded.
        delete_clone(temp_dir)
    return True, dest_path
def for_all_files(csv_path, is_readme, delay_seconds=4):
    """
    Run get_file for every data row of a CSV and write a manifest of the results.

    The first CSV row is skipped as a header. For each following row the commit
    hash is read from column 0, the repo id from column 12, the project handle
    from column 13 and the upstream vcs link from column 14. Processing stops at
    the first row for which get_file reports failure, or on Ctrl-C. In every
    case the rows fetched so far are written to "013025_<DOC>_manifest.csv" in
    the current working directory, where <DOC> is README or CONTRIBUTING.

    ARGS
    csv_path : path to the input CSV
    is_readme : truthy to fetch READMEs, falsy to fetch CONTRIBUTING files
    delay_seconds : pause before each fetch, in seconds (default 4)
    RETURNS
    None
    """
    doc = "README" if is_readme else "CONTRIBUTING"
    manifest_columns = ['commit_hash', 'upstream_vcs_link', 'repo_id', 'project_handle', 'new_filepath']
    # Read the input before entering the try block, so an unreadable path is
    # reported as such and no manifest is written for a run that never started.
    with open(csv_path, 'r', newline='') as file:
        lines = file.readlines()
    manifest_rows = []
    try:
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index < 1:
                continue  # header row
            time.sleep(delay_seconds)
            commit_hash = row[0]
            repo_id = row[12]
            project_handle = row[13]
            upstream_vcs_link = row[14]
            fetched, new_filepath = get_file(upstream_vcs_link, commit_hash, is_readme)
            # NOTE(review): the run stops at the first failed row and that row is
            # not recorded - confirm this is intended rather than skipping it.
            if not fetched:
                break
            manifest_rows.append({
                'commit_hash': commit_hash,
                'upstream_vcs_link': upstream_vcs_link,
                'repo_id': repo_id,
                'project_handle': project_handle,
                'new_filepath': new_filepath,
            })
    except KeyboardInterrupt:
        print("KeyBoardInterrrupt")
    finally:
        # Passing explicit columns keeps the manifest well-formed (header only)
        # even when no row succeeded.
        manifest_df = pd.DataFrame(manifest_rows, columns=manifest_columns)
        manifest_df.to_csv(f"013025_{doc}_manifest.csv", index=False)
if __name__ == "__main__":
    # Batch run over the README download list.
    readme_csv = "../misc_data_files/README_for_download.csv"
    for_all_files(readme_csv, True)
    # Single-repository spot checks, kept for manual debugging:
    #get_file("https://github.com/breakfastquay/rubberband", "a94f3f33577bf9d71166392febbfdf3cace6f1c8", True)
    #get_file("https://gitlab.freedesktop.org/gstreamer/gstreamer", "1762dfbf982a75d895676b0063379e33b4f9b96a", True)
    #get_file("https://github.com/ranger/ranger.git", "ea355f491fb10d5ce054c7813d9abdfd3fc68991" ,False)
    #get_file("https://gitlab.gnome.org/Archive/glade", "7e5dfa8ccd211945e624b0fab7fe2b19fb1b9907" ,False)