Updates to getting the first-version file
This commit is contained in:
parent
4dae764bf8
commit
da8e1c0e45
154
12825_revision/spec_file/updated_get_spec_file.py
Normal file
154
12825_revision/spec_file/updated_get_spec_file.py
Normal file
@ -0,0 +1,154 @@
|
||||
import git
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
import time
|
||||
|
||||
#destination DIR:
|
||||
dest_dir="/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/013025_readme/"
|
||||
#temp DIR:
|
||||
temp_dir="/data/users/mgaughan/tmp3/"
|
||||
|
||||
def temp_clone(vcs_link, temp_location):
    """
    Clone the repository at ``vcs_link`` into ``temp_location``.

    ARGS
    vcs_link : url link to upstream repo vcs
    temp_location : filepath to where the repo should be cloned to

    RETURNS
    repo : the GitRepository object of the cloned repo
    repo_path : the filepath to the cloned repository
    """
    url = vcs_link.strip()
    # Fails loudly if the directory already exists -- any leftover clone
    # should have been removed by delete_clone() before calling this.
    os.makedirs(temp_location)
    cloned_repo = git.Repo.clone_from(url, temp_location)
    print(f"Successfully Cloned {url}")
    return cloned_repo, temp_location
|
||||
|
||||
|
||||
def delete_clone(temp_location):
    """
    Remove a previously cloned repository from disk.

    ARGS
    temp_location : filepath to the cloned repository

    RETURNS
    whether or not the deletion was a success (0 = deleted, 1 = nothing there)
    """
    # Guard clause: nothing to do when the path is already gone.
    if not os.path.exists(temp_location):
        print("No clone at location")
        return 1
    shutil.rmtree(temp_location)
    print(f"{temp_location} has been deleted.")
    return 0
|
||||
|
||||
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """
    Clone the repository behind ``vcs_link``, reset it to ``commit_hash``,
    and copy the first README*/CONTRIBUTING* file found into ``dest_dir``.

    ARGS
    vcs_link : url of the upstream repository (GitHub or another forge)
    commit_hash : commit to reset the clone to before searching
    is_readme : True to look for README*, False for CONTRIBUTING*

    RETURNS
    (success, dest_path) : success flag and the path the document was
    copied to ("" on failure). The temporary clone is always removed and
    the working directory restored, even on failure.
    """
    cwd = os.getcwd()
    # Disable all interactive git prompting so a bad/private URL fails
    # instead of hanging the batch run waiting for credentials.
    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    os.environ['GIT_ASKPASS'] = 'false'
    os.environ['GIT_TERMINAL_PROMPT'] = '0'
    try:
        ssh_url = ""
        try:
            if "github" in vcs_link:
                # For GitHub, clone over SSH directly.
                repo_id = vcs_link[len('https://github.com/'):]
                ssh_url = f'git@github.com:{repo_id}.git'
                if ssh_url.endswith('.git.git'):
                    ssh_url = ssh_url[:-4]
                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
            else:
                parts = vcs_link.split('/')
                domain = parts[2]
                repo_id = '/'.join(parts[3:])
                try:
                    temp_repo, temp_repo_path = temp_clone(vcs_link, temp_dir)
                except Exception as e:
                    # HTTPS clone failed; clean up and retry over SSH.
                    print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                    delete_clone(temp_dir)
                    ssh_url = f'git@{domain}:{repo_id}.git'
                    if ssh_url.endswith('.git.git'):
                        ssh_url = ssh_url[:-4]
                    temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
        except Exception as e:
            print(f'cloning error at {vcs_link}')
            print(f'inside cloning error: {e}')
            raise ValueError(e)

        os.chdir(temp_repo_path)
        # NOTE(review): commit_hash is interpolated into a shell command; it
        # comes from the manifest CSV and is assumed to be a plain hex hash --
        # confirm upstream, otherwise this is shell-injectable.
        reset_command = f"git reset --hard {commit_hash}"
        os.system(reset_command)

        doc_name = "CONTRIBUTING"
        if is_readme:
            doc_name = "README"

        und_repo_id = '_'.join(repo_id.split("/"))

        # BUG FIX: doc_found/doc_filename/doc_path were previously unbound
        # when no matching file existed, raising a NameError that the broad
        # except below silently converted into a generic failure. Initialize
        # them and handle the not-found case explicitly.
        doc_found = False
        doc_filename = ""
        doc_path = ""
        for root, dirs, files in os.walk(temp_repo_path):
            for fname in files:  # renamed from `file` (shadowed a builtin)
                if fname.startswith(doc_name):
                    doc_found = True
                    doc_filename = fname
                    doc_path = os.path.join(root, fname)
                    break
            if doc_found:
                break
        if not doc_found:
            print(f"no {doc_name} file found in {vcs_link}")
            return False, ""
        dest_path = os.path.join(dest_dir, f"{und_repo_id}_hullabaloo_{doc_filename}")
        shutil.copy(doc_path, dest_path)
    except Exception as e:
        print(f"outside cloning error: {e}")
        return False, ""
    finally:
        # Always clean up the clone and restore the working directory.
        und_repo_id = ""
        delete_clone(temp_dir)
        os.chdir(cwd)

    return True, dest_path
|
||||
|
||||
def for_all_files(csv_path, is_readme):
    """
    Run get_file() for every row of the manifest CSV at ``csv_path`` and
    record where each fetched document was copied.

    ARGS
    csv_path : path to the input manifest CSV (header row expected; columns
               0/12/13/14 are commit_hash/repo_id/project_handle/vcs_link)
    is_readme : True to fetch README files, False for CONTRIBUTING files

    Writes 013025_{README|CONTRIBUTING}_manifest.csv as a side effect,
    containing only the rows processed so far (also on interrupt/failure).
    """
    doc = "README" if is_readme else "CONTRIBUTING"
    # BUG FIX: initialize before the try block so the finally clause cannot
    # hit a NameError if an interrupt arrives before any row is processed.
    new_manifest_list = []
    try:
        # Removed a dead csv.DictReader that was created but never used.
        with open(csv_path, 'r') as file:
            lines = [line for line in file]
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index < 1:
                # skip the header row
                continue
            # crude rate limiting so the upstream forges aren't hammered
            time.sleep(4)
            manifest_df = pd.DataFrame({
                'commit_hash': [row[0]],
                'upstream_vcs_link': [row[14]],
                'repo_id': [row[12]],
                'project_handle': [row[13]]
            })
            _check, new_filepath = get_file(manifest_df['upstream_vcs_link'][0], manifest_df['commit_hash'][0], is_readme)
            manifest_df['new_filepath'] = new_filepath
            if not _check:
                # stop at the first hard failure so the run can be resumed
                break
            new_manifest_list.append(manifest_df)
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        # BUG FIX: pd.concat([]) raises ValueError; only write a manifest
        # when at least one row succeeded.
        if new_manifest_list:
            manifest_df = pd.concat(new_manifest_list, ignore_index=True)
            manifest_df.to_csv(f"013025_{doc}_manifest.csv", index=False)
|
||||
|
||||
if __name__ == "__main__":
|
||||
for_all_files("../misc_data_files/README_for_download.csv", True)
|
||||
#get_file("https://github.com/breakfastquay/rubberband", "a94f3f33577bf9d71166392febbfdf3cace6f1c8", True)
|
||||
#get_file("https://gitlab.freedesktop.org/gstreamer/gstreamer", "1762dfbf982a75d895676b0063379e33b4f9b96a", True)
|
||||
#get_file("https://github.com/ranger/ranger.git", "ea355f491fb10d5ce054c7813d9abdfd3fc68991" ,False)
|
||||
#get_file("https://gitlab.gnome.org/Archive/glade", "7e5dfa8ccd211945e624b0fab7fe2b19fb1b9907" ,False)
|
Loading…
Reference in New Issue
Block a user