Updates to getting the first-version file
This commit is contained in:
parent
4dae764bf8
commit
da8e1c0e45
154
12825_revision/spec_file/updated_get_spec_file.py
Normal file
154
12825_revision/spec_file/updated_get_spec_file.py
Normal file
@ -0,0 +1,154 @@
|
||||
import git
|
||||
from tqdm import tqdm
|
||||
import csv
|
||||
import os
|
||||
import shutil
|
||||
import pandas as pd
|
||||
import time
|
||||
|
||||
#destination DIR:
|
||||
dest_dir="/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/013025_readme/"
|
||||
#temp DIR:
|
||||
temp_dir="/data/users/mgaughan/tmp3/"
|
||||
|
||||
def temp_clone(vcs_link, temp_location):
    """
    Clone the repository at ``vcs_link`` into ``temp_location``.

    ARGS
    vcs_link : url link to upstream repo vcs
    temp_location : filepath to where the repo should be cloned to

    RETURNS
    repo : the GitRepository object of the cloned repo
    repo_path : the filepath to the cloned repository
    """
    url = vcs_link.strip()
    # Fails loudly if the directory already exists -- any leftover clone
    # should have been removed by delete_clone() before calling this.
    os.makedirs(temp_location)
    cloned_repo = git.Repo.clone_from(url, temp_location)
    print(f"Successfully Cloned {url}")
    return cloned_repo, temp_location
|
||||
|
||||
|
||||
def delete_clone(temp_location):
    """
    Remove a previously cloned repository from disk.

    ARGS
    temp_location : filepath to the cloned repository

    RETURNS
    whether or not the deletion was a success (0 = deleted, 1 = nothing there)
    """
    # Guard clause: nothing to do when the path is already gone.
    if not os.path.exists(temp_location):
        print("No clone at location")
        return 1
    shutil.rmtree(temp_location)
    print(f"{temp_location} has been deleted.")
    return 0
|
||||
|
||||
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """
    Clone the repository behind ``vcs_link``, reset it to ``commit_hash``,
    and copy the first README*/CONTRIBUTING* file found into ``dest_dir``.

    ARGS
    vcs_link : url of the upstream repository (GitHub or another forge)
    commit_hash : commit to reset the clone to before searching
    is_readme : True to look for README*, False for CONTRIBUTING*

    RETURNS
    (success, dest_path) : success flag and the path the document was
    copied to ("" on failure). The temporary clone is always removed and
    the working directory restored, even on failure.
    """
    cwd = os.getcwd()
    # Disable all interactive git prompting so a bad/private URL fails
    # instead of hanging the batch run waiting for credentials.
    os.environ['GIT_SSH_COMMAND'] = 'ssh -o StrictHostKeyChecking=no'
    os.environ['GIT_ASKPASS'] = 'false'
    os.environ['GIT_TERMINAL_PROMPT'] = '0'
    try:
        ssh_url = ""
        try:
            if "github" in vcs_link:
                # For GitHub, clone over SSH directly.
                repo_id = vcs_link[len('https://github.com/'):]
                ssh_url = f'git@github.com:{repo_id}.git'
                if ssh_url.endswith('.git.git'):
                    ssh_url = ssh_url[:-4]
                temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
            else:
                parts = vcs_link.split('/')
                domain = parts[2]
                repo_id = '/'.join(parts[3:])
                try:
                    temp_repo, temp_repo_path = temp_clone(vcs_link, temp_dir)
                except Exception as e:
                    # HTTPS clone failed; clean up and retry over SSH.
                    print(f'non-Github cloning error, assuming HTTPS issue: {e}')
                    delete_clone(temp_dir)
                    ssh_url = f'git@{domain}:{repo_id}.git'
                    if ssh_url.endswith('.git.git'):
                        ssh_url = ssh_url[:-4]
                    temp_repo, temp_repo_path = temp_clone(ssh_url, temp_dir)
        except Exception as e:
            print(f'cloning error at {vcs_link}')
            print(f'inside cloning error: {e}')
            raise ValueError(e)

        os.chdir(temp_repo_path)
        # NOTE(review): commit_hash is interpolated into a shell command; it
        # comes from the manifest CSV and is assumed to be a plain hex hash --
        # confirm upstream, otherwise this is shell-injectable.
        reset_command = f"git reset --hard {commit_hash}"
        os.system(reset_command)

        doc_name = "CONTRIBUTING"
        if is_readme:
            doc_name = "README"

        und_repo_id = '_'.join(repo_id.split("/"))

        # BUG FIX: doc_found/doc_filename/doc_path were previously unbound
        # when no matching file existed, raising a NameError that the broad
        # except below silently converted into a generic failure. Initialize
        # them and handle the not-found case explicitly.
        doc_found = False
        doc_filename = ""
        doc_path = ""
        for root, dirs, files in os.walk(temp_repo_path):
            for fname in files:  # renamed from `file` (shadowed a builtin)
                if fname.startswith(doc_name):
                    doc_found = True
                    doc_filename = fname
                    doc_path = os.path.join(root, fname)
                    break
            if doc_found:
                break
        if not doc_found:
            print(f"no {doc_name} file found in {vcs_link}")
            return False, ""
        dest_path = os.path.join(dest_dir, f"{und_repo_id}_hullabaloo_{doc_filename}")
        shutil.copy(doc_path, dest_path)
    except Exception as e:
        print(f"outside cloning error: {e}")
        return False, ""
    finally:
        # Always clean up the clone and restore the working directory.
        und_repo_id = ""
        delete_clone(temp_dir)
        os.chdir(cwd)

    return True, dest_path
|
||||
|
||||
def for_all_files(csv_path, is_readme):
    """
    Run get_file() for every row of the manifest CSV at ``csv_path`` and
    record where each fetched document was copied.

    ARGS
    csv_path : path to the input manifest CSV (header row expected; columns
               0/12/13/14 are commit_hash/repo_id/project_handle/vcs_link)
    is_readme : True to fetch README files, False for CONTRIBUTING files

    Writes 013025_{README|CONTRIBUTING}_manifest.csv as a side effect,
    containing only the rows processed so far (also on interrupt/failure).
    """
    doc = "README" if is_readme else "CONTRIBUTING"
    # BUG FIX: initialize before the try block so the finally clause cannot
    # hit a NameError if an interrupt arrives before any row is processed.
    new_manifest_list = []
    try:
        # Removed a dead csv.DictReader that was created but never used.
        with open(csv_path, 'r') as file:
            lines = [line for line in file]
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index < 1:
                # skip the header row
                continue
            # crude rate limiting so the upstream forges aren't hammered
            time.sleep(4)
            manifest_df = pd.DataFrame({
                'commit_hash': [row[0]],
                'upstream_vcs_link': [row[14]],
                'repo_id': [row[12]],
                'project_handle': [row[13]]
            })
            _check, new_filepath = get_file(manifest_df['upstream_vcs_link'][0], manifest_df['commit_hash'][0], is_readme)
            manifest_df['new_filepath'] = new_filepath
            if not _check:
                # stop at the first hard failure so the run can be resumed
                break
            new_manifest_list.append(manifest_df)
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    finally:
        # BUG FIX: pd.concat([]) raises ValueError; only write a manifest
        # when at least one row succeeded.
        if new_manifest_list:
            manifest_df = pd.concat(new_manifest_list, ignore_index=True)
            manifest_df.to_csv(f"013025_{doc}_manifest.csv", index=False)
|
||||
|
||||
if __name__ == "__main__":
|
||||
for_all_files("../misc_data_files/README_for_download.csv", True)
|
||||
#get_file("https://github.com/breakfastquay/rubberband", "a94f3f33577bf9d71166392febbfdf3cace6f1c8", True)
|
||||
#get_file("https://gitlab.freedesktop.org/gstreamer/gstreamer", "1762dfbf982a75d895676b0063379e33b4f9b96a", True)
|
||||
#get_file("https://github.com/ranger/ranger.git", "ea355f491fb10d5ce054c7813d9abdfd3fc68991" ,False)
|
||||
#get_file("https://gitlab.gnome.org/Archive/glade", "7e5dfa8ccd211945e624b0fab7fe2b19fb1b9907" ,False)
|
Loading…
Reference in New Issue
Block a user