import csv
import datetime as dt
import io
import math
import os
import re
import shutil
import time

import dateutil
import pandas as pd
from git import Repo
from perceval.backends.core.git import Git
from tqdm import tqdm

working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme"
temp_dir = "/data/users/mgaughan/tmp3/"


def get_file(vcs_link, commit_hash, is_readme):
    """Save the README or CONTRIBUTING file of a repository as of one commit.

    The repository is cloned twice under ``temp_dir``: once with GitPython
    (to read the tree of ``commit_hash``) and once through Perceval (to scan
    the commit history for any file whose path contains the keyword). Both
    clones are removed before returning, including when an exception is
    raised.

    Args:
        vcs_link: Upstream repository URL. For links containing "github" or
            "gitlab", only the first five "/"-separated segments are kept, so
            a link such as ``https://github.com/owner/repo/tree/dev1`` is
            reduced to ``https://github.com/owner/repo``; sub-branch links are
            therefore treated as the main repository.
        commit_hash: Hash of the commit whose tree is read.
        is_readme: True to extract the README, False to extract CONTRIBUTING.

    Returns:
        "NoError" when the file was written; otherwise a short description of
        the problem ("NoFile" when no commit in the history touches a matching
        file, "KeyError -- ..." when the commit's root tree holds no matching
        blob, "UnicodeDecodeError -- ..." when the blob is not UTF-8 text).

    Raises:
        Whatever the clone or the commit lookup raises (for example when the
        repository is unreachable or the hash is unknown); the temporary
        clones are still cleaned up in that case.
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # Assumes sub-branches are not used and that people would fork if
        # needed, so only the main repository URL is considered.
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        repo_name = vcs_link.split("/")[4]
    else:
        repo_name = vcs_link.split("/")[-1]

    full_temp_path = temp_dir + repo_name + ".git"
    other_temp_path = temp_dir + repo_name + ".git0"
    keyword = "README" if is_readme else "CONTRIBUTING"
    last_path = "readme" if is_readme else "contributing"

    # A previous run that crashed may have left clones behind; clone_from
    # refuses to clone into an existing non-empty directory.
    shutil.rmtree(full_temp_path, ignore_errors=True)
    shutil.rmtree(other_temp_path, ignore_errors=True)

    try:
        repo0 = Repo.clone_from(vcs_link, full_temp_path)
        repo = Git(uri=vcs_link, gitpath=other_temp_path)
        commit0 = repo0.commit(commit_hash)

        # Scan the whole history for a file whose path mentions the keyword.
        # The last match wins; README names are reduced to their basename
        # while CONTRIBUTING keeps its full path.
        target_filename = ""
        for commit in repo.fetch():
            for entry in commit['data']['files']:
                path = entry['file']
                if keyword not in path:
                    continue
                if is_readme:
                    target_filename = path.split("/")[-1]
                else:
                    target_filename = str(path)

        if target_filename == "":
            return "NoFile"
        print(target_filename)

        # The name found in the history may not exist at this commit, so the
        # commit's own tree is searched instead. NOTE: tree.blobs only lists
        # root-level files, so a nested file (e.g. under a docs directory) is
        # reported as missing from the commit tree.
        targetfile = None
        for blob in commit0.tree.blobs:
            if keyword in blob.path:
                targetfile = blob
                print(blob.path)
        if targetfile is None:
            return "KeyError -- the file is not in the commit tree"

        # Decode before opening the output so that a failure does not leave
        # an empty file behind.
        try:
            text = targetfile.data_stream.read().decode('utf-8')
        except UnicodeDecodeError:
            return "UnicodeDecodeError -- the file is not valid UTF-8"

        out_path = (
            "/data/users/mgaughan/kkex/time_specific_files/"
            + last_path + "/" + repo_name + "_" + last_path + ".md"
        )
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write(text)
        return "NoError"
    finally:
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)


def for_all_files():
    """Run get_file for every row of the input CSV and log the failures.

    The first line of the input is treated as a header and skipped. Column 0
    of each row is the repository link and column 2 the commit hash. Rows
    whose result is anything other than "NoError" are appended to
    ``a_031824_spec_errors.csv`` as (link, commit hash, readme flag, result).
    """
    # Toggle this based on readme or contributing files.
    readme_is = True
    csv_path = "kk_031624_pr_did.csv"

    with open(csv_path, 'r', newline="") as in_file:
        with open('a_031824_spec_errors.csv', "w", newline="") as err_file:
            csv_writer = csv.writer(err_file)
            # Read everything up front so tqdm can show a total.
            lines = in_file.readlines()
            rows = tqdm(csv.reader(lines), total=len(lines))
            for index, row in enumerate(rows):
                if index == 0:
                    continue  # header line
                if not row or row[0] == "":
                    continue  # blank line or row without a repository link
                return_value = get_file(row[0], row[2], readme_is)
                if return_value != "NoError":
                    csv_writer.writerow(
                        [row[0], row[2], readme_is, return_value]
                    )


if __name__ == "__main__":
    for_all_files()