import csv
import datetime as dt
import io
import math
import os
import shutil
import time

import dateutil
import pandas as pd
from git import Repo
from tqdm import tqdm

working_dir = "/data/users/mgaughan/kkex/time_specific_files_readme"
temp_dir = "/data/users/mgaughan/tmp3/"


def _find_root_blob(commit, default_name, stem):
    """Pick the file in the root of ``commit``'s tree that matches ``stem``.

    An exact match on ``default_name`` wins; otherwise the first root-level
    file whose name contains ``stem`` (case-sensitive) is used.  When nothing
    matches, ``default_name`` is looked up directly so that GitPython raises
    its usual lookup error for a missing path.
    """
    # Search the tree of the requested commit rather than the checked-out
    # working copy: the checkout reflects HEAD, where the file may have been
    # renamed, added or removed since ``commit``.
    candidates = [blob for blob in commit.tree.blobs if stem in blob.name]
    for blob in candidates:
        if blob.name == default_name:
            return blob
    if candidates:
        return candidates[0]
    return commit.tree / default_name


# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """Save the README or CONTRIBUTING file of a repository at a given commit.

    The repository is cloned into ``temp_dir``, the file is read from the
    root of the tree at ``commit_hash``, decoded as UTF-8 and written to
    ``/data/users/mgaughan/kkex/time_specific_files/<kind>/<repo>_<kind>.md``
    where ``<kind>`` is ``readme`` or ``contributing``.  The temporary clone
    is removed afterwards, whether or not the extraction succeeded.

    Args:
        vcs_link: Upstream repository URL.
        commit_hash: Hash of the commit whose version of the file is wanted.
        is_readme: True to fetch the README, False for CONTRIBUTING.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
        Exception: Errors from cloning, commit lookup or file lookup are
            propagated from GitPython unchanged.
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # making an evaluation that sub branches aren't being used and that
        # people would fork if needed; this only looks at main, so anything
        # after scheme://host/owner/repo (e.g. /tree/<branch>) is dropped
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        repo_name = vcs_link.split("/")[4]
    else:
        # Ignore a trailing slash so the last path component is never empty.
        repo_name = vcs_link.rstrip("/").split("/")[-1]
    full_temp_path = temp_dir + repo_name + ".git"

    if is_readme:
        default_name, stem, last_path = "README.md", "README", "readme"
    else:
        default_name, stem, last_path = (
            "CONTRIBUTING.md",
            "CONTRIBUTING",
            "contributing",
        )

    output_dir = "/data/users/mgaughan/kkex/time_specific_files/" + last_path
    output_path = output_dir + "/" + repo_name + "_" + last_path + ".md"

    try:
        repo = Repo.clone_from(vcs_link, full_temp_path)
        commit = repo.commit(commit_hash)
        targetfile = _find_root_blob(commit, default_name, stem)
        # Decode before opening the output so a failure here does not leave
        # an empty file behind.
        text = targetfile.data_stream.read().decode("utf-8")
        os.makedirs(output_dir, exist_ok=True)
        # Explicit encoding: the text was decoded as UTF-8 and must not be
        # re-encoded with whatever the locale default happens to be.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(text)
    finally:
        # Always drop the clone; a leftover directory would make the next
        # clone into the same path fail.
        shutil.rmtree(full_temp_path, ignore_errors=True)


def for_all_files():
    """Iterate over the rows of the input CSV, skipping empty ones.

    The per-row ``get_file`` call is still disabled because the column that
    holds the commit hash has not been filled in (see the TODO below), so
    this currently only walks the file with a progress bar.
    """
    # toggle this based on readme or contributing files
    readme_is = True
    csv_path = "kk_test_031324_pr_did.csv"
    with open(csv_path, "r", newline="") as file:
        # csv_reader = csv.DictReader(file)
        # Lines are read up front so tqdm can be given a total.
        lines = file.readlines()
    for row in tqdm(csv.reader(lines), total=len(lines)):
        # csv.reader yields [] for blank lines, so guard before indexing.
        if not row or row[0] == "":
            continue
        # print(row[0])
        # TODO: fill in the commit-hash column index and enable:
        # get_file(row[0], row[?], readme_is)
# Example of a single direct call:
#get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)


# Script entry point: process every row of the input CSV.
if __name__ == "__main__":
    for_all_files()
    # Manual one-off runs kept for reference:
    #get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
    #get_file('https://github.com/krahets/hello-algo/tree/dev1', 'f615ad42ef3c58cfc6f080b8fb0cd0eb741706a9', True )