24_deb_pkg_gov/get_spec_file.py

82 lines
3.2 KiB
Python
Raw Normal View History

import csv
from git import Repo
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
import io
2024-03-26 16:46:28 +00:00
# NOTE(review): presumably the output directory for extracted README files;
# it is not referenced by the code visible in this section -- confirm before removing.
working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme"
# Scratch directory that get_file clones repositories into. Paths are built by
# plain string concatenation, so the trailing "/" is required.
temp_dir = "/data/users/mgaughan/tmp3/"
# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """Save a repository's root README or CONTRIBUTING file as of a commit.

    The repository is cloned into ``temp_dir``, the file is read from the tree
    of ``commit_hash`` and written, decoded as UTF-8, to
    ``/data/users/mgaughan/kkex/time_specific_files/<kind>/<repo>_<kind>.md``
    where ``<kind>`` is ``readme`` or ``contributing``. The temporary clone is
    removed before returning, on every exit path.

    Args:
        vcs_link: Upstream VCS URL of the project.
        commit_hash: Hash of the commit whose version of the file is wanted.
        is_readme: True to fetch the README, False to fetch CONTRIBUTING.

    Returns:
        "NoError" if the file was written, "NoFile" if no matching file name
        exists in the root of the cloned working directory, or "KeyError" if
        that file name is not present in the tree of ``commit_hash``.

    Errors raised while cloning, resolving the commit, or decoding/writing the
    file are not caught here and propagate to the caller.
    """
    if "github" in vcs_link or "gitlab" in vcs_link:
        # making an evaluation that sub branches aren't being used and that
        # people would fork if needed; this only looks at main
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = temp_dir + vcs_link.split("/")[4] + ".git"
    else:
        full_temp_path = temp_dir + vcs_link.split("/")[-1] + ".git"

    # A clone left behind by an interrupted run would make clone_from fail.
    shutil.rmtree(full_temp_path, ignore_errors=True)
    try:
        repo = Repo.clone_from(vcs_link, full_temp_path)
        try:
            commit = repo.commit(commit_hash)

            # getting the name of the file from the root directory
            # NOTE(review): os.listdir sees the checked-out working tree of the
            # fresh clone, not the tree of `commit_hash`. A file that exists at
            # checkout time but not at that commit is what yields "KeyError".
            wanted = "README" if is_readme else "CONTRIBUTING"
            target_filename = ""
            for filename in os.listdir(full_temp_path):
                if wanted in filename:
                    target_filename = filename
            if target_filename == "":
                return "NoFile"

            try:
                targetfile = commit.tree / target_filename
            except KeyError:
                return "KeyError"

            last_path = "readme" if is_readme else "contributing"
            # Decode before opening the output so a decode failure does not
            # leave an empty output file behind.
            content = targetfile.data_stream.read().decode("utf-8")
            out_path = (
                "/data/users/mgaughan/kkex/time_specific_files/"
                + last_path
                + "/"
                + full_temp_path[len(temp_dir):-4]
                + "_"
                + last_path
                + ".md"
            )
            with open(out_path, "w", encoding="utf-8") as file:
                file.write(content)
            return "NoError"
        finally:
            # Release the git helper processes/handles held by GitPython.
            repo.close()
    finally:
        # Always drop the temporary clone, including on early returns and
        # exceptions, so the next clone of the same repo name can succeed.
        shutil.rmtree(full_temp_path, ignore_errors=True)
def for_all_files():
    """Run get_file for every data row of the input CSV and log failures.

    Reads ``kk_031624_pr_did.csv``, skipping the first (header) row, blank
    rows, and rows whose first column is empty. Column 0 is passed to
    get_file as the VCS link and column 2 as the commit hash. Each row whose
    result is not "NoError" is appended to ``031824_spec_errors.csv`` as
    ``[link, commit hash, readme flag, return value]``.
    """
    # toggle this based on readme or contributing files
    readme_is = True
    csv_path = "kk_031624_pr_did.csv"
    # newline="" is what the csv module expects for files it reads or writes.
    with open(csv_path, "r", newline="") as file, \
            open("031824_spec_errors.csv", "w", newline="") as writing_file:
        csv_writer = csv.writer(writing_file)
        # Read all lines up front so tqdm can display a total.
        lines = list(file)
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index == 0:
                # header row
                continue
            # csv.reader yields [] for blank lines; indexing those would raise.
            if not row or row[0] == "":
                continue
            return_value = get_file(row[0], row[2], readme_is)
            if return_value != "NoError":
                csv_writer.writerow([row[0], row[2], readme_is, return_value])
#get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
# Script entry point: run the batch job defined in for_all_files().
if __name__ == "__main__":
    for_all_files()
    # Single-repository calls, left commented out:
    #get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
    #get_file('https://github.com/krahets/hello-algo/tree/dev1', 'f615ad42ef3c58cfc6f080b8fb0cd0eb741706a9', True )