24_deb_pkg_gov/get_spec_file.py

import csv
from git import Repo
from perceval.backends.core.git import Git
import os
import datetime as dt
import time
import shutil
import pandas as pd
import dateutil
from tqdm import tqdm
import math
import io
import re

# Not referenced elsewhere in the visible code; get_file builds its output
# path from a hard-coded string under the same parent directory.
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing2"
# Prefix for the temporary clone directories that get_file creates and removes.
temp_dir = "/data/users/mgaughan/tmp3/"

# getting the specific readme or contributing file from a given commit
# inputs: upstream vcs link, commit hash, yes/no is it a readme
def get_file(vcs_link, commit_hash, is_readme):
    """Save the README or CONTRIBUTING file as it existed at one commit.

    The upstream repository is cloned with GitPython (to read the tree of
    ``commit_hash``) and fetched through Perceval (to scan the history for
    any commit touching a matching file). Both working copies live under
    ``temp_dir`` and are removed before the function exits, including when
    an exception propagates.

    Args:
        vcs_link: URL of the upstream repository.
        commit_hash: Revision whose top-level tree is searched for the file.
        is_readme: True to look for a README file, False for CONTRIBUTING.

    Returns:
        "NoFile" when no commit in the history touches a matching file,
        "KeyError -- the file is not in the commit tree" when the top-level
        tree of ``commit_hash`` holds no matching blob, and "NoError" once
        the file content has been written.

    Raises:
        Whatever GitPython or Perceval raise (failed clone, unknown commit);
        these are not caught here.
    """
    # The same keyword drives both the history scan and the tree lookup so
    # the two modes (README / CONTRIBUTING) stay consistent.
    keyword = "README" if is_readme else "CONTRIBUTING"

    if "github" in vcs_link or "gitlab" in vcs_link:
        # Assume sub-branches aren't used and that people would fork if
        # needed: keep only the first five "/"-separated components of the
        # link and use the fifth one as the repository name.
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        repo_name = vcs_link.split("/")[4]
    else:
        repo_name = vcs_link.split("/")[-1]
    full_temp_path = temp_dir + repo_name + ".git"
    other_temp_path = temp_dir + repo_name + ".git0"

    repo0 = None
    try:
        repo0 = Repo.clone_from(vcs_link, full_temp_path)
        commit0 = repo0.commit(commit_hash)
        history = Git(uri=vcs_link, gitpath=other_temp_path).fetch()

        # The whole history is consumed (no early exit) so that Perceval's
        # fetch generator always runs to completion.
        found_in_history = False
        for commit in history:
            for changed in commit["data"]["files"]:
                if keyword in changed["file"]:
                    found_in_history = True
        if not found_in_history:
            return "NoFile"

        # Only blobs directly in the commit's root tree are considered; when
        # several match, the last one wins.
        target_blob = None
        for blob in commit0.tree.blobs:
            if keyword in blob.path:
                target_blob = blob
        if target_blob is None:
            return "KeyError -- the file is not in the commit tree"

        last_path = "readme2" if is_readme else "contributing2"
        out_path = (
            "/data/users/mgaughan/kkex/time_specific_files/"
            + last_path + "/" + repo_name + "_" + target_blob.path
        )
        # Undecodable bytes are dropped rather than aborting the download.
        content = target_blob.data_stream.read().decode("utf-8", errors="ignore")
        # Explicit encoding: the decoded text may contain characters that the
        # locale's default encoding cannot represent.
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write(content)
        return "NoError"
    finally:
        # Release GitPython's handles before deleting the clone, then remove
        # both temporary working copies on every exit path.
        if repo0 is not None:
            repo0.close()
        shutil.rmtree(full_temp_path, ignore_errors=True)
        shutil.rmtree(other_temp_path, ignore_errors=True)

def for_all_files():
    """Run get_file over every row of the input CSV and log the outcomes.

    Reads "final_data/deb_contrib_did_data.csv", skipping the first row
    (header), blank rows and rows whose first column is empty. For each
    remaining row, column 0 is passed to get_file as the VCS link and
    column 2 as the commit hash.

    Outputs (both truncated at start):
        e_031824_spec_errors.csv: one row ``[link, commit, readme_is,
            reason]`` for every call that did not return "NoError",
            including calls that raised an exception and rows too short to
            contain a commit hash.
        contributing_completed_downloads.csv: the input row of the first
            successful download for each distinct link.
    """
    # Toggle this based on readme or contributing files.
    readme_is = False
    csv_path = "final_data/deb_contrib_did_data.csv"
    # Links already written to the completed-downloads file; a set keeps the
    # duplicate check constant-time.
    saved = set()
    # newline="" is what the csv module expects for files it reads or writes.
    with open(csv_path, "r", newline="") as file, \
            open("e_031824_spec_errors.csv", "w", newline="") as writing_file, \
            open("contributing_completed_downloads.csv", "w", newline="") as writing_file2:
        error_writer = csv.writer(writing_file)
        done_writer = csv.writer(writing_file2)
        # Materialise the lines first so tqdm can show a total.
        lines = file.readlines()
        for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
            if index == 0:
                # Header row.
                continue
            if not row or row[0] == "":
                # csv.reader yields [] for blank lines; nothing to download.
                continue
            if len(row) < 3:
                error_writer.writerow(
                    [row[0], "", readme_is, "MalformedRow -- missing commit hash column"]
                )
                continue
            try:
                return_value = get_file(row[0], row[2], readme_is)
            except Exception as exc:
                # One unreachable repository or unknown commit must not abort
                # the whole batch; record it alongside the other failures.
                return_value = type(exc).__name__ + ": " + str(exc)
            if return_value != "NoError":
                error_writer.writerow([row[0], row[2], readme_is, return_value])
            elif row[0] not in saved:
                # Duplicated links are still downloaded again, but only the
                # first success is listed.
                saved.add(row[0])
                done_writer.writerow(row)

# Run the batch download only when executed as a script, not on import.
if __name__ == "__main__":
    for_all_files()