new file to grab time-specific governance docs

Matthew Gaughan 2024-03-16 06:14:03 -05:00
parent 65d970bbd3
commit eaa84d33e0
2 changed files with 76 additions and 9 deletions

get_spec_file.py Normal file

@@ -0,0 +1,66 @@
import csv
import os
import shutil
import io
from git import Repo
from tqdm import tqdm

working_dir = "/data/users/mgaughan/kkex/time_specific_files_readme"
temp_dir = "/data/users/mgaughan/tmp3/"

# getting the specific README or CONTRIBUTING file from a given commit
# inputs: upstream VCS link, commit hash, whether the target is a README
def get_file(vcs_link, commit_hash, is_readme):
    if "github" in vcs_link or "gitlab" in vcs_link:
        # making an evaluation that sub-branches aren't being used and that
        # people would fork if needed; this only looks at main
        vcs_link = "/".join(vcs_link.split("/")[0:5])
        full_temp_path = temp_dir + vcs_link.split("/")[4] + ".git"
    else:
        full_temp_path = temp_dir + vcs_link.split("/")[-1] + ".git"
    repo = Repo.clone_from(vcs_link, full_temp_path)
    commit = repo.commit(commit_hash)
    # getting the name of the file from the root directory, falling back to
    # the conventional name if no match is found
    target_filename = "README.md" if is_readme else "CONTRIBUTING.md"
    keyword = "README" if is_readme else "CONTRIBUTING"
    for filename in os.listdir(full_temp_path):
        if keyword in filename:
            target_filename = filename
            break
    targetfile = commit.tree / target_filename
    last_path = "readme" if is_readme else "contributing"
    out_path = ("/data/users/mgaughan/kkex/time_specific_files/" + last_path
                + "/" + full_temp_path[len(temp_dir):-4] + "_" + last_path + ".md")
    with open(out_path, "w") as file:
        with io.BytesIO(targetfile.data_stream.read()) as f:
            file.write(f.read().decode("utf-8"))
    shutil.rmtree(full_temp_path, ignore_errors=True)

def for_all_files():
    # toggle this based on readme or contributing files
    readme_is = True
    csv_path = "kk_test_031324_pr_did.csv"
    with open(csv_path, "r") as file:
        #csv_reader = csv.DictReader(file)
        lines = [line for line in file]
    for row in tqdm(csv.reader(lines), total=len(lines)):
        if row[0] == "":
            continue
        #print(row[0])
        #get_file(row[0], row[?], readme_is)
        get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)

if __name__ == "__main__":
    for_all_files()
    #get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
    #get_file('https://github.com/krahets/hello-algo/tree/dev1', 'f615ad42ef3c58cfc6f080b8fb0cd0eb741706a9', True )
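For reference, the core GitPython pattern this script leans on is resolving a blob at a pinned commit and reading it through data_stream, with no checkout of the historical tree. A minimal sketch, assuming GitPython is installed; the tqdm URL and hash are the test values used above, and the README.rst filename is an assumption (the script discovers the real name via os.listdir):

from git import Repo

# clone once, then address the file as it existed at the pinned commit
repo = Repo.clone_from("https://github.com/tqdm/tqdm", "/tmp/tqdm.git")
commit = repo.commit("fbe7952cce11e8073378b063bdae7ab277a96eb8")
blob = commit.tree / "README.rst"  # filename assumed, see note above
print(blob.data_stream.read().decode("utf-8")[:200])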


@@ -12,7 +12,7 @@ import math
 key = os.environ.get('KKEXKEY')
 early_cutoff = dt.datetime(2008,2, 8)
-temp_dir = "/data/users/mgaughan/tmp"
+temp_dir = "/data/users/mgaughan/tmp/"
 '''
 - rate of change, rate of all/day
@@ -24,8 +24,6 @@ def file_get_pr(upstream_vcs_link, me_read):
     #print(upstream_vcs_link.split('/')[4])
     project_dict = {}
     project_dict['upstream_vcs_link'] = upstream_vcs_link
-    if upstream_vcs_link == " https://gitlab.com/ubports/core/cmake-extras":
-        return {}
     upstream_vcs_link = upstream_vcs_link.strip()
     if "github" in upstream_vcs_link or "gitlab" in upstream_vcs_link:
         #making an evaluation that sub branches aren't being used and that people would fork if needed
@@ -36,6 +34,9 @@
     else:
         full_temp_path = temp_dir + upstream_vcs_link.split('/')[-1] + ".git"
     print(upstream_vcs_link)
+    if upstream_vcs_link == "https://gitlab.com/ubports/core":
+        shutil.rmtree(full_temp_path, ignore_errors=True)
+        return {}
     repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
     try:
         commits = repo.fetch()
@ -144,13 +145,13 @@ def pr_count(start, end, commits, author_roster, commit_roster):
return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
def for_files():
csv_path = "final_data/kk_final_readme_roster.csv"
csv_path = "final_data/deb_readme_roster.csv"
count = 0
with open(csv_path, 'r') as file:
csv_reader = csv.DictReader(file)
with open('kk_test_031424_pr_did.csv', "w") as writing_file:
with open('kk_test_031624_pr_did.csv', "w") as writing_file:
# this would also have to get switched fro the cont dataset
keys = ['upstream_vcs_link', "first_readme", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
dict_writer = csv.DictWriter(writing_file, keys)
dict_writer.writeheader()
for row in csv_reader:
@@ -162,9 +163,9 @@ def for_files():
 if __name__ == "__main__":
-    #for_files()
-    file_get_pr("https://github.com/tqdm/tqdm", True)
+    for_files()
+    #file_get_pr("https://github.com/tqdm/tqdm", True)
     #file_get_pr("https://github.com/GameServerManagers/LinuxGSM", True)
     #file_get_pr("https://github.com/walling/unorm/issues/new/", True)
-    file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)
+    #file_get_pr("https://github.com/krahets/hello-algo/tree/dev1", True)