updated scripts

This commit is contained in:
Matthew Gaughan 2024-03-18 18:03:41 -05:00
parent a342c2a67f
commit c57b760d5c
2 changed files with 32 additions and 17 deletions

View File

@ -26,16 +26,23 @@ def get_file(vcs_link, commit_hash, is_readme):
repo = Repo.clone_from(vcs_link, full_temp_path)
commit = repo.commit(commit_hash)
#getting the name of the file from the root directory
target_filename = ""
for filename in os.listdir(full_temp_path):
if is_readme:
target_filename = "README.md"
#target_filename = "README.md"
if "README" in filename:
target_filename = filename
else:
target_filename = "CONTRIBUTING.md"
#target_filename = "CONTRIBUTING.md"
if "CONTRIBUTING" in filename:
target_filename = filename
if target_filename == "":
return "NoFile"
try:
targetfile = commit.tree / target_filename
except KeyError:
# why would a file be in the directory but not in the commit tree? (presumably the directory listing reflects the cloned checkout rather than commit_hash -- confirm)
return "KeyError"
if is_readme:
last_path = "readme"
else:
@ -45,6 +52,7 @@ def get_file(vcs_link, commit_hash, is_readme):
file.write(f.read().decode('utf-8'))
file.close()
shutil.rmtree(full_temp_path, ignore_errors=True)
return "NoError"
def for_all_files():
#toggle this based on readme or contributing files
@ -52,6 +60,7 @@ def for_all_files():
csv_path = "kk_031624_pr_did.csv"
index = -1
with open(csv_path, 'r') as file:
with open('031824_spec_errors.csv', "w") as writing_file:
#csv_reader = csv.DictReader(file)
lines = [line for line in file]
for row in tqdm(csv.reader(lines), total=len(lines)):
@ -61,7 +70,9 @@ def for_all_files():
if row[0] == "":
continue
#print(row[0])
get_file(row[0], row[2], readme_is)
return_value = get_file(row[0], row[2], readme_is)
if return_value != "NoError":
writing_file.write(row[0], row[2], readme_is, return_value)
#get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True)
if __name__ == "__main__":

View File

@ -34,7 +34,7 @@ def file_get_pr(upstream_vcs_link, me_read):
else:
full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git"
print(upstream_vcs_link)
if upstream_vcs_link == "https://gitlab.com/ubports/core":
if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib":
shutil.rmtree(full_temp_path, ignore_errors=True)
return {}
repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path)
@ -145,11 +145,12 @@ def pr_count(start, end, commits, author_roster, commit_roster):
return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster]
def for_files():
#csv_path = "final_data/deb_contribfile_roster.csv"
csv_path = "final_data/deb_readme_roster.csv"
count = 0
with open(csv_path, 'r') as file:
csv_reader = csv.DictReader(file)
with open('kk_test_031624_pr_did.csv', "w") as writing_file:
with open('kk_031624_pr_did.csv', "w") as writing_file:
# this would also have to get switched for the cont dataset
keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new']
dict_writer = csv.DictWriter(writing_file, keys)
@ -158,7 +159,10 @@ def for_files():
count += 1
print(row['upstream_vcs_link'])
# this would have to get switched to false for the cont dataset
try:
dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True)
except:
dict_row = {}
dict_writer.writerow(dict_row)