backup
This commit is contained in:
parent
d113e0991a
commit
cc590c6ecc
@ -2,11 +2,13 @@ import csv
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
# the repo of files is the record of what projects we have the specific files for
|
||||||
|
# use the repo of files to then match back to the DiD data
|
||||||
|
|
||||||
temp_dir = "/data/users/mgaughan/tmp3/"
|
temp_dir = "/data/users/mgaughan/tmp3/"
|
||||||
with open("final_readme_did.csv", "w") as writing_file:
|
with open("final_readme_did.csv", "w") as writing_file:
|
||||||
csv_writer = csv.writer(writing_file)
|
csv_writer = csv.writer(writing_file)
|
||||||
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme")]:
|
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/")]:
|
||||||
file_project = "".join(filename.split("_")[:-1])
|
file_project = "".join(filename.split("_")[:-1])
|
||||||
with open("kk_031624_pr_did.csv", "r") as file1:
|
with open("kk_031624_pr_did.csv", "r") as file1:
|
||||||
reader_obj = csv.reader(file1)
|
reader_obj = csv.reader(file1)
|
||||||
@ -22,6 +24,7 @@ with open("final_readme_did.csv", "w") as writing_file:
|
|||||||
project_name = temp_dir + line[0].split('/')[- 1]
|
project_name = temp_dir + line[0].split('/')[- 1]
|
||||||
if file_project == project_name:
|
if file_project == project_name:
|
||||||
csv_writer.writerow(line)
|
csv_writer.writerow(line)
|
||||||
|
break
|
||||||
|
|
||||||
'''
|
'''
|
||||||
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
|
for filename in [f for f in os.listdir("/data/users/mgaughan/kkex/time_specific_files/readme/readme/")]:
|
||||||
|
@ -12,7 +12,7 @@ import math
|
|||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
|
|
||||||
working_dir = "/data/users/mgaughan/kkex/time_specific_files/contributing"
|
working_dir = "/data/users/mgaughan/kkex/time_specific_files/readme2"
|
||||||
temp_dir = "/data/users/mgaughan/tmp3/"
|
temp_dir = "/data/users/mgaughan/tmp3/"
|
||||||
|
|
||||||
# getting the specific readme or contributing file from a given commit
|
# getting the specific readme or contributing file from a given commit
|
||||||
@ -48,20 +48,7 @@ def get_file(vcs_link, commit_hash, is_readme):
|
|||||||
target_filename = file['file'].split("/")[-1]
|
target_filename = file['file'].split("/")[-1]
|
||||||
else:
|
else:
|
||||||
target_filename = file['file']
|
target_filename = file['file']
|
||||||
#print(commit.tree)
|
|
||||||
#getting the name of the file from the root directory
|
|
||||||
'''
|
|
||||||
target_filename = ""
|
|
||||||
for filename in os.listdir(full_temp_path):
|
|
||||||
if is_readme:
|
|
||||||
#target_filename = "README.md"
|
|
||||||
if "README" in filename or "readme" in filename:
|
|
||||||
target_filename = filename
|
|
||||||
else:
|
|
||||||
#target_filename = "CONTRIBUTING.md"
|
|
||||||
if "CONTRIBUTING" in filename or "contributing" in filename:
|
|
||||||
target_filename = filename
|
|
||||||
'''
|
|
||||||
if target_filename == "":
|
if target_filename == "":
|
||||||
shutil.rmtree(full_temp_path, ignore_errors=True)
|
shutil.rmtree(full_temp_path, ignore_errors=True)
|
||||||
shutil.rmtree(other_temp_path, ignore_errors=True)
|
shutil.rmtree(other_temp_path, ignore_errors=True)
|
||||||
@ -80,9 +67,8 @@ def get_file(vcs_link, commit_hash, is_readme):
|
|||||||
shutil.rmtree(full_temp_path, ignore_errors=True)
|
shutil.rmtree(full_temp_path, ignore_errors=True)
|
||||||
shutil.rmtree(other_temp_path, ignore_errors=True)
|
shutil.rmtree(other_temp_path, ignore_errors=True)
|
||||||
return "KeyError -- the file is not in the commit tree"
|
return "KeyError -- the file is not in the commit tree"
|
||||||
|
|
||||||
if is_readme:
|
if is_readme:
|
||||||
last_path = "readme"
|
last_path = "readme2"
|
||||||
else:
|
else:
|
||||||
last_path = "contributing"
|
last_path = "contributing"
|
||||||
with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file:
|
with open("/data/users/mgaughan/kkex/time_specific_files/" + last_path + "/" + full_temp_path[len(temp_dir):-4] + "_" + targetfile.path , "w") as file:
|
||||||
@ -96,27 +82,31 @@ def get_file(vcs_link, commit_hash, is_readme):
|
|||||||
|
|
||||||
def for_all_files():
|
def for_all_files():
|
||||||
#toggle this based on readme or contributing files
|
#toggle this based on readme or contributing files
|
||||||
readme_is = False
|
readme_is = True
|
||||||
csv_path = "final_data/deb_contrib_did_data.csv"
|
csv_path = "kk_031624_pr_did.csv"
|
||||||
index = -1
|
index = -1
|
||||||
with open(csv_path, 'r') as file:
|
with open(csv_path, 'r') as file:
|
||||||
with open('c_031824_spec_errors.csv', "w") as writing_file:
|
with open('d_031824_spec_errors.csv', "w") as writing_file:
|
||||||
csv_writer = csv.writer(writing_file)
|
csv_writer = csv.writer(writing_file)
|
||||||
#csv_reader = csv.DictReader(file)
|
with open("readme_completed_downloads.csv", "w") as writing_file2:
|
||||||
lines = [line for line in file]
|
csv_writer2 = csv.writer(writing_file2)
|
||||||
for row in tqdm(csv.reader(lines), total=len(lines)):
|
#csv_reader = csv.DictReader(file)
|
||||||
index += 1
|
lines = [line for line in file]
|
||||||
if index == 0:
|
for row in tqdm(csv.reader(lines), total=len(lines)):
|
||||||
continue
|
index += 1
|
||||||
if row[0] == "":
|
if index == 0:
|
||||||
continue
|
continue
|
||||||
#print(row[0])
|
if row[0] == "":
|
||||||
return_value = get_file(row[0], row[2], readme_is)
|
continue
|
||||||
if return_value != "NoError":
|
#print(row[0])
|
||||||
csv_writer.writerow([row[0], row[2], readme_is, return_value])
|
return_value = get_file(row[0], row[2], readme_is)
|
||||||
# if it is noError, just write the row down in a different csv
|
if return_value != "NoError":
|
||||||
# there's an issue of duplicates, but just keep it moving
|
csv_writer.writerow([row[0], row[2], readme_is, return_value])
|
||||||
# if no duplicates -- just run it through
|
else:
|
||||||
|
csv_writer2.writerow(row)
|
||||||
|
# if it is noError, just write the row down in a different csv
|
||||||
|
# there's an issue of duplicates, but just keep it moving
|
||||||
|
# if no duplicates -- just run it through
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
for_all_files()
|
for_all_files()
|
Loading…
Reference in New Issue
Block a user