diff --git a/get_spec_file.py b/get_spec_file.py index e68ffe5..23c3dcc 100644 --- a/get_spec_file.py +++ b/get_spec_file.py @@ -26,16 +26,23 @@ def get_file(vcs_link, commit_hash, is_readme): repo = Repo.clone_from(vcs_link, full_temp_path) commit = repo.commit(commit_hash) #getting the name of the file from the root directory + target_filename = "" for filename in os.listdir(full_temp_path): if is_readme: - target_filename = "README.md" + #target_filename = "README.md" if "README" in filename: target_filename = filename else: - target_filename = "CONTRIBUTING.md" + #target_filename = "CONTRIBUTING.md" if "CONTRIBUTING" in filename: target_filename = filename - targetfile = commit.tree / target_filename + if target_filename == "": + return "NoFile" + try: + targetfile = commit.tree / target_filename + except KeyError: + # why would a file not be in the commit tree? but would be in the directory? + return "KeyError" if is_readme: last_path = "readme" else: @@ -45,6 +52,7 @@ def get_file(vcs_link, commit_hash, is_readme): file.write(f.read().decode('utf-8')) file.close() shutil.rmtree(full_temp_path, ignore_errors=True) + return "NoError" def for_all_files(): #toggle this based on readme or contributing files @@ -52,17 +60,20 @@ def for_all_files(): csv_path = "kk_031624_pr_did.csv" index = -1 with open(csv_path, 'r') as file: - #csv_reader = csv.DictReader(file) - lines = [line for line in file] - for row in tqdm(csv.reader(lines), total=len(lines)): - index += 1 - if index == 0: - continue - if row[0] == "": - continue - #print(row[0]) - get_file(row[0], row[2], readme_is) - #get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True) + with open('031824_spec_errors.csv', "w") as writing_file: + #csv_reader = csv.DictReader(file) + lines = [line for line in file] + for row in tqdm(csv.reader(lines), total=len(lines)): + index += 1 + if index == 0: + continue + if row[0] == "": + continue + #print(row[0]) + return_value = get_file(row[0], row[2], readme_is) + if return_value != "NoError": + writing_file.write(row[0], row[2], readme_is, return_value) + #get_file('https://github.com/tqdm/tqdm', 'fbe7952cce11e8073378b063bdae7ab277a96eb8', True) if __name__ == "__main__": for_all_files() diff --git a/pr_data_get.py b/pr_data_get.py index 65da5f1..0969355 100644 --- a/pr_data_get.py +++ b/pr_data_get.py @@ -34,7 +34,7 @@ def file_get_pr(upstream_vcs_link, me_read): else: full_temp_path = temp_dir + upstream_vcs_link.split('/')[- 1] + ".git" print(upstream_vcs_link) - if upstream_vcs_link == "https://gitlab.com/ubports/core": + if upstream_vcs_link == "https://gitlab.com/ubports/core" or upstream_vcs_link == "https://gitlab.freedesktop.org/xorg/lib": shutil.rmtree(full_temp_path, ignore_errors=True) return {} repo = Git(uri=upstream_vcs_link, gitpath=full_temp_path) @@ -145,11 +145,12 @@ def pr_count(start, end, commits, author_roster, commit_roster): return [by_week, by_week_merge, new_authors, new_committers, author_roster, commit_roster] def for_files(): + #csv_path = "final_data/deb_contribfile_roster.csv" csv_path = "final_data/deb_readme_roster.csv" count = 0 with open(csv_path, 'r') as file: csv_reader = csv.DictReader(file) - with open('kk_test_031624_pr_did.csv', "w") as writing_file: + with open('kk_031624_pr_did.csv', "w") as writing_file: # this would also have to get switched fro the cont dataset keys = ['upstream_vcs_link', "first_readme", "readme_commit_hash", "before_allcom_read", "before_mrg_read", "after_allcom_read", "after_mrg_read", 'before_auth_new', 'after_commit_new', 'after_auth_new', 'before_commit_new'] dict_writer = csv.DictWriter(writing_file, keys) @@ -158,7 +159,10 @@ def for_files(): count += 1 print(row['upstream_vcs_link']) # this would have to get switched to false for the cont dataset - dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) + try: + dict_row = file_get_pr(row['upstream_vcs_link'].strip(), True) + except: + dict_row = {} dict_writer.writerow(dict_row)