"""Ad-hoc consistency checks and manifest builders for the README document files.

Every step below used to be a run of top-level statements that was switched
on or off by wrapping it in a triple-quoted string.  Each step is now a
function with the original hard-coded paths as default arguments, selected by
name on the command line::

    python this_script.py check-manifest link-fvf

Running the script with no arguments only prints the list of steps; nothing
touches the filesystem unless a step is requested explicitly.
"""

import csv
import fnmatch  # no longer used below; kept because code outside this view may rely on it
import os
import sys

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic, so degrade to plain iteration
    def tqdm(iterable, **_kwargs):
        """Fallback used when tqdm is not installed: return *iterable* unchanged."""
        return iterable


# --- Default inputs (the paths the original script hard-coded) ---------------

README_MANIFEST_CSV = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020525_README_manifest.csv'
SECOND_BATCH_MANIFEST_CSV = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv'
LINK_MANIFEST_CSV = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/0205_all_fvf_README_manifest-link.csv"
README_MISSING_CSV = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020625_README_missing.csv"
PUBLICATION_COMMITS_CSV = "../misc_data_files/main/0205_README_publication_commits.csv"

# Alternative inputs that were left commented out in the original script:
#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits'
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'
MAIN_COMMIT_README_DIR = '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/main_commit_data/readme/'
LAGGED_README_DIR = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/'
VALIDATION_README_DIR = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'

# Number of trailing characters stripped from a commit filename to obtain the
# repo id.  Presumably the length of a "_commits.csv" suffix -- TODO confirm.
COMMITS_SUFFIX_LEN = 12

# Column order shared by the manifest and the "missing" CSV.
MANIFEST_COLUMNS = [
    'commit_hash',
    'upstream_vcs_link',
    'repo_id',
    'project_handle',
    'lagged_hash',
    'project_index',
]


def check_file_exists(file_path):
    """Return True if *file_path* exists on disk."""
    return os.path.exists(file_path)


def _first_with_prefix(names, prefix):
    """Return the first entry of *names* that starts with *prefix*, or ""."""
    # A plain prefix test replaces fnmatch(name, prefix + '*') so that glob
    # metacharacters inside a repo id ('[', '*', '?') are matched literally.
    return next((name for name in names if name.startswith(prefix)), "")


def find_missing_manifest_files(csv_file=README_MANIFEST_CSV,
                                column_name='new_filepath',
                                base_dir=None):
    """Print and return the manifest rows whose file does not exist.

    Args:
        csv_file: Manifest CSV to read.
        column_name: Column holding the file paths.  The original script used
            this name while its only assignment was commented out.
        base_dir: Optional directory to join each entry onto before testing.
            With the default ``None`` the entries are tested as given, i.e.
            relative entries are resolved against the current directory.

    Returns:
        DataFrame of the rows whose path is missing, with an added boolean
        ``file_exists`` column.
    """
    df = pd.read_csv(csv_file)
    paths = df[column_name]
    if base_dir is not None:
        paths = paths.map(lambda name: os.path.join(base_dir, name))
    df['file_exists'] = paths.apply(check_file_exists).astype(bool)
    missing_files_df = df[~df['file_exists']]

    print("Files that do not exist:")
    print(missing_files_df)
    return missing_files_df


def link_commits_to_first_versions(source_directory=MAIN_COMMIT_README_DIR,
                                   target_directory=LAGGED_README_DIR,
                                   output_csv="0205_all_fvf_README_manifest-link.csv"):
    """Pair every file in *source_directory* with a file in *target_directory*.

    The repo id is the source filename minus its last ``COMMITS_SUFFIX_LEN``
    characters and minus one leading underscore, if present.  A source file is
    paired with the first target file whose name contains that repo id as a
    substring.  Both listings are sorted so the result does not depend on the
    order in which the OS returns directory entries.

    NOTE(review): substring matching means repo id "foo" also matches a target
    named for "foobar"; confirm that repo ids cannot be prefixes of each other.

    Returns:
        ``(manifest_df, no_pair)``: the pairs written to *output_csv* (columns
        ``repo_id``, ``commits_filepath``, ``fvf_filepath``) and the list of
        source files for which no target was found.
    """
    source_files = sorted(os.listdir(source_directory))
    target_files = sorted(os.listdir(target_directory))

    no_pair = []
    manifest = []
    for source_name in source_files:
        start = 1 if source_name.startswith('_') else 0
        repo_id = source_name[start:-COMMITS_SUFFIX_LEN]
        # An empty repo id is a substring of every name, so it would "pair"
        # with whatever target comes first; treat it as unpaired instead.
        pair_file = None
        if repo_id:
            pair_file = next(
                (name for name in target_files if repo_id in name), None)
        if pair_file is None:
            no_pair.append(source_name)
        else:
            manifest.append([repo_id, source_name, pair_file])

    print(no_pair)
    manifest_df = pd.DataFrame(
        manifest, columns=['repo_id', 'commits_filepath', 'fvf_filepath'])
    print(len(manifest_df))
    manifest_df.to_csv(output_csv, index=False)
    #12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv

    # Earlier exact-filename check, left disabled in the original script:
    #source_file_path = os.path.join(source_directory, file_name)
    #target_file_path = os.path.join(target_directory, file_name)
    #if os.path.isfile(source_file_path):
    #    if os.path.exists(target_file_path):
    #        print(f"File '{file_name}' exists in both directories.")
    #    else:
    #        print(f"File '{file_name}' does not exist in the target directory.")
    return manifest_df, no_pair


def report_duplicates_and_missing(csv_file=SECOND_BATCH_MANIFEST_CSV,
                                  column_name='fvf_filepath',
                                  directory_path=VALIDATION_README_DIR):
    """Print fully duplicated manifest rows and manifest entries absent on disk.

    The second check lists values of *column_name* that are NOT filenames in
    *directory_path*.  The original message described the opposite direction
    (directory files missing from the CSV); the computation is unchanged and
    the message now says what is actually computed.

    Returns:
        ``(duplicate_rows, missing_files)``.
    """
    df = pd.read_csv(csv_file)
    # A set gives O(1) membership tests for the per-row lookup below.
    directory_files = set(os.listdir(directory_path))

    duplicate_rows = df[df.duplicated(keep=False)]
    print("\nDuplicate rows in the DataFrame:")
    print(duplicate_rows)

    missing_files = [
        name for name in df[column_name] if name not in directory_files]
    print("Filenames in the CSV column that are not present in the directory:")
    for name in missing_files:
        print(name)
    return duplicate_rows, missing_files


def report_files_without_unique_row(csv_file=LINK_MANIFEST_CSV,
                                    column_name='fvf_filepath',
                                    file_directory=LAGGED_README_DIR):
    """Print every file in *file_directory* that does not have exactly one row.

    The filename and its row count are printed before the matching rows,
    because a zero-match file would otherwise show up as an anonymous empty
    DataFrame.

    Returns:
        Dict mapping each offending filename to its number of matching rows.
    """
    df = pd.read_csv(csv_file)
    offenders = {}
    for name in sorted(os.listdir(file_directory)):
        rows = df[df[column_name] == name]
        if len(rows) != 1:
            print(f"{name}: {len(rows)} matching rows")
            print(rows)
            offenders[name] = len(rows)
    return offenders


def build_readme_manifest(commits_csv=PUBLICATION_COMMITS_CSV,
                          lagged_dir=LAGGED_README_DIR,
                          manifest_out="020525_README_manifest.csv",
                          missing_out="020625_README_missing.csv"):
    """Match each publication-commit row to its file in *lagged_dir*.

    Row ``i`` of *commits_csv* (1-based, header excluded) is matched to the
    first file, in sorted order, whose name starts with ``"{i}_{repo_id}"``.
    Columns are read by position: 0 commit hash, 12 lagged hash, 13 repo id,
    14 project handle, 15 upstream VCS link.

    Matched rows are written to *manifest_out* with an extra ``new_filepath``
    column; unmatched rows are written to *missing_out*.  Both files are
    written even when one of the two groups is empty.

    Returns:
        ``(manifest_df, missing_df)``.
    """
    # newline='' is what the csv module expects from the files it parses.
    with open(commits_csv, 'r', newline='') as handle:
        lines = handle.readlines()

    # The directory is not modified here, so one listing serves every row.
    lagged_files = sorted(os.listdir(lagged_dir))

    matched = []
    missing = []
    for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
        # Index 0 is the header.  Blank lines are skipped but still counted,
        # so later rows keep the index they had before.
        if index < 1 or not row:
            continue
        record = {
            'commit_hash': row[0],
            'upstream_vcs_link': row[15],
            'repo_id': row[13],
            'project_handle': row[14],
            'lagged_hash': row[12],
            'project_index': index,
        }
        matched_file = _first_with_prefix(
            lagged_files, f"{index}_{record['repo_id']}")
        if matched_file:
            record['new_filepath'] = matched_file
            matched.append(record)
        else:
            missing.append(record)

    # Building from record lists (rather than pd.concat of one-row frames)
    # also works when a list is empty; pd.concat([]) raises ValueError.
    manifest_df = pd.DataFrame(
        matched, columns=MANIFEST_COLUMNS + ['new_filepath'])
    missing_df = pd.DataFrame(missing, columns=MANIFEST_COLUMNS)
    missing_df.to_csv(missing_out, index=False)
    manifest_df.to_csv(manifest_out, index=False)
    return manifest_df, missing_df


def rename_missing_lagged_files(missing_csv=README_MISSING_CSV,
                                dir_path=LAGGED_README_DIR,
                                has_header=True,
                                dry_run=False):
    """Rename ``{lagged_hash}_{repo_id}*`` files to ``{project_index}_{repo_id}*``.

    Columns are read by position from *missing_csv*: 2 repo id, 4 lagged hash,
    5 project index (the column order written by ``build_readme_manifest``).

    Args:
        missing_csv: CSV listing the rows whose file was not found by index.
        dir_path: Directory containing the files to rename.
        has_header: Skip the first CSV row.  The original guard
            (``index < 0``) could never fire, so the header line was treated
            as data.
        dry_run: Report what would be renamed without touching the files.

    Returns:
        List of ``(old_name, new_name)`` pairs that were (or, with
        *dry_run*, would be) renamed.
    """
    with open(missing_csv, 'r', newline='') as handle:
        lines = handle.readlines()

    renamed = []
    for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
        if (has_header and index == 0) or not row:
            continue
        repo_id, lagged_hash, project_index = row[2], row[4], row[5]

        # Re-list for every row: earlier renames change the directory.
        matched_file = _first_with_prefix(
            sorted(os.listdir(dir_path)), f"{lagged_hash}_{repo_id}")
        if not matched_file:
            continue

        # Only the first occurrence is replaced, which is the leading hash.
        # An empty hash makes this insert the index at the front of the name.
        new_filename = matched_file.replace(lagged_hash, project_index, 1)
        destination = os.path.join(dir_path, new_filename)
        if os.path.exists(destination):
            # os.rename silently replaces an existing file on POSIX.
            print(f"Skipping '{matched_file}': '{new_filename}' already exists.")
            continue
        if not dry_run:
            os.rename(os.path.join(dir_path, matched_file), destination)
        renamed.append((matched_file, new_filename))
    return renamed


# Step name -> function, in the order the steps appeared in the original file.
STEPS = {
    'check-manifest': find_missing_manifest_files,
    'link-fvf': link_commits_to_first_versions,
    'check-second-batch': report_duplicates_and_missing,
    'check-link-manifest': report_files_without_unique_row,
    'build-manifest': build_readme_manifest,
    'rename-missing': rename_missing_lagged_files,
}


def main(argv=None):
    """Run the named steps with their default paths.

    Args:
        argv: Step names; defaults to ``sys.argv[1:]``.

    Returns:
        0 on success (or when no step was requested and only the usage text
        was printed), 2 if an unknown step name was given.  No step runs
        unless every requested name is valid.
    """
    names = sys.argv[1:] if argv is None else list(argv)
    unknown = [name for name in names if name not in STEPS]
    if unknown or not names:
        if unknown:
            print(f"Unknown step(s): {', '.join(unknown)}")
        print("Usage: python <script> STEP [STEP ...]")
        print("Available steps:")
        for name in STEPS:
            print(f"  {name}")
        return 2 if unknown else 0

    for name in names:
        STEPS[name]()
    return 0


if __name__ == "__main__":
    sys.exit(main())