"""Link README commit-history files to their first-version files (FVF).

Lists the files in ``source_directory`` and ``target_directory``, derives a
repo id from every source file name, looks for a target file whose name
contains that id, and writes the resulting pairs to a CSV manifest.
"""
import os

import pandas as pd

# NOTE: the triple-quoted block below is disabled scratch code (a bare string
# expression that is never executed), kept for reference.
'''
# Load the CSV file into a DataFrame
csv_file = '020125_README_manifest.csv'  # Replace with your CSV file path
column_name = 'new_filepath'  # Replace with your column name containing file paths

# Read the CSV file
df = pd.read_csv(csv_file)

# Function to check if a file exists
def check_file_exists(file_path):
    return os.path.exists(file_path)

# Apply the function to the DataFrame column
df['file_exists'] = df[column_name].apply(check_file_exists)
missing_files_df = df[df['file_exists'] == False]

# Print the DataFrame with missing file paths
print("Files that do not exist:")
print(missing_files_df)
'''

#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits'  # Replace with your source directory path
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your target directory path
source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/main_commit_data/readme'  # Replace with your source directory path
target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/readme'  # Replace with your target directory path

# Number of trailing characters dropped from a source file name to obtain the
# repo id.
# NOTE(review): presumably the length of a suffix such as "_commits.csv" --
# confirm against the real file names.
REPO_ID_SUFFIX_LENGTH = 12

# File the manifest is written to (relative to the current working directory).
OUTPUT_CSV = "all_fvf_README_manifest-link.csv"

# Column order of the manifest rows produced by ``pair_files``.
MANIFEST_COLUMNS = ['repo_id', 'commits_filepath', 'fvf_filepath']


def repo_id_from_commit_file(file_name):
    """Return the repo id embedded in a source file name.

    One leading underscore, if present, is dropped, and so are the last
    ``REPO_ID_SUFFIX_LENGTH`` characters.  Names that are too short yield an
    empty string.
    """
    stem = file_name[1:] if file_name.startswith('_') else file_name
    return stem[:-REPO_ID_SUFFIX_LENGTH]


def pair_files(source_files, target_files):
    """Pair every source file with the first target whose name contains its id.

    Args:
        source_files: iterable of source (commit) file names.
        target_files: sequence of target (first-version) file names; it is
            scanned in the given order and the first match wins.

    Returns:
        A ``(manifest, no_pair)`` tuple.  ``manifest`` is a list of
        ``[repo_id, source_file, target_file]`` rows; ``no_pair`` lists the
        source files for which no target matched.
    """
    manifest = []
    no_pair = []
    for commit_file in source_files:
        repo_id = repo_id_from_commit_file(commit_file)
        pair_file = None
        # An empty id is a substring of every name, so matching on it would
        # pair the file with an arbitrary target; report it as unpaired.
        if repo_id:
            # NOTE(review): substring matching also accepts a target whose id
            # merely starts with this one (e.g. "foo" vs "foobar_...");
            # tighten once the target naming scheme is confirmed.
            pair_file = next(
                (candidate for candidate in target_files if repo_id in candidate),
                None,
            )
        if pair_file is None:
            no_pair.append(commit_file)
        else:
            manifest.append([repo_id, commit_file, pair_file])
    return manifest, no_pair


def main(source_dir=source_directory, target_dir=target_directory,
         output_csv=OUTPUT_CSV):
    """Build the manifest for two directories and write it to ``output_csv``.

    Prints the list of unpaired source files and the number of manifest rows.

    Args:
        source_dir: directory holding the commit files.
        target_dir: directory holding the first-version files.
        output_csv: path of the CSV file to write (written without an index).

    Returns:
        The manifest as a ``pandas.DataFrame``.

    Raises:
        OSError: if either directory cannot be listed.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps both the
    # chosen pair and the row order identical from run to run.
    source_files = sorted(os.listdir(source_dir))
    target_files = sorted(os.listdir(target_dir))

    manifest, no_pair = pair_files(source_files, target_files)

    print(no_pair)
    manifest_df = pd.DataFrame(manifest, columns=MANIFEST_COLUMNS)
    print(len(manifest_df))
    manifest_df.to_csv(output_csv, index=False)
    return manifest_df


if __name__ == "__main__":
    main()

#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv
    #source_file_path = os.path.join(source_directory, file_name)
    #target_file_path = os.path.join(target_directory, file_name)
    #if os.path.isfile(source_file_path):
    #    if os.path.exists(target_file_path):
    #        print(f"File '{file_name}' exists in both directories.")
    #    else:
    #        print(f"File '{file_name}' does not exist in the target directory.")

# NOTE: the triple-quoted block below is disabled scratch code (a bare string
# expression that is never executed), kept for reference.
'''
# Load the CSV file into a DataFrame
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv'  # Replace with your CSV file path
column_name = 'fvf_filepath'  # Replace with your column name containing filenames

# Read the CSV file
df = pd.read_csv(csv_file)

# List all filenames in the specified directory
directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your directory path
directory_files = os.listdir(directory_path)

# Convert the filenames column of the DataFrame to a set for efficient lookup
filenames_in_csv = df[column_name]

duplicates = df.duplicated(keep=False)
duplicate_rows = df[duplicates]

print("\nDuplicate rows in the DataFrame:")
print(duplicate_rows)
# Check which filenames in the directory are not reflected in the column
missing_files = [file for file in filenames_in_csv if file not in directory_files]

# Print the filenames that are not reflected in the column
print("Filenames in the directory that are not reflected in the CSV column:")
for file in missing_files:
    print(file)

csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/all_013025_contributing_manifest.csv'  # Replace with your CSV file path
column_name = 'fvf_filepath'  # Replace with your column name containing filenames

# Read the CSV file
df = pd.read_csv(csv_file)
file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/validation_contributing'
directory_files = os.listdir(file_directory)
for file in directory_files:
    filtered_df = df[df[column_name] == file]
    if len(filtered_df) != 1:
        print(filtered_df)
        break
'''