2025-01-31 22:50:18 +00:00
import pandas as pd
import os
'''
# Load the CSV file into a DataFrame
2025-02-02 07:26:52 +00:00
csv_file = ' 020125_README_manifest.csv ' # Replace with your CSV file path
2025-01-31 22:50:18 +00:00
column_name = ' new_filepath ' # Replace with your column name containing file paths
# Read the CSV file
df = pd . read_csv ( csv_file )
# Function to check if a file exists
def check_file_exists ( file_path ) :
return os . path . exists ( file_path )
# Apply the function to the DataFrame column
df [ ' file_exists ' ] = df [ column_name ] . apply ( check_file_exists )
missing_files_df = df [ df [ ' file_exists ' ] == False ]
# Print the DataFrame with missing file paths
print ( " Files that do not exist: " )
print ( missing_files_df )
'''
2025-02-02 07:26:52 +00:00
#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits' # Replace with your source directory path
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme' # Replace with your target directory path
2025-01-31 22:50:18 +00:00
2025-02-02 07:26:52 +00:00
# Build a manifest that links every commits file in `source_directory` to the
# first-version file in `target_directory` whose name contains the same repo id.
#
# NOTE(review): the path, column-name and output-filename literals below carry
# leading/trailing spaces. They are reproduced exactly as found; this looks like
# an extraction artifact, so confirm against the intended values before running.
source_directory = ' /data/users/mgaughan/kkex/012825_cam_revision_main/main_commit_data/readme '  # Replace with your source directory path
target_directory = ' /data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/readme '  # Replace with your target directory path

# Number of trailing characters removed from a commits filename to obtain the
# repo id (the fixed-length suffix shared by every file in the source directory).
COMMITS_SUFFIX_LENGTH = 12


def repo_id_from_filename(file_name):
    """Return the repo id encoded in a commits filename.

    The last ``COMMITS_SUFFIX_LENGTH`` characters are dropped; when the name
    starts with the underscore prefix, its first character is dropped as well.
    Names too short to hold anything but the suffix yield an empty string.
    """
    if file_name.startswith(' _ '):
        return file_name[1:-COMMITS_SUFFIX_LENGTH]
    return file_name[:-COMMITS_SUFFIX_LENGTH]


def pair_files(source_files, target_files):
    """Pair each source file with the first target file containing its repo id.

    Args:
        source_files: iterable of commits filenames.
        target_files: sequence of first-version filenames to search.

    Returns:
        A ``(manifest, no_pair)`` tuple. ``manifest`` is a list of
        ``[repo_id, source_file, target_file]`` rows; ``no_pair`` lists the
        source files for which no target file was found.
    """
    manifest = []
    no_pair = []
    for source_file in source_files:
        repo_id = repo_id_from_filename(source_file)
        # An empty id is a substring of every name, so it would "match" the
        # first target file regardless of content; treat it as unpaired.
        pair_file = None
        if repo_id:
            # Matching is by substring, and the first hit wins.
            pair_file = next(
                (name for name in target_files if repo_id in name), None
            )
        if pair_file is None:
            no_pair.append(source_file)
        else:
            manifest.append([repo_id, source_file, pair_file])
    return manifest, no_pair


if __name__ == "__main__":
    # List all files in the source and target directories.
    # os.listdir returns entries in arbitrary order, so when several target
    # names contain the same repo id the chosen pair depends on that order.
    source_files = os.listdir(source_directory)
    target_files = os.listdir(target_directory)

    manifest, no_pair = pair_files(source_files, target_files)

    # Source files without any matching first-version file.
    print(no_pair)

    manifest_df = pd.DataFrame(
        manifest,
        columns=[' repo_id ', ' commits_filepath ', ' fvf_filepath '],
    )
    print(len(manifest_df))
    manifest_df.to_csv(" all_fvf_README_manifest-link.csv ", index=False)
#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv
2025-01-31 22:50:18 +00:00
#source_file_path = os.path.join(source_directory, file_name)
#target_file_path = os.path.join(target_directory, file_name)
#if os.path.isfile(source_file_path):
# if os.path.exists(target_file_path):
# print(f"File '{file_name}' exists in both directories.")
# else:
# print(f"File '{file_name}' does not exist in the target directory.")
'''
# Load the CSV file into a DataFrame
2025-02-02 07:26:52 +00:00
csv_file = ' /home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv ' # Replace with your CSV file path
2025-01-31 22:50:18 +00:00
column_name = ' fvf_filepath ' # Replace with your column name containing filenames
# Read the CSV file
df = pd . read_csv ( csv_file )
# List all filenames in the specified directory
2025-02-02 07:26:52 +00:00
directory_path = ' /data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme ' # Replace with your directory path
2025-01-31 22:50:18 +00:00
directory_files = os . listdir ( directory_path )
# Take the filenames column of the DataFrame (kept as a pandas Series; no set conversion happens here)
filenames_in_csv = df [ column_name ]
duplicates = df . duplicated ( keep = False )
duplicate_rows = df [ duplicates ]
print ( " \n Duplicate rows in the DataFrame: " )
print ( duplicate_rows )
# Check which filenames listed in the CSV column are missing from the directory
2025-02-02 07:26:52 +00:00
missing_files = [ file for file in filenames_in_csv if file not in directory_files ]
2025-01-31 22:50:18 +00:00
# Print the CSV filenames that were not found in the directory
print ( " Filenames in the directory that are not reflected in the CSV column: " )
for file in missing_files :
print ( file )
csv_file = ' /home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/all_013025_contributing_manifest.csv ' # Replace with your CSV file path
column_name = ' fvf_filepath ' # Replace with your column name containing filenames
# Read the CSV file
df = pd . read_csv ( csv_file )
file_directory = ' /data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/validation_contributing '
directory_files = os . listdir ( file_directory )
for file in directory_files :
filtered_df = df [ df [ column_name ] == file ]
if len ( filtered_df ) != 1 :
print ( filtered_df )
break
'''