104 lines
3.9 KiB
Python
104 lines
3.9 KiB
Python
|
import pandas as pd
|
||
|
import os
|
||
|
# Disabled snippet: everything between the triple quotes below is a bare
# string expression, so none of it runs.  When it was live it read
# '013025_README_manifest.csv', tested every value of the 'new_filepath'
# column with os.path.exists, stored the result in a 'file_exists' column and
# printed the rows whose file was missing.
'''
|
||
|
# Load the CSV file into a DataFrame
|
||
|
csv_file = '013025_README_manifest.csv' # Replace with your CSV file path
|
||
|
column_name = 'new_filepath' # Replace with your column name containing file paths
|
||
|
|
||
|
# Read the CSV file
|
||
|
df = pd.read_csv(csv_file)
|
||
|
|
||
|
# Function to check if a file exists
|
||
|
def check_file_exists(file_path):
|
||
|
return os.path.exists(file_path)
|
||
|
|
||
|
# Apply the function to the DataFrame column
|
||
|
df['file_exists'] = df[column_name].apply(check_file_exists)
|
||
|
|
||
|
missing_files_df = df[df['file_exists'] == False]
|
||
|
|
||
|
# Print the DataFrame with missing file paths
|
||
|
print("Files that do not exist:")
|
||
|
print(missing_files_df)
|
||
|
'''
|
||
|
|
||
|
# Directory whose files are paired (source) and directory searched for each
# file's partner (target).
source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/main_commit_data/contributing'  # Replace with your source directory path
target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/validation_contributing'  # Replace with your target directory path

# Number of trailing characters stripped from a source filename to obtain its
# repo id.
# NOTE(review): presumably the length of a fixed suffix such as "_commits.csv"
# (12 characters) -- confirm against the real filenames.
REPO_ID_SUFFIX_LENGTH = 12


def repo_id_from_filename(file):
    """Return the repo id embedded in a source filename.

    The id is the filename without its last ``REPO_ID_SUFFIX_LENGTH``
    characters and without a single leading underscore, if one is present.

    Args:
        file: Bare filename from the source directory.

    Returns:
        The repo id, or ``""`` when the name is too short to contain one.
    """
    start = 1 if file.startswith('_') else 0
    return file[start:-REPO_ID_SUFFIX_LENGTH]


def pair_files(source_files, target_files):
    """Pair every source file with the first target file containing its repo id.

    Both listings are processed in sorted order so that the chosen partner and
    the order of the result do not depend on the arbitrary order in which
    ``os.listdir`` returns names.

    Args:
        source_files: Iterable of filenames from the source directory.
        target_files: Iterable of filenames from the target directory.

    Returns:
        A ``(manifest, no_pair)`` tuple. ``manifest`` is a list of
        ``[repo_id, source_file, target_file]`` rows; ``no_pair`` lists the
        source files for which no partner was found.
    """
    manifest = []
    no_pair = []
    ordered_targets = sorted(target_files)

    for file in sorted(source_files):
        repo_id = repo_id_from_filename(file)

        pair_file = None
        # An empty id is a substring of every name, so it would "match" the
        # first target file; treat such files as unpaired instead.
        if repo_id:
            # NOTE(review): this is a substring test, so an id that is a
            # prefix/infix of another id (e.g. "foo" vs "foobar") can match
            # the wrong file -- tighten once the target naming scheme is known.
            pair_file = next(
                (name for name in ordered_targets if repo_id in name),
                None,
            )

        if pair_file is None:
            no_pair.append(file)
        else:
            manifest.append([repo_id, file, pair_file])

    return manifest, no_pair


if __name__ == "__main__":
    # List all files in the source and target directories
    source_files = os.listdir(source_directory)
    target_files = os.listdir(target_directory)

    # Match each source file to a target file and collect the leftovers
    manifest, no_pair = pair_files(source_files, target_files)

    print(no_pair)
    manifest_df = pd.DataFrame(manifest, columns=['repo_id', 'commits_filepath', 'fvf_filepath'])
    manifest_df.to_csv("all_013025_contributing_manifest.csv", index=False)
|
||
|
#source_file_path = os.path.join(source_directory, file_name)
|
||
|
#target_file_path = os.path.join(target_directory, file_name)
|
||
|
|
||
|
#if os.path.isfile(source_file_path):
|
||
|
# if os.path.exists(target_file_path):
|
||
|
# print(f"File '{file_name}' exists in both directories.")
|
||
|
# else:
|
||
|
# print(f"File '{file_name}' does not exist in the target directory.")
|
||
|
|
||
|
# Disabled snippet: everything between the triple quotes below is a bare
# string expression, so none of it runs.  It holds two checks against
# all_013025_contributing_manifest.csv:
#   1. print every fully duplicated row (df.duplicated(keep=False)), then
#      print each entry of `missing_files`;
#   2. for each name in the validation_contributing directory, select the rows
#      whose 'fvf_filepath' equals that name and print the selection when it
#      does not contain exactly one row, followed by a `break`
#      (indentation is lost in this view, so the exact nesting of the `break`
#      cannot be confirmed).
# NOTE(review): the only assignment to `missing_files` inside the snippet is
# itself commented out, so re-enabling the snippet as-is would raise NameError
# at `for file in missing_files`.  That commented-out comprehension also
# collects CSV entries absent from the directory, which is the reverse of what
# the printed message says, and `filenames_in_csv` is never converted to a set
# despite the comment above it.
'''
|
||
|
# Load the CSV file into a DataFrame
|
||
|
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/all_013025_contributing_manifest.csv' # Replace with your CSV file path
|
||
|
column_name = 'fvf_filepath' # Replace with your column name containing filenames
|
||
|
|
||
|
# Read the CSV file
|
||
|
df = pd.read_csv(csv_file)
|
||
|
|
||
|
# List all filenames in the specified directory
|
||
|
directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/validation_contributing' # Replace with your directory path
|
||
|
directory_files = os.listdir(directory_path)
|
||
|
|
||
|
# Convert the filenames column of the DataFrame to a set for efficient lookup
|
||
|
filenames_in_csv = df[column_name]
|
||
|
duplicates = df.duplicated(keep=False)
|
||
|
duplicate_rows = df[duplicates]
|
||
|
print("\nDuplicate rows in the DataFrame:")
|
||
|
print(duplicate_rows)
|
||
|
# Check which filenames in the directory are not reflected in the column
|
||
|
#missing_files = [file for file in filenames_in_csv if file not in directory_files]
|
||
|
|
||
|
|
||
|
# Print the filenames that are not reflected in the column
|
||
|
print("Filenames in the directory that are not reflected in the CSV column:")
|
||
|
for file in missing_files:
|
||
|
print(file)
|
||
|
|
||
|
|
||
|
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/all_013025_contributing_manifest.csv' # Replace with your CSV file path
|
||
|
column_name = 'fvf_filepath' # Replace with your column name containing filenames
|
||
|
# Read the CSV file
|
||
|
df = pd.read_csv(csv_file)
|
||
|
|
||
|
file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/first_version_documents/validation_contributing'
|
||
|
directory_files = os.listdir(file_directory)
|
||
|
for file in directory_files:
|
||
|
filtered_df = df[df[column_name] == file]
|
||
|
if len(filtered_df) != 1:
|
||
|
print(filtered_df)
|
||
|
break
|
||
|
'''
|