2025-01-31 22:50:18 +00:00
import pandas as pd
import os
2025-02-06 23:52:01 +00:00
import csv
from tqdm import tqdm
import fnmatch
2025-01-31 22:50:18 +00:00
# Load the manifest CSV whose file-path column is checked for existence below.
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020525_README_manifest.csv'  # Replace with your CSV file path
# Name of the column holding the paths to verify. It must be bound before the
# existence check runs; leaving it commented out made that check fail with a
# NameError.
column_name = 'new_filepath'  # Replace with your column name containing file paths
# Read the CSV file
df = pd.read_csv(csv_file)
# Function to check if a file exists
def check_file_exists(file_path):
    """Return True if ``file_path`` refers to an existing filesystem entry.

    Args:
        file_path: A path (str, bytes or os.PathLike). Anything else, such
            as the NaN pandas uses for an empty CSV cell or ``None``, is
            reported as not existing.

    Returns:
        bool: True when the path exists, False otherwise.
    """
    try:
        return os.path.exists(file_path)
    except TypeError:
        # os.path.exists rejects non-path objects (float NaN, None) with a
        # TypeError; for an existence report those simply count as missing.
        return False
# Apply the function to the DataFrame column: one boolean per row telling
# whether the path stored in ``column_name`` exists.
df['file_exists'] = df[column_name].apply(check_file_exists)
# Keep the rows whose path was not found. The mask is negated instead of
# being compared with ``== False``; ``astype(bool)`` keeps the mask boolean
# even when the frame has no rows.
missing_files_df = df[~df['file_exists'].astype(bool)]
# Print the DataFrame with missing file paths
print("Files that do not exist:")
print(missing_files_df)
2025-02-06 23:52:01 +00:00
2025-02-02 07:26:52 +00:00
#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits' # Replace with your source directory path
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme' # Replace with your target directory path
2025-01-31 22:50:18 +00:00
2025-02-06 05:50:54 +00:00
# Pair every file in the source directory with a file in the target directory
# whose name contains the same repo id, and write the pairs to a manifest CSV.
source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/main_commit_data/readme/'  # Replace with your source directory path
target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme'  # Replace with your target directory path
# List all files in the source and target directories
source_files = os.listdir(source_directory)
target_files = os.listdir(target_directory)
# Iterate through the source files and look for a partner in the target directory
no_pair = []   # source files with no partner
manifest = []  # [repo_id, source file name, target file name]
for file in source_files:
    # The repo id is the file name without its last 12 characters and
    # without a single leading underscore, when there is one.
    repo_id = file[1:-12] if file.startswith('_') else file[:-12]
    if not repo_id:
        # A name too short to carry an id yields '', and '' is a substring of
        # every target name; report the file as unpaired instead of pairing
        # it with whichever target file happens to be listed first.
        no_pair.append(file)
        continue
    # First target file (in listing order) whose name contains the repo id.
    pair_file = next((file_name for file_name in target_files if repo_id in file_name), None)
    if pair_file is None:
        no_pair.append(file)
    else:
        manifest.append([repo_id, file, pair_file])
print(no_pair)
manifest_df = pd.DataFrame(manifest, columns=['repo_id', 'commits_filepath', 'fvf_filepath'])
print(len(manifest_df))
manifest_df.to_csv("0205_all_fvf_README_manifest-link.csv", index=False)
2025-02-02 07:26:52 +00:00
#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv
2025-01-31 22:50:18 +00:00
#source_file_path = os.path.join(source_directory, file_name)
#target_file_path = os.path.join(target_directory, file_name)
#if os.path.isfile(source_file_path):
# if os.path.exists(target_file_path):
# print(f"File '{file_name}' exists in both directories.")
# else:
# print(f"File '{file_name}' does not exist in the target directory.")
2025-02-06 05:50:54 +00:00
2025-01-31 22:50:18 +00:00
# Cross-check a manifest CSV against a directory: report fully duplicated rows
# and every filename listed in the CSV column that is absent from the directory.
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv'  # Replace with your CSV file path
column_name = 'fvf_filepath'  # Replace with your column name containing filenames
# Read the CSV file
df = pd.read_csv(csv_file)
# List all filenames in the specified directory
directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your directory path
directory_files = os.listdir(directory_path)
# Set of the directory's filenames, so each membership test below is O(1)
directory_file_set = set(directory_files)
filenames_in_csv = df[column_name]
# Rows that are exact duplicates of another row (every occurrence is kept)
duplicates = df.duplicated(keep=False)
duplicate_rows = df[duplicates]
print("\nDuplicate rows in the DataFrame:")
print(duplicate_rows)
# Check which filenames from the CSV column are not present in the directory
missing_files = [file for file in filenames_in_csv if file not in directory_file_set]
# Print them; the message now states the direction the check actually runs in
print("Filenames in the CSV column that are not present in the directory:")
for file in missing_files:
    print(file)
2025-02-06 05:50:54 +00:00
# Verify that every file in the lagged-README directory is referenced by
# exactly one row of the manifest; print the offending files and their rows.
csv_file = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/0205_all_fvf_README_manifest-link.csv"  # Replace with your CSV file path
column_name = 'fvf_filepath'  # Replace with your column name containing filenames
# Read the CSV file
df = pd.read_csv(csv_file)
file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/'
directory_files = os.listdir(file_directory)
# Count the rows per filename once instead of scanning the column per file.
rows_per_file = df[column_name].value_counts()
for file in directory_files:
    row_count = int(rows_per_file.get(file, 0))
    if row_count != 1:
        # Name the file: for an unreferenced file the frame below is empty
        # and would not say which file failed the check.
        print(f"{file}: referenced by {row_count} manifest rows")
        filtered_df = df[df[column_name] == file]
        print(filtered_df)
2025-02-06 23:52:01 +00:00
'''
'''
# Build the README manifest. For every data row of the publication-commit CSV,
# look for a file in the lagged-README directory whose name starts with
# "<row number>_<repo_id>". Matched rows go to the manifest CSV (with the file
# name in 'new_filepath'); unmatched rows go to the "missing" CSV.
lagged_readme_dir = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"
manifest_columns = ['commit_hash', 'upstream_vcs_link', 'repo_id', 'project_handle', 'lagged_hash', 'project_index']
with open("../misc_data_files/main/0205_README_publication_commits.csv", 'r') as file:
    lines = list(file)
# The directory is not modified here, so list it once rather than once per
# row. Sorting makes the choice deterministic when several names share a prefix.
lagged_files = sorted(os.listdir(lagged_readme_dir))
new_manifest_list = []  # records with a matching file
missing_list = []       # records without one
for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
    # Row 0 is the header; blank lines parse to an empty row and carry no data.
    if index < 1 or not row:
        continue
    record = {
        'commit_hash': row[0],
        'upstream_vcs_link': row[15],
        'repo_id': row[13],
        'project_handle': row[14],
        'lagged_hash': row[12],
        'project_index': index,
    }
    prefix = f"{index}_{record['repo_id']}"
    # Literal prefix comparison: characters such as '[' or '*' inside a repo
    # id must not be interpreted as wildcards.
    matched_file = next((filename for filename in lagged_files if filename.startswith(prefix)), "")
    if matched_file == "":
        missing_list.append(record)
    else:
        record['new_filepath'] = matched_file
        new_manifest_list.append(record)
# Building the frames from records (not pd.concat) also works when one of the
# lists is empty; the CSV then contains just the header line.
manifest_df = pd.DataFrame(new_manifest_list, columns=manifest_columns + ['new_filepath'])
missing_df = pd.DataFrame(missing_list, columns=manifest_columns)
missing_df.to_csv("020625_README_missing.csv", index=False)
manifest_df.to_csv("020525_README_manifest.csv", index=False)
'''
'''
# Rename files in the lagged-README directory according to the "missing" CSV:
# for each row, the first file whose name starts with "<field 4>_<field 2>"
# has the first occurrence of field 4 in its name replaced by field 5.
with open("/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020625_README_missing.csv", 'r') as file:
    lines = list(file)
dir_path = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"
# NOTE(review): every row is processed, including a header row if the CSV has
# one. The previous guard (`index < 0`) could never fire, so this matches what
# the code already did. Confirm whether the first row should be skipped.
for row in tqdm(csv.reader(lines), total=len(lines)):
    if not row:
        # A blank line parses to an empty row; skip it rather than fail with
        # an IndexError after some files have already been renamed.
        continue
    prefix = f"{row[4]}_{row[2]}"
    # The directory is re-listed for every row on purpose: earlier renames
    # change its contents. The prefix is compared literally, so wildcard
    # characters inside the fields are not interpreted.
    matched_file = next((filename for filename in os.listdir(dir_path) if filename.startswith(prefix)), "")
    if matched_file:
        new_filename = matched_file.replace(row[4], row[5], 1)
        os.rename(os.path.join(dir_path, matched_file), os.path.join(dir_path, new_filename))
'''