24_deb_pkg_gov/12825_revision/misc_data_files/validation.py

import pandas as pd
import os
import csv
from tqdm import tqdm
import fnmatch
# Load the README manifest into a DataFrame
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020525_README_manifest.csv'  # Replace with your CSV file path
column_name = 'new_filepath'  # Column containing the file paths to validate
df = pd.read_csv(csv_file)

# Check whether each listed file exists on disk
def check_file_exists(file_path):
    return os.path.exists(file_path)

df['file_exists'] = df[column_name].apply(check_file_exists)
missing_files_df = df[~df['file_exists']]

# Print the rows whose file paths do not exist
print("Files that do not exist:")
print(missing_files_df)
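# Hedged sketch: a count makes the result of the existence check easier to scan,
# and the missing rows could be persisted for follow-up. The output filename
# '020525_README_missing_paths.csv' is hypothetical, not a file used elsewhere here.
print(f"{len(missing_files_df)} of {len(df)} manifest rows point to missing files")
# missing_files_df.to_csv('020525_README_missing_paths.csv', index=False)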
#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits'
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'
source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/main_commit_data/readme/'  # Replace with your source directory path
target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme'  # Replace with your target directory path

# List all files in the source and target directories
source_files = os.listdir(source_directory)
target_files = os.listdir(target_directory)

# For each commit file, look for a matching first-version file (fvf) in the target directory
no_pair = []
manifest = []
for file in source_files:
    # Strip the leading underscore (if any) and the 12-character suffix to recover the repo_id
    if file.startswith('_'):
        repo_id = file[1:-12]
    else:
        repo_id = file[:-12]
    has_fvf = False
    pair_file = ""
    for file_name in target_files:
        if repo_id in file_name:
            has_fvf = True
            pair_file = file_name
            break
    if not has_fvf:
        no_pair.append(file)
    else:
        manifest.append([repo_id, file, pair_file])

print(no_pair)
manifest_df = pd.DataFrame(manifest, columns=['repo_id', 'commits_filepath', 'fvf_filepath'])
print(len(manifest_df))
manifest_df.to_csv("0205_all_fvf_README_manifest-link.csv", index=False)
#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv
#source_file_path = os.path.join(source_directory, file_name)
#target_file_path = os.path.join(target_directory, file_name)
#if os.path.isfile(source_file_path):
# if os.path.exists(target_file_path):
# print(f"File '{file_name}' exists in both directories.")
# else:
# print(f"File '{file_name}' does not exist in the target directory.")
# Load the second-batch README manifest into a DataFrame
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv'  # Replace with your CSV file path
column_name = 'fvf_filepath'  # Column containing the first-version filenames
df = pd.read_csv(csv_file)

# List all filenames in the validation directory
directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your directory path
directory_files = set(os.listdir(directory_path))  # set for efficient membership checks

# Report fully duplicated manifest rows
filenames_in_csv = df[column_name]
duplicates = df.duplicated(keep=False)
duplicate_rows = df[duplicates]
print("\nDuplicate rows in the DataFrame:")
print(duplicate_rows)

# Check which filenames in the CSV column are not present in the directory
missing_files = [file for file in filenames_in_csv if file not in directory_files]
print("Filenames in the CSV column that are not present in the directory:")
for file in missing_files:
    print(file)
csv_file = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/0205_all_fvf_README_manifest-link.csv" # Replace with your CSV file path
column_name = 'fvf_filepath' # Replace with your column name containing filenames
# Read the CSV file
df = pd.read_csv(csv_file)
file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/'
directory_files = os.listdir(file_directory)
for file in directory_files:
filtered_df = df[df[column_name] == file]
if len(filtered_df) != 1:
print(filtered_df)
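# Hedged sketch: the loop above asks whether each on-disk file appears exactly once
# in the manifest; the reverse direction (manifest rows whose fvf_filepath is not on
# disk) can be checked the same way. Uses only names already defined above.
on_disk = set(directory_files)
absent = df[~df[column_name].isin(on_disk)]
print(f"{len(absent)} manifest rows reference files missing from {file_directory}")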
# Disabled block (kept for reference): builds 020525_README_manifest.csv by matching
# publication-commit rows to their lagged README files on disk.
'''
with open("../misc_data_files/main/0205_README_publication_commits.csv", 'r') as file:
csv_reader = csv.DictReader(file)
lines = [line for line in file]
new_manifest_list = []
missing_list = []
index = -1
for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1
if index < 1:
continue
manifest_df = pd.DataFrame({
'commit_hash': [row[0]],
'upstream_vcs_link': [row[15]],
'repo_id': [row[13]],
'project_handle': [row[14]],
'lagged_hash': [row[12]],
'project_index': index
})
prefix = f"{index}_{manifest_df['repo_id'][0]}"
matched_file = ""
for filename in os.listdir("/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"):
if fnmatch.fnmatch(filename, f'{prefix}*'):
matched_file = filename
if matched_file == "":
missing_list.append(manifest_df)
else:
manifest_df['new_filepath'] = matched_file
new_manifest_list.append(manifest_df)
manifest_df = pd.concat(new_manifest_list, ignore_index=True)
missing_df = pd.concat(missing_list, ignore_index=True)
missing_df.to_csv(f"020625_README_missing.csv", index=False)
manifest_df.to_csv(f"020525_README_manifest.csv", index=False)
'''
# Disabled block (kept for reference): renames lagged README files matched against
# the rows recorded in 020625_README_missing.csv.
'''
with open("/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020625_README_missing.csv", 'r') as file:
lines = [line for line in file]
index = -1
dir_path = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"
for row in tqdm(csv.reader(lines), total=len(lines)):
index += 1
if index < 0:
continue
prefix = f"{row[4]}_{row[2]}"
matched_file = ""
for filename in os.listdir(dir_path):
if fnmatch.fnmatch(filename, f'{prefix}*'):
matched_file = filename
break
if matched_file:
new_filename = matched_file.replace(row[4], row[5], 1)
os.rename(os.path.join(dir_path, matched_file), os.path.join(dir_path, new_filename))
'''