24_deb_pkg_gov/12825_revision/misc_data_files/validation.py

import pandas as pd
import os
import csv
from tqdm import tqdm
import fnmatch

# Load the README manifest into a DataFrame.
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020525_README_manifest.csv'  # Replace with your CSV file path
# Column holding the file path of each manifest entry. It has to be defined
# here: the existence check further down reads df[column_name], and leaving
# this assignment commented out makes the script stop with a NameError.
# NOTE(review): 'new_filepath' appears to hold bare filenames rather than full
# paths, so the existence check is relative to the working directory -- confirm.
column_name = 'new_filepath'     # Replace with your column name containing file paths

# Read the CSV file
df = pd.read_csv(csv_file)

# Function to check if a file exists
def check_file_exists(file_path):
    """Return True if ``file_path`` refers to an existing filesystem entry.

    The function is applied element-wise to a DataFrame column, where empty
    CSV cells arrive as NaN (a float) rather than as a string. Such values
    cannot name a file, so they are reported as missing instead of letting
    ``os.path.exists`` raise ``TypeError``.

    Args:
        file_path: Candidate path (``str``, ``bytes`` or ``os.PathLike``).
            Any other value, such as ``None`` or NaN, is treated as missing.

    Returns:
        bool: True when the path exists, False otherwise.
    """
    if not isinstance(file_path, (str, bytes, os.PathLike)):
        return False
    return os.path.exists(file_path)

# Flag every manifest row with whether its path currently exists on disk.
path_values = df[column_name]
df['file_exists'] = path_values.apply(check_file_exists)

# Keep only the rows whose file could not be found.
not_found_mask = df['file_exists'] == False
missing_files_df = df[not_found_mask]

# Report the rows that point at missing files.
print("Files that do not exist:")
print(missing_files_df)

#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits'  # Replace with your source directory path
#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your target directory path

# Directory of per-repository source files, and the directory searched for
# each one's counterpart.
source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/main_commit_data/readme/'  # Replace with your source directory path
target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme'  # Replace with your target directory path

# Take one listing of each directory.
source_files = os.listdir(source_directory)
target_files = os.listdir(target_directory)

# Pair every source file with the first target file whose name contains the
# source file's repo id; source files without any such target go to no_pair.
no_pair = []
manifest = []
for source_name in source_files:
    # The repo id is the file name minus its last 12 characters and minus one
    # leading underscore when present.
    # NOTE(review): the 12 characters are presumably a fixed suffix such as
    # '_commits.csv' -- confirm against the actual file names.
    id_start = 1 if source_name.startswith('_') else 0
    repo_id = source_name[id_start:-12]

    # Substring match; the first hit in listing order wins.
    pair_file = next(
        (target_name for target_name in target_files if repo_id in target_name),
        None,
    )
    if pair_file is None:
        no_pair.append(source_name)
        continue
    manifest.append([repo_id, source_name, pair_file])

print(no_pair)
manifest_df = pd.DataFrame(
    manifest,
    columns=['repo_id', 'commits_filepath', 'fvf_filepath'],
)
print(manifest_df.shape[0])
manifest_df.to_csv("0205_all_fvf_README_manifest-link.csv", index=False)
#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv
    #source_file_path = os.path.join(source_directory, file_name)
    #target_file_path = os.path.join(target_directory, file_name)
    
    #if os.path.isfile(source_file_path):  
    #    if os.path.exists(target_file_path):
    #        print(f"File '{file_name}' exists in both directories.")
    #    else:
    #        print(f"File '{file_name}' does not exist in the target directory.")

# Load the manifest whose filename column is checked against a directory.
csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv'  # Replace with your CSV file path
column_name = 'fvf_filepath'    # Replace with your column name containing filenames

# Read the CSV file
df = pd.read_csv(csv_file)

# List all filenames in the specified directory
directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme'  # Replace with your directory path
directory_files = os.listdir(directory_path)
# Set copy of the listing so each membership test below is O(1) instead of a
# scan over the whole directory listing.
directory_file_set = set(directory_files)

filenames_in_csv = df[column_name]

# Report rows that are exact duplicates of another row (all copies are shown).
duplicates = df.duplicated(keep=False)
duplicate_rows = df[duplicates]
print("\nDuplicate rows in the DataFrame:")
print(duplicate_rows)

# Collect the filenames listed in the CSV column that are absent from the
# directory. The direction matters: this does not report directory files that
# are missing from the CSV.
missing_files = [file for file in filenames_in_csv if file not in directory_file_set]

# The label now states what is actually computed; it previously described the
# opposite comparison.
print("Filenames in the CSV column that are not present in the directory:")
for file in missing_files:
    print(file)

# Load the link manifest and check it against the lagged README directory.
csv_file = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/0205_all_fvf_README_manifest-link.csv"  # Replace with your CSV file path
column_name = 'fvf_filepath'    # Replace with your column name containing filenames
df = pd.read_csv(csv_file)

file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/'
directory_files = os.listdir(file_directory)

# Every file in the directory is expected to be referenced by exactly one
# manifest row; print the matching rows (possibly none) whenever it is not.
for dir_entry in directory_files:
    row_mask = df[column_name] == dir_entry
    matching_rows = df[row_mask]
    if len(matching_rows) == 1:
        continue
    print(matching_rows)
'''
'''
# Build the README manifest: for every data row of the publication-commits
# CSV, look for a lagged README file named "<row index>_<repo_id>*". Rows with
# a match go to the manifest (with the file name in 'new_filepath'); rows
# without one go to the "missing" CSV.
with open("../misc_data_files/main/0205_README_publication_commits.csv", 'r') as file:
    lines = file.readlines()

# This section never modifies the directory, so it is listed once rather than
# once per CSV row.
lagged_readme_files = os.listdir("/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/")

manifest_columns = [
    'commit_hash',
    'upstream_vcs_link',
    'repo_id',
    'project_handle',
    'lagged_hash',
    'project_index',
]
new_manifest_list = []
missing_list = []
for index, row in enumerate(tqdm(csv.reader(lines), total=len(lines))):
    # Row 0 is the header; data rows are therefore numbered from 1.
    if index < 1:
        continue
    record = {
        'commit_hash': row[0],
        'upstream_vcs_link': row[15],
        'repo_id': row[13],
        'project_handle': row[14],
        'lagged_hash': row[12],
        'project_index': index,
    }
    prefix = f"{index}_{record['repo_id']}"
    matched_file = ""
    for filename in lagged_readme_files:
        # No early exit: when several files share the prefix, the last one in
        # listing order is kept, as before.
        if fnmatch.fnmatch(filename, f'{prefix}*'):
            matched_file = filename
    if matched_file == "":
        missing_list.append(record)
    else:
        record['new_filepath'] = matched_file
        new_manifest_list.append(record)

# Build each frame directly from the collected records. Unlike pd.concat on a
# list of one-row frames, this also works when a list is empty (for example
# when no file is missing), in which case a header-only CSV is written instead
# of the script failing before any output is saved.
manifest_df = pd.DataFrame(new_manifest_list, columns=manifest_columns + ['new_filepath'])
missing_df = pd.DataFrame(missing_list, columns=manifest_columns)
missing_df.to_csv("020625_README_missing.csv", index=False)
manifest_df.to_csv("020525_README_manifest.csv", index=False)
'''
'''
# Rename lagged README files for the entries listed in the "missing" CSV.
# For each CSV row, the first directory entry whose name starts with
# "<row[4]>_<row[2]>" has its first occurrence of row[4] replaced by row[5].
# NOTE(review): every row is processed, including the first one -- the former
# `index < 0` guard could never be true. If the CSV starts with a header row,
# that row is looked up too (and presumably matches nothing); confirm.
with open("/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020625_README_missing.csv", 'r') as file:
    lines = list(file)

dir_path = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"
for row in tqdm(csv.reader(lines), total=len(lines)):
    prefix = f"{row[4]}_{row[2]}"
    # The directory is listed again for every row because earlier iterations
    # may already have renamed files in it.
    matched_file = next(
        (entry for entry in os.listdir(dir_path) if fnmatch.fnmatch(entry, prefix + '*')),
        "",
    )
    if not matched_file:
        continue
    new_filename = matched_file.replace(row[4], row[5], 1)
    old_path = os.path.join(dir_path, matched_file)
    new_path = os.path.join(dir_path, new_filename)
    os.rename(old_path, new_path)
'''
fix more file validation 2025-01-31 22:50:18 +00:00			`import pandas as pd`
			`import os`
updates to readme 2025-02-06 23:52:01 +00:00			`import csv`
			`from tqdm import tqdm`
			`import fnmatch`

fix more file validation 2025-01-31 22:50:18 +00:00			`# Load the CSV file into a DataFrame`
updates to readme 2025-02-06 23:52:01 +00:00			`csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020525_README_manifest.csv' # Replace with your CSV file path`
			`#column_name = 'new_filepath' # Replace with your column name containing file paths`
fix more file validation 2025-01-31 22:50:18 +00:00
			`# Read the CSV file`
			`df = pd.read_csv(csv_file)`

			`# Function to check if a file exists`
			`def check_file_exists(file_path):`
			`return os.path.exists(file_path)`

			`# Apply the function to the DataFrame column`
			`df['file_exists'] = df[column_name].apply(check_file_exists)`

			`missing_files_df = df[df['file_exists'] == False]`

			`# Print the DataFrame with missing file paths`
			`print("Files that do not exist:")`
			`print(missing_files_df)`
updates to readme 2025-02-06 23:52:01 +00:00
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`#source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/readme_commits' # Replace with your source directory path`
			`#target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme' # Replace with your target directory path`
fix more file validation 2025-01-31 22:50:18 +00:00
getting lagged documents, now with index 2025-02-06 05:50:54 +00:00			`source_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/final_data/main_commit_data/readme/' # Replace with your source directory path`
			`target_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme' # Replace with your target directory path`
fix more file validation 2025-01-31 22:50:18 +00:00
			`# List all files in the source directory`
			`source_files = os.listdir(source_directory)`
			`target_files = os.listdir(target_directory)`

			`# Iterate through the files and check if they exist in the target directory`
			`no_pair = []`
			`manifest = []`
			`for file in source_files:`
			`if file.startswith('_'):`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`repo_id = file[1:-12]`
fix more file validation 2025-01-31 22:50:18 +00:00			`else:`
			`repo_id = file[:-12]`
			`has_fvf = False`
			`pair_file = ""`
			`for file_name in target_files:`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`if repo_id in file_name:`
fix more file validation 2025-01-31 22:50:18 +00:00			`has_fvf = True`
			`pair_file = file_name`
			`break`
			`if has_fvf == False:`
			`no_pair.append(file)`
			`else:`
			`manifest.append([repo_id, file, pair_file])`

			`print(no_pair)`
			`manifest_df = pd.DataFrame(manifest, columns=['repo_id', 'commits_filepath', 'fvf_filepath'])`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`print(len(manifest_df))`
getting lagged documents, now with index 2025-02-06 05:50:54 +00:00			`manifest_df.to_csv(f"0205_all_fvf_README_manifest-link.csv", index=False)`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`#12825_revision/misc_data_files/main/all_fvf_CONTRIBUTING_manifest-link.csv`
fix more file validation 2025-01-31 22:50:18 +00:00			`#source_file_path = os.path.join(source_directory, file_name)`
			`#target_file_path = os.path.join(target_directory, file_name)`

			`#if os.path.isfile(source_file_path):`
			`# if os.path.exists(target_file_path):`
			`# print(f"File '{file_name}' exists in both directories.")`
			`# else:`
			`# print(f"File '{file_name}' does not exist in the target directory.")`
getting lagged documents, now with index 2025-02-06 05:50:54 +00:00
fix more file validation 2025-01-31 22:50:18 +00:00			`# Load the CSV file into a DataFrame`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`csv_file = '/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/test_second_batch_readme_manifest.csv' # Replace with your CSV file path`
fix more file validation 2025-01-31 22:50:18 +00:00			`column_name = 'fvf_filepath' # Replace with your column name containing filenames`

			`# Read the CSV file`
			`df = pd.read_csv(csv_file)`

			`# List all filenames in the specified directory`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`directory_path = '/data/users/mgaughan/kkex/012825_cam_revision_main/second_batch/first_version_documents/validation_readme' # Replace with your directory path`
fix more file validation 2025-01-31 22:50:18 +00:00			`directory_files = os.listdir(directory_path)`

			`# Convert the filenames column of the DataFrame to a set for efficient lookup`
			`filenames_in_csv = df[column_name]`
			`duplicates = df.duplicated(keep=False)`
			`duplicate_rows = df[duplicates]`
			`print("\nDuplicate rows in the DataFrame:")`
			`print(duplicate_rows)`
			`# Check which filenames in the directory are not reflected in the column`
(hopefully final) data cleaning and validation 2025-02-02 07:26:52 +00:00			`missing_files = [file for file in filenames_in_csv if file not in directory_files]`
fix more file validation 2025-01-31 22:50:18 +00:00
			`# Print the filenames that are not reflected in the column`
			`print("Filenames in the directory that are not reflected in the CSV column:")`
			`for file in missing_files:`
			`print(file)`

getting lagged documents, now with index 2025-02-06 05:50:54 +00:00			`csv_file = "/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/0205_all_fvf_README_manifest-link.csv" # Replace with your CSV file path`
fix more file validation 2025-01-31 22:50:18 +00:00			`column_name = 'fvf_filepath' # Replace with your column name containing filenames`
			`# Read the CSV file`
			`df = pd.read_csv(csv_file)`

getting lagged documents, now with index 2025-02-06 05:50:54 +00:00			`file_directory = '/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/'`
fix more file validation 2025-01-31 22:50:18 +00:00			`directory_files = os.listdir(file_directory)`
			`for file in directory_files:`
			`filtered_df = df[df[column_name] == file]`
			`if len(filtered_df) != 1:`
			`print(filtered_df)`
updates to readme 2025-02-06 23:52:01 +00:00			`'''`
			`'''`
			`with open("../misc_data_files/main/0205_README_publication_commits.csv", 'r') as file:`
			`csv_reader = csv.DictReader(file)`
			`lines = [line for line in file]`
			`new_manifest_list = []`
			`missing_list = []`
			`index = -1`
			`for row in tqdm(csv.reader(lines), total=len(lines)):`
			`index += 1`
			`if index < 1:`
			`continue`
			`manifest_df = pd.DataFrame({`
			`'commit_hash': [row[0]],`
			`'upstream_vcs_link': [row[15]],`
			`'repo_id': [row[13]],`
			`'project_handle': [row[14]],`
			`'lagged_hash': [row[12]],`
			`'project_index': index`
			`})`
			`prefix = f"{index}_{manifest_df['repo_id'][0]}"`
			`matched_file = ""`
			`for filename in os.listdir("/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"):`
			`if fnmatch.fnmatch(filename, f'{prefix}*'):`
			`matched_file = filename`
			`if matched_file == "":`
			`missing_list.append(manifest_df)`
			`else:`
			`manifest_df['new_filepath'] = matched_file`
			`new_manifest_list.append(manifest_df)`
			`manifest_df = pd.concat(new_manifest_list, ignore_index=True)`
			`missing_df = pd.concat(missing_list, ignore_index=True)`
			`missing_df.to_csv(f"020625_README_missing.csv", index=False)`
			`manifest_df.to_csv(f"020525_README_manifest.csv", index=False)`
			`'''`
			`'''`
			`with open("/home/SOC.NORTHWESTERN.EDU/nws8519/git/24_deb_pkg_gov/12825_revision/misc_data_files/020625_README_missing.csv", 'r') as file:`
			`lines = [line for line in file]`
			`index = -1`
			`dir_path = "/data/users/mgaughan/kkex/012825_cam_revision_main/lagged_files/readme/"`
			`for row in tqdm(csv.reader(lines), total=len(lines)):`
			`index += 1`
			`if index < 0:`
			`continue`
			`prefix = f"{row[4]}_{row[2]}"`
			`matched_file = ""`
			`for filename in os.listdir(dir_path):`
			`if fnmatch.fnmatch(filename, f'{prefix}*'):`
			`matched_file = filename`
			`break`

			`if matched_file:`
			`new_filename = matched_file.replace(row[4], row[5], 1)`
			`os.rename(os.path.join(dir_path, matched_file), os.path.join(dir_path, new_filename))`
			`'''`