"""Summarize clone-error output files.

Scans a directory for ``*-clone-error-output.txt`` files, parses each one as
a header-less CSV, and prints three numbers:

1. how many rows report a ``'utf-8'`` (character-set) error in column 1,
2. how many http(s) URLs appear in column 0 across all parsed rows,
3. how many malformed lines pandas had to skip while parsing.
"""

import csv
import os
import re

import pandas as pd

# Directory scanned when main() is called without arguments.
OUTPUT_DIR = "/data/users/mgaughan/kkex/012825_cam_revision_main/12825_output_files"
# Only files whose names end with this suffix are parsed.
ERROR_FILE_SUFFIX = '-clone-error-output.txt'
# Regex used to count URLs in the first column of each error file.
URL_PATTERN = r'(https?://[^\s]+)'
# Column-1 prefix that marks a character-set (decoding) error; the leading
# space is significant because read_csv keeps the space after the comma.
CHARSET_ERROR_PREFIX = " 'utf-8'"

# Module-level names kept for backward compatibility; main() uses its own
# local list and does not touch these.
subset_list = []
other_list = []
# Running count of malformed lines skipped by handle_bad_line().
bad_lines = 0


def handle_bad_line(line):
    """Count a malformed CSV line and tell pandas to drop it.

    Intended as the ``on_bad_lines`` callback of ``pd.read_csv`` (python
    engine), which passes the offending line as a list of string fields.

    Args:
        line: Fields of the malformed line (unused beyond being counted).

    Returns:
        None, which makes pandas skip the line.
    """
    global bad_lines
    bad_lines += 1
    return None


def main(output_dir=OUTPUT_DIR):
    """Print charset-error, URL, and bad-line counts for the error files.

    Args:
        output_dir: Directory containing the ``*-clone-error-output.txt``
            files. Defaults to the original hard-coded location.

    Raises:
        FileNotFoundError: If ``output_dir`` does not exist.
    """
    # Sorted so the row order of the concatenated result is deterministic;
    # os.listdir() returns entries in arbitrary order.
    error_files = sorted(
        name for name in os.listdir(output_dir)
        if name.endswith(ERROR_FILE_SUFFIX)
    )

    charset_frames = []
    # NOTE(review): this counts URLs in column 0 of *every* parsed row,
    # including the 'utf-8' rows -- confirm that is the intended meaning of
    # the "other error" figure.
    url_count = 0

    for name in error_files:
        path = os.path.join(output_dir, name)
        try:
            # Skipping bad lines here makes the assumption that the
            # nonconforming lines are cloning errors.
            error_file_df = pd.read_csv(
                path,
                header=None,
                on_bad_lines=handle_bad_line,
                engine='python',
            )
        except pd.errors.EmptyDataError:
            # An empty file holds no errors; skip it instead of aborting
            # the whole run.
            continue

        error_file_df[1] = error_file_df[1].fillna('')
        error_file_df[3] = error_file_df[0].str.count(
            URL_PATTERN, flags=re.IGNORECASE
        )
        is_charset_error = error_file_df[1].str.startswith(CHARSET_ERROR_PREFIX)
        charset_frames.append(error_file_df[is_charset_error])
        url_count += error_file_df[3].sum()

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when no file matched (or every file was empty).
    if charset_frames:
        result_df = pd.concat(charset_frames, ignore_index=True)
    else:
        result_df = pd.DataFrame()

    print(len(result_df))
    print(url_count)
    print(bad_lines)
    # Optional export of the charset-error rows:
    # result_df.to_csv("charset_error_list.csv", index=False)


if __name__ == "__main__":
    main()