2025-01-30 18:21:10 +00:00
|
|
|
import pandas as pd
|
|
|
|
import csv
|
|
|
|
import os
|
2025-01-30 23:19:27 +00:00
|
|
|
import re
|
2025-01-30 18:21:10 +00:00
|
|
|
|
|
|
|
# Module-level accumulators.
# NOTE(review): main() creates its own local ``subset_list``, so this global is
# not populated by the code visible in this file; kept in case other code
# imports it.
subset_list = []
# Only referenced from commented-out code in main(); kept for the same reason.
other_list = []
|
|
|
|
# Running total of malformed rows skipped by handle_bad_line().
bad_lines = 0


def handle_bad_line(line):
    """Count one malformed row and tell pandas to drop it.

    Used as the ``on_bad_lines`` callable for ``pd.read_csv`` with
    ``engine='python'``. Each call increments the module-level ``bad_lines``
    counter.

    Args:
        line: The offending row as handed over by pandas; it is not inspected.

    Returns:
        None, which makes ``read_csv`` skip the row.
    """
    global bad_lines
    bad_lines += 1
    return None
|
|
|
|
|
|
|
|
def main(
    input_dir="/data/users/mgaughan/kkex/012825_cam_revision_main/12825_output_files",
    output_path="charset_error_list.csv",
):
    """Collect charset ('utf-8') errors from the clone-error logs.

    Reads every ``*-clone-error-output.txt`` file in ``input_dir`` as a
    header-less CSV, keeps the rows whose second column starts with
    ``" 'utf-8'"``, and writes them to ``output_path``. Three numbers are
    printed: the count of kept rows, the total of URL matches found in the
    first column across all rows read, and the module-level ``bad_lines``
    counter maintained by ``handle_bad_line``.

    Args:
        input_dir: Directory holding the error-output files.
        output_path: Destination CSV for the charset-error rows.
    """
    url_regex = r'(https?://[^\s]+)'
    # Sorted so the row order of the output does not depend on listdir order.
    error_files = sorted(
        f for f in os.listdir(input_dir) if f.endswith('-clone-error-output.txt')
    )

    charset_frames = []
    all_error_count = 0
    for file in error_files:
        # Rows that do not fit the column layout are dropped and counted by
        # handle_bad_line; this assumes nonconforming rows are cloning errors.
        try:
            error_file_df = pd.read_csv(
                os.path.join(input_dir, file),
                header=None,
                on_bad_lines=handle_bad_line,
                engine='python',
            )
        except pd.errors.EmptyDataError:
            # An empty log has nothing to contribute; do not abort the run.
            continue

        # A file whose first row has a single field yields no column 1.
        if 1 not in error_file_df.columns:
            error_file_df[1] = ''
        error_file_df[1] = error_file_df[1].fillna('')
        # Column 3 stores the number of URL matches in column 0.
        error_file_df[3] = error_file_df[0].str.count(url_regex, flags=re.IGNORECASE)

        char_error_vcs = error_file_df[error_file_df[1].str.startswith(" 'utf-8'")]
        charset_frames.append(char_error_vcs)
        all_error_count += error_file_df[3].sum()

    # pd.concat raises ValueError on an empty list, so guard the no-data case.
    if charset_frames:
        result_df = pd.concat(charset_frames, ignore_index=True)
    else:
        result_df = pd.DataFrame()

    print(len(result_df))
    print(all_error_count)
    print(bad_lines)
    result_df.to_csv(output_path, index=False)
|
2025-01-30 18:21:10 +00:00
|
|
|
|
2025-01-30 23:19:27 +00:00
|
|
|
# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|