24_deb_pkg_gov/12825_revision/error_isolation.py

import pandas as pd
import csv 
import os
import re

# Module-level accumulators.
# NOTE: main() builds its own local ``subset_list``, and ``other_list`` is only
# referenced from commented-out code in the visible part of this file.
subset_list, other_list = [], []
# Running tally of malformed rows; incremented by handle_bad_line().
bad_lines = 0

def handle_bad_line(line):
    """Tally a malformed row and tell the parser to skip it.

    Passed as the ``on_bad_lines`` callback to ``pd.read_csv`` in ``main``.
    The row content in *line* is ignored; only the module-level
    ``bad_lines`` counter is updated.

    Returns:
        None, so that the offending row is skipped.
    """
    global bad_lines
    bad_lines = bad_lines + 1
    return None

def main(input_dir="/data/users/mgaughan/kkex/012825_cam_revision_main/12825_output_files",
         output_path="charset_error_list.csv"):
    """Extract the charset ('utf-8') rows from every clone-error log.

    Every file in *input_dir* whose name ends with
    ``-clone-error-output.txt`` is parsed as a headerless CSV. Rows whose
    second column starts with ``" 'utf-8'"`` are collected and written to
    *output_path*. Three numbers are printed: the number of collected rows,
    the total count of URL matches found in the first column of all parsed
    rows, and the module-level ``bad_lines`` counter.

    Args:
        input_dir: Directory holding the error logs. Defaults to the
            original hard-coded location.
        output_path: Destination CSV for the collected rows.

    If no log could be parsed at all, the counts are still printed but no
    CSV is written (``pd.concat`` would otherwise raise ``ValueError``).
    """
    url_regex = r'(https?://[^\s]+)'
    # Sorted so the row order of the output does not depend on the
    # arbitrary ordering returned by os.listdir.
    error_files = sorted(
        f for f in os.listdir(input_dir) if f.endswith('-clone-error-output.txt')
    )
    subset_list = []
    all_error_count = 0
    for file in error_files:
        # Skipping malformed lines here assumes that the nonconforming
        # lines are cloning errors.
        try:
            error_file_df = pd.read_csv(
                os.path.join(input_dir, file),
                header=None,
                on_bad_lines=handle_bad_line,
                engine='python',
            )
        except pd.errors.EmptyDataError:
            # An empty log has nothing to contribute; keep processing the rest.
            continue
        error_file_df[1] = error_file_df[1].fillna('')
        # NOTE(review): if a log ever parses to four or more columns this
        # overwrites its existing column 3 -- confirm the logs never do.
        error_file_df[3] = error_file_df[0].str.count(url_regex, flags=re.IGNORECASE)
        char_error_vcs = error_file_df[error_file_df[1].str.startswith(" 'utf-8'")]
        subset_list.append(char_error_vcs)
        all_error_count += error_file_df[3].sum()

    result_df = pd.concat(subset_list, ignore_index=True) if subset_list else None

    print(0 if result_df is None else len(result_df))
    print(all_error_count)
    print(bad_lines)
    if result_df is None:
        print("no parsable clone-error logs found; nothing written")
        return
    result_df.to_csv(output_path, index=False)

# Run the error-isolation pass only when executed as a script, not on import.
if __name__ == "__main__":
    main()
updates to both intermediary script and data handling 2025-01-30 18:21:10 +00:00			`import pandas as pd`
			`import csv`
			`import os`
checking-data-scripts 2025-01-30 23:19:27 +00:00			`import re`
updates to both intermediary script and data handling 2025-01-30 18:21:10 +00:00
			`subset_list = []`
checking-data-scripts 2025-01-30 23:19:27 +00:00			`other_list = []`
			`bad_lines = 0`

			`def handle_bad_line(line):`
			`#print(f"Bad line at line {line_num}: {line}")`
			`# Skip the bad line`
			`global bad_lines`
			`bad_lines += 1`
			`return None`

			`def main():`
			`error_files = [f for f in os.listdir("/data/users/mgaughan/kkex/012825_cam_revision_main/12825_output_files") if f.endswith('-clone-error-output.txt')]`
			`subset_list = []`
getting lists of files to get docs for 2025-01-31 04:09:38 +00:00			`all_error_count = 0`
checking-data-scripts 2025-01-30 23:19:27 +00:00			`https_pattern = re.compile(r'(https?://[^\s]+)')`
			`for file in error_files:`
			`# error bad lines here makes the assumption that the errors that are nonconforming are cloning errors`
			`error_file_df = pd.read_csv("/data/users/mgaughan/kkex/012825_cam_revision_main/12825_output_files/" + file, header=None, on_bad_lines=handle_bad_line, engine='python')`
			`error_file_df[1] = error_file_df[1].fillna('')`
			`error_file_df[3] = error_file_df[0].str.count(r'(https?://[^\s]+)', flags=re.IGNORECASE)`
			`char_error_vcs = error_file_df[error_file_df[1].str.startswith(" 'utf-8'")]`
			`#other_errors = error_file_df[~error_file_df[1].str.startswith(" 'utf-8'")]`
			`subset_list.append(char_error_vcs)`
getting lists of files to get docs for 2025-01-31 04:09:38 +00:00			`all_error_count += error_file_df[3].sum()`
checking-data-scripts 2025-01-30 23:19:27 +00:00			`#other_list.append(other_errors)`
updates to both intermediary script and data handling 2025-01-30 18:21:10 +00:00
checking-data-scripts 2025-01-30 23:19:27 +00:00			`result_df = pd.concat(subset_list, ignore_index=True)`
			`#other_df = pd.concat(other_list, ignore_index=True )`
updates to both intermediary script and data handling 2025-01-30 18:21:10 +00:00
checking-data-scripts 2025-01-30 23:19:27 +00:00			`print(len(result_df))`
getting lists of files to get docs for 2025-01-31 04:09:38 +00:00			`print(all_error_count)`
checking-data-scripts 2025-01-30 23:19:27 +00:00			`print(bad_lines)`
getting lists of files to get docs for 2025-01-31 04:09:38 +00:00			`result_df.to_csv("charset_error_list.csv", index=False)`
updates to both intermediary script and data handling 2025-01-30 18:21:10 +00:00
checking-data-scripts 2025-01-30 23:19:27 +00:00			`if __name__ == "__main__":`
			`main()`