From ba42ca4ca78c110cbb7b85b1f686f69205f03d4d Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Wed, 29 Jan 2025 15:51:09 -0600 Subject: [PATCH] updating data collection --- .../for_batching/intermediary_script.py | 17 +++++++++-------- 12825_revision/for_batching/requirements.txt | 3 +++ 12825_revision/intermediary_script.py | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) create mode 100644 12825_revision/for_batching/requirements.txt diff --git a/12825_revision/for_batching/intermediary_script.py b/12825_revision/for_batching/intermediary_script.py index 1c73672..7768b93 100644 --- a/12825_revision/for_batching/intermediary_script.py +++ b/12825_revision/for_batching/intermediary_script.py @@ -10,16 +10,17 @@ import argparse ''' RUNNING INSTRUCTIONS: -[1] set up tmux environment, most likely also using venv within it +[1] set up tmux environment, install requirements.txt [2] edit this file where marked "FIX BELOW" -[3] install pip packages -[4] in your tmux environment, run the following three commands to handle password prompts +[3] in your tmux environment, run the following commands to handle password prompts + - eval "$(ssh-agent -s)" + - ssh-add ~/.ssh/id_ed25519 - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no' - export GIT_ASKPASS='false' - - export GIT_TERMINAL_PROMPT = '0' -[5] in tmux, run the script from the terminal as follows with your START and STOP values + - export GIT_TERMINAL_PROMPT='0' +[4] in tmux, run the script from the terminal as follows with your START and STOP values - python3 intermediary_script.py --start_index START --stop_index STOP -[6] the password handling is imperfect, so I would appreciate if you could check on the script every so often in case anything hangs +[5] the password handling is imperfect, so I would appreciate if you could check on the script every so often in case anything hangs THANK YOU VERY MUCH - matt ''' @@ -209,7 +210,7 @@ def for_all_files(start_index, stop_index): index=False, ) except Exception as e: - clone_error.append([row[5], e]) + clone_error.append([row[5], str(e)]) print(f"outside cloning error: {e}") finally: und_repo_id = "" @@ -224,7 +225,7 @@ def for_all_files(start_index, stop_index): print(clone_error) with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file: for error in clone_error: - txt_file.write(error + "\n") + txt_file.write(', '.join(error) + "\n") with open(f"{stop_index}-success-output.txt", "w") as txt_file: txt_file.write(f"Number of Empty Rows: {empty_row} \n") txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n") diff --git a/12825_revision/for_batching/requirements.txt b/12825_revision/for_batching/requirements.txt new file mode 100644 index 0000000..d696d59 --- /dev/null +++ b/12825_revision/for_batching/requirements.txt @@ -0,0 +1,3 @@ +GitPython==3.1.40 +pandas==2.1.2 +tqdm==4.66.1 diff --git a/12825_revision/intermediary_script.py b/12825_revision/intermediary_script.py index 53d062d..c33f0a7 100644 --- a/12825_revision/intermediary_script.py +++ b/12825_revision/intermediary_script.py @@ -132,7 +132,7 @@ def diff_analysis(diffs): def for_all_files(start_index, stop_index): cwd = os.getcwd() - csv_path = "../final_data/deb_full_data.csv" + csv_path = "for_batching/deb_full_data.csv" index = -1 saved = [] empty_row = 0 @@ -209,7 +209,7 @@ def for_all_files(start_index, stop_index): index=False, ) except Exception as e: - clone_error.append([row[5], e]) + clone_error.append([row[5], str(e)]) print(f"outside cloning error: {e}") finally: und_repo_id = "" @@ -224,7 +224,7 @@ def for_all_files(start_index, stop_index): print(clone_error) with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file: for error in clone_error: - txt_file.write(error + "\n") + txt_file.write(', '.join(error) + "\n") with open(f"{stop_index}-success-output.txt", "w") as txt_file: txt_file.write(f"Number of Empty Rows: {empty_row} \n") txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")