updating data collection

This commit is contained in:
Matthew Gaughan 2025-01-29 15:51:09 -06:00
parent eefc940a7c
commit ba42ca4ca7
3 changed files with 15 additions and 11 deletions

View File

@ -10,16 +10,17 @@ import argparse
'''
RUNNING INSTRUCTIONS:
[1] set up tmux environment, most likely also using venv within it
[1] set up tmux environment, install requirements.txt
[2] edit this file where marked "FIX BELOW"
[3] install pip packages
[4] in your tmux environment, run the following three commands to handle password prompts
[3] in your tmux environment, run the following commands to handle password prompts
- eval "$(ssh-agent -s)"
- ssh-add ~/.ssh/id_ed25519
- export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no'
- export GIT_ASKPASS='false'
- export GIT_TERMINAL_PROMPT = '0'
[5] in tmux, run the script from the terminal as follows with your START and STOP values
- export GIT_TERMINAL_PROMPT='0'
[4] in tmux, run the script from the terminal as follows with your START and STOP values
- python3 intermediary_script.py --start_index START --stop_index STOP
[6] the password handling is imperfect, so I would appreciate if you could check on the script every so often in case anything hangs
[5] the password handling is imperfect, so I would appreciate it if you could check on the script every so often in case anything hangs
THANK YOU VERY MUCH - matt
'''
@ -209,7 +210,7 @@ def for_all_files(start_index, stop_index):
index=False,
)
except Exception as e:
clone_error.append([row[5], e])
clone_error.append([row[5], str(e)])
print(f"outside cloning error: {e}")
finally:
und_repo_id = ""
@ -224,7 +225,7 @@ def for_all_files(start_index, stop_index):
print(clone_error)
with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
for error in clone_error:
txt_file.write(error + "\n")
txt_file.write(', '.join(error) + "\n")
with open(f"{stop_index}-success-output.txt", "w") as txt_file:
txt_file.write(f"Number of Empty Rows: {empty_row} \n")
txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")

View File

@ -0,0 +1,3 @@
GitPython==3.1.40
pandas==2.1.2
tqdm==4.66.1

View File

@ -132,7 +132,7 @@ def diff_analysis(diffs):
def for_all_files(start_index, stop_index):
cwd = os.getcwd()
csv_path = "../final_data/deb_full_data.csv"
csv_path = "for_batching/deb_full_data.csv"
index = -1
saved = []
empty_row = 0
@ -209,7 +209,7 @@ def for_all_files(start_index, stop_index):
index=False,
)
except Exception as e:
clone_error.append([row[5], e])
clone_error.append([row[5], str(e)])
print(f"outside cloning error: {e}")
finally:
und_repo_id = ""
@ -224,7 +224,7 @@ def for_all_files(start_index, stop_index):
print(clone_error)
with open(f"{stop_index}-clone-error-output.txt", "w") as txt_file:
for error in clone_error:
txt_file.write(error + "\n")
txt_file.write(', '.join(error) + "\n")
with open(f"{stop_index}-success-output.txt", "w") as txt_file:
txt_file.write(f"Number of Empty Rows: {empty_row} \n")
txt_file.write(f"Number of Cloning Errors: {len(clone_error)} \n")