1
0

updated the labels to try to store in a better format

This commit is contained in:
mgaughan 2025-07-15 14:17:46 -05:00
parent 7e8fb1982b
commit 43fb346318
4 changed files with 203750 additions and 4 deletions

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,6 @@
starting the job at: Tue Jul 15 14:09:10 CDT 2025
setting up the environment
running the neurobiber labeling script
neurobiber labeling pau
job finished, cleaning up
job pau at: Tue Jul 15 14:12:26 CDT 2025

View File

@ -110,9 +110,13 @@ if __name__ == "__main__":
model, tokenizer = load_model_and_tokenizer()
preds = predict_batch(model, tokenizer, docs)
#new columns in the df for the predicted neurobiber items
preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
#final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
#assigning the preditions as a new column
final_discussion_df = first_discussion_df.copy()
final_discussion_df["neurobiber_preds"] = list(preds)
#assert that order has been preserved
for _ in range(10):
random_index = random.choice(first_discussion_df.index)
@ -120,7 +124,7 @@ if __name__ == "__main__":
#assert that there are the same number of rows in first_discussion_df and second_discussion_df
assert len(first_discussion_df) == len(final_discussion_df)
# if passing the prior asserts, let's write to a csv
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False)
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071525_neurobiber_labels.csv", index=False)
print('neurobiber labeling pau')