updating with tentative neurobiber labels, need to verify outputs
This commit is contained in:
parent
c4dd45e344
commit
7e8fb1982b
151688
p2/quest/071425_neurobiber_labels.csv
Normal file
151688
p2/quest/071425_neurobiber_labels.csv
Normal file
File diff suppressed because one or more lines are too long
@ -1,6 +1,8 @@
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||||
|
import random
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
MODEL_NAME = "Blablablab/neurobiber"
|
MODEL_NAME = "Blablablab/neurobiber"
|
||||||
CHUNK_SIZE = 512 # Neurobiber was trained with max_length=512
|
CHUNK_SIZE = 512 # Neurobiber was trained with max_length=512
|
||||||
@ -93,5 +95,32 @@ def predict_text(model, tokenizer, text, chunk_size=CHUNK_SIZE, subbatch_size=32
|
|||||||
return batch_preds[0]
|
return batch_preds[0]
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("my brain hurts!")
|
|
||||||
#https://huggingface.co/Blablablab/neurobiber
|
#https://huggingface.co/Blablablab/neurobiber
|
||||||
|
'''
|
||||||
|
docs = [
|
||||||
|
"First text goes here.",
|
||||||
|
"Second text, slightly different style."
|
||||||
|
]
|
||||||
|
'''
|
||||||
|
#loading in the discussion data from the universal CSV
|
||||||
|
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||||
|
#formatting for the neurobiber model
|
||||||
|
docs = first_discussion_df["comment_text"].astype(str).tolist()
|
||||||
|
#load model and run
|
||||||
|
model, tokenizer = load_model_and_tokenizer()
|
||||||
|
preds = predict_batch(model, tokenizer, docs)
|
||||||
|
#new columns in the df for the predicted neurobiber items
|
||||||
|
preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
||||||
|
preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
||||||
|
final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
|
||||||
|
#spot-check 10 randomly chosen rows: comment_text must match between first_discussion_df and final_discussion_df
|
||||||
|
for _ in range(10):
|
||||||
|
random_index = random.choice(first_discussion_df.index)
|
||||||
|
assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
|
||||||
|
#assert that there are the same number of rows in first_discussion_df and final_discussion_df
|
||||||
|
assert len(first_discussion_df) == len(final_discussion_df)
|
||||||
|
# if passing the prior asserts, let's write to a csv
|
||||||
|
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/071425_neurobiber_labels.csv", index=False)
|
||||||
|
print('neurobiber labeling pau')
|
||||||
|
|
||||||
|
|
||||||
|
6
p2/quest/slurm_jobs/neurobiber-categorization.log
Normal file
6
p2/quest/slurm_jobs/neurobiber-categorization.log
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
starting the job at: Mon Jul 14 15:25:44 CDT 2025
|
||||||
|
setting up the environment
|
||||||
|
running the neurobiber labeling script
|
||||||
|
neurobiber labeling pau
|
||||||
|
job finished, cleaning up
|
||||||
|
job pau at: Mon Jul 14 15:27:39 CDT 2025
|
@ -20,7 +20,7 @@ module purge
|
|||||||
eval "$(conda shell.bash hook)"
|
eval "$(conda shell.bash hook)"
|
||||||
conda activate neurobiber
|
conda activate neurobiber
|
||||||
|
|
||||||
echo "running the p1 categorization script"
|
echo "running the neurobiber labeling script"
|
||||||
|
|
||||||
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py
|
python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/neurobiber_labeling.py
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[ ] generate clean rows for each comment from the discussion data
|
[ x ] generate clean rows for each comment from the discussion data
|
||||||
[ ] get data onto quest
|
[ x ] get data onto quest
|
||||||
[ ] run neurobiber over the data set, appending vectors onto the array
|
[ x ] run neurobiber over the data set, appending vectors onto the array
|
||||||
[ ] set up the unsupervised classification pipeline
|
[ ] set up the unsupervised classification pipeline
|
||||||
[ ] iterate the prompt
|
[ ] iterate the prompt
|
||||||
[ ] run the classification of the prompt
|
[ ] run the classification of the prompt
|
||||||
|
Loading…
Reference in New Issue
Block a user