diff --git a/p2/quest/biberplus-categorization.log b/p2/quest/biberplus-categorization.log
new file mode 100644
index 0000000..6f453be
--- /dev/null
+++ b/p2/quest/biberplus-categorization.log
@@ -0,0 +1,3 @@
+starting the job at: Tue Jul 22 16:43:27 CDT 2025
+setting up the environment
+running the biberplus labeling script
diff --git a/p2/quest/python_scripts/biberplus_labeling.py b/p2/quest/python_scripts/biberplus_labeling.py
new file mode 100644
index 0000000..1094110
--- /dev/null
+++ b/p2/quest/python_scripts/biberplus_labeling.py
@@ -0,0 +1,95 @@
+'''
+Earlier neurobiber-based draft (https://huggingface.co/Blablablab/neurobiber),
+kept here for reference:
+
+import torch
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import random
+import pandas as pd
+
+MODEL_NAME = "Blablablab/neurobiber"
+CHUNK_SIZE = 512 # Neurobiber was trained with max_length=512
+
+# List of the 96 features that Neurobiber can predict
+BIBER_FEATURES = [
+    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
+    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
+    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
+    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
+    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
+    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
+    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
+    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
+    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
+    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
+    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
+    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
+    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
+    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
+    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
+    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
+]
+'''
+import random
+
+import pandas as pd
+from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
+
+
+def biberplus_labeler(text):
+    '''Tag each comment with normalized Biber feature frequencies.'''
+    print(f"labeling {len(text)} comments")
+    # Biber tags only, normalized per 100 tokens; the tagger runs on CPU
+    config = load_config()
+    config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
+    pipeline = load_pipeline(config)
+    features_list = []
+    for message in text:
+        message_label = calculate_tag_frequencies(message, pipeline, config)
+        # keep each tag's mean frequency, prefixed so the columns are self-describing
+        mean_row = message_label.set_index('tag')['mean']
+        mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
+        features_list.append(mean_row)
+    print(f"collected features for {len(features_list)} comments")
+    frequencies_df = pd.DataFrame(features_list)
+    # features_list preserves input order, so the raw texts can be attached positionally
+    frequencies_df['comment_text'] = text
+    frequencies_df = frequencies_df.reset_index(drop=True)
+    return frequencies_df
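+# A minimal usage sketch (comments only, not executed by the job; the two
+# sample strings are placeholders carried over from the earlier draft):
+#
+#   demo_df = biberplus_labeler([
+#       "First text goes here.",
+#       "Second text, slightly different style."
+#   ])
+#   demo_df.filter(like="normalized_")  # one row per text, one column per Biber tag
+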
+if __name__ == "__main__":
+    # load in the discussion data from the universal CSV
+    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
+    # format the comments for the biberplus tagger
+    docs = first_discussion_df["comment_text"].astype(str).tolist()
+    # label every comment
+    preds_df = biberplus_labeler(docs)
+    # attach the normalized_* feature columns; both frames carry a fresh
+    # RangeIndex, so concat aligns the rows positionally
+    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
+    print(len(final_discussion_df))
+    # assert that order has been preserved
+    for _ in range(10):
+        random_index = random.choice(first_discussion_df.index)
+        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+    # assert that first_discussion_df and final_discussion_df have the same number of rows
+    assert len(first_discussion_df) == len(final_discussion_df)
+    # if the prior asserts pass, write the labeled data to a csv
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
+    print('biberplus labeling pau')
+
diff --git a/p2/quest/slurm_jobs/biberplus_label.sh b/p2/quest/slurm_jobs/biberplus_label.sh
new file mode 100644
index 0000000..df79d4f
--- /dev/null
+++ b/p2/quest/slurm_jobs/biberplus_label.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH -A p32852
+#SBATCH -p gengpu
+#SBATCH --gres=gpu:a100:1
+# one node is enough: the job runs a single python process
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --time=24:00:00
+#SBATCH --mem=64G
+#SBATCH --cpus-per-task=4
+#SBATCH --job-name=biberplus-categorization
+#SBATCH --output=biberplus-categorization.log
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=gaughan@u.northwestern.edu
+
+echo "starting the job at: $(date)"
+
+echo "setting up the environment"
+
+module purge
+eval "$(conda shell.bash hook)"
+conda activate neurobiber
+
+echo "running the biberplus labeling script"
+
+python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/biberplus_labeling.py
+
+echo "job finished, cleaning up"
+
+conda deactivate
+
+echo "job pau at: $(date)"
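+
+# Usage sketch (assumes standard SLURM tooling on Quest and that the
+# neurobiber conda env already has biberplus and its spaCy model installed):
+#   sbatch p2/quest/slurm_jobs/biberplus_label.sh
+#   squeue -u "$USER"                      # check queue status
+#   tail -f biberplus-categorization.log   # follow the job's log output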