'''
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random
import pandas as pd

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
'''
import pandas as pd
import random

from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies


def biberplus_labeler(texts):
    """Tag each message with Biber+ feature frequencies.

    Returns a DataFrame with one row per message: a normalized_<TAG> column
    for each Biber tag (mean frequency per token_normalization tokens) plus
    the original text in a 'message' column.
    """
    print(f"labeling {len(texts)} messages")
    config = load_config()
    config.update({'use_gpu': False,
                   'biber': True,
                   'function_words': False,
                   'token_normalization': 100})
    pipeline = load_pipeline(config)
    features_list = []
    for message in texts:
        message_label = calculate_tag_frequencies(message, pipeline, config)
        # keep the mean frequency per tag, prefixed so the columns are
        # unambiguous after the concat with the discussion dataframe
        mean_row = message_label.set_index('tag')['mean']
        mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
        features_list.append(mean_row)
    print(f"collected {len(features_list)} feature rows")
    frequencies_df = pd.DataFrame(features_list)
    frequencies_df['message'] = texts
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df


if __name__ == "__main__":
    # loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv(
        "/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    # formatting the comments for the labeler
    docs = first_discussion_df["comment_text"].astype(str).tolist()
    # load the pipeline and run
    # model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    # new columns in the df for the predicted items
    # preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    # preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
    # print(type(preds))
    # assigning the predictions as new columns
    '''
    final_discussion_df = pd.merge(
        first_discussion_df,
        preds_df,
        on='comment_text',  # replace with your actual key
        how='inner'
    )
    '''
    print(first_discussion_df)
    print(final_discussion_df)
    # final_discussion_df["biberplus_preds"] = list(preds)
    # assert that row order has been preserved by the concat
    for _ in range(1000):
        random_index = random.randrange(len(final_discussion_df))
        assert first_discussion_df.iloc[random_index]["id"] == \
            final_discussion_df.iloc[random_index]["id"]
        # assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    # assert that first_discussion_df and final_discussion_df have the same number of rows
    assert len(first_discussion_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
    final_discussion_df.to_csv(
        "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv",
        index=False)
    print('biberplus labeling pau')
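

# Optional smoke test, a minimal sketch rather than part of the pipeline:
# runs biberplus_labeler on two made-up messages and checks the output shape
# before committing to the full CSV run. The sample strings and the function
# name smoke_test are hypothetical; call it manually (e.g. from a REPL after
# importing this module) if you want the check.
def smoke_test():
    sample = ["I really think this is a great idea!",
              "Details are at https://example.com, please take a look."]
    sample_df = biberplus_labeler(sample)
    # expect one row per message, normalized_* columns plus 'message'
    assert len(sample_df) == len(sample)
    assert 'message' in sample_df.columns
    print(sample_df.head())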