'''
Legacy Neurobiber setup, kept for reference; the active script below uses biberplus instead.

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random
import pandas as pd

MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512  # Neurobiber was trained with max_length=512

# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
    "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
    "BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
    "BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
    "BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
    "BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
    "BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
    "BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
    "BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
    "BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
    "BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
    "BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
    "BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
    "BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
    "BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
    "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
    "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
'''

import pandas as pd
import numpy as np
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
import cupy
import random
import re


def biberplus_labeler(text):
    """Tag each message with normalized Biber feature frequencies via biberplus."""
    print(len(text))
    config = load_config()
    config.update({'use_gpu': False, 'biber': True,
                   'function_words': False, 'token_normalization': 100})
    pipeline = load_pipeline(config)
    features_list = []
    cleaned_messages = []
    for message in text:
        # comment_text preprocessing per https://arxiv.org/pdf/1902.07093
        # 1. replace code with CODE (block code first so the inline pattern
        #    does not mangle triple-backtick fences)
        comment_text = re.sub(r'```[\s\S]+?```', 'CODE', message)   # block code
        comment_text = re.sub(r'`[^`]+`', 'CODE', comment_text)     # inline code
        # 2. replace quoted lines (starting with '>') with QUOTE
        lines = comment_text.split('\n')
        lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines]
        comment_text = '\n'.join(lines)
        # 3. replace Gerrit URLs with GERRIT_URL, then all other URLs with URL
        gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+'
        comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text)
        url_pattern = r'https?://[^\s]+'
        comment_text = re.sub(url_pattern, 'URL', comment_text)
        # 4. if possible, replace @-mentions with SCREEN_NAME
        #    (keep the captured leading whitespace so neighboring words are not glued together)
        cleaned_message = re.sub(r'(^|\s)@\w+', r'\1SCREEN_NAME', comment_text)
        cleaned_messages.append(cleaned_message)
        message_label = calculate_tag_frequencies(cleaned_message, pipeline, config)
        mean_row = message_label.set_index('tag')['mean']
        mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
        features_list.append(mean_row)
    print(len(features_list))
    frequencies_df = pd.DataFrame(features_list)
    frequencies_df['message'] = text
    frequencies_df['cleaned_messages'] = cleaned_messages
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df


if __name__ == "__main__":
    # loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    # formatting the comments as a list of strings for the labeler
    docs = first_discussion_df["comment_text"].astype(str).tolist()
    # run the biberplus labeler
    # model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    # new columns in the df for the predicted neurobiber items
    # preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    # preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
    # print(type(preds))
    # assigning the predictions as new columns
    '''
    final_discussion_df = pd.merge(
        first_discussion_df,
        preds_df,
        on='comment_text',  # replace with your actual key
        how='inner'
    )
    '''
    print(first_discussion_df)
    print(final_discussion_df)
    # final_discussion_df["biberplus_preds"] = list(preds)
    # assert that row order has been preserved
    for _ in range(1000):
        random_index = random.randrange(len(final_discussion_df))
        assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
        # assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    # assert that there are the same number of rows in first_discussion_df and final_discussion_df
    assert len(first_discussion_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if the prior asserts pass, write to a csv
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False)
    print('biberplus labeling pau')