# mw-lifecycle-analysis/p2/quest/python_scripts/biberplus_labeling.py
'''
Earlier Neurobiber approach, kept here for reference:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random
import pandas as pd
MODEL_NAME = "Blablablab/neurobiber"
CHUNK_SIZE = 512 # Neurobiber was trained with max_length=512
# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
"BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
"BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
"BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
"BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
"BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
"BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
"BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
"BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
"BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
"BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
"BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
"BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
'''
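# For reference, a minimal sketch of how the retired Neurobiber path in the
# docstring above might run. This is an assumption-laden illustration, not the
# original load_model_and_tokenizer(): it presumes the standard transformers
# multilabel pattern (sigmoid over logits, 0.5 threshold) and hardcodes the
# MODEL_NAME and CHUNK_SIZE constants from the docstring. It is never called
# by this script.
def neurobiber_predict_sketch(texts):
    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained("Blablablab/neurobiber")
    model = AutoModelForSequenceClassification.from_pretrained("Blablablab/neurobiber")
    model.eval()
    inputs = tokenizer(texts, truncation=True, max_length=512,
                       padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # one 0/1 vector of 96 feature indicators per input text
    return (torch.sigmoid(logits) > 0.5).long().numpy()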
import random
import re
import pandas as pd
from biberplus.tagger import load_config, load_pipeline, calculate_tag_frequencies
def biberplus_labeler(text):
    print(f"labeling {len(text)} messages")
    config = load_config()
    # Biber tags only, counts normalized per 100 tokens; GPU is not needed here
    config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
    pipeline = load_pipeline(config)
    features_list = []
    cleaned_messages = []
    for message in text:
        # comment_text preprocessing per https://arxiv.org/pdf/1902.07093
        # 1. replace code with CODE (block code first, so the inline pattern
        #    does not eat the ``` fences)
        comment_text = re.sub(r'```[\s\S]+?```', 'CODE', message)  # block code
        comment_text = re.sub(r'`[^`]+`', 'CODE', comment_text)  # inline code
        # 2. replace quoted lines with QUOTE
        lines = comment_text.split('\n')
        lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines]
        comment_text = '\n'.join(lines)
        # 3. replace Gerrit URLs with GERRIT_URL, then all remaining URLs with URL
        gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+'
        comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text)
        url_pattern = r'https?://[^\s]+'
        comment_text = re.sub(url_pattern, 'URL', comment_text)
        # 4. if possible, replace @-mentions with SCREEN_NAME, keeping the
        #    leading whitespace captured in group 1
        cleaned_message = re.sub(r'(^|\s)@\w+', r'\1SCREEN_NAME', comment_text)
        cleaned_messages.append(cleaned_message)
        # one row of per-tag normalized frequencies for this message
        message_label = calculate_tag_frequencies(cleaned_message, pipeline, config)
        mean_row = message_label.set_index('tag')['mean']
        mean_row = mean_row.rename(lambda tag: f"normalized_{tag}")
        features_list.append(mean_row)
    print(f"collected {len(features_list)} feature rows")
    frequencies_df = pd.DataFrame(features_list)
    frequencies_df['message'] = text
    frequencies_df['cleaned_messages'] = cleaned_messages
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df
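# Example (hedged): the labeler on two toy messages. Each output row carries
# one normalized_<TAG> column per Biber tag plus the raw and cleaned text.
# The sample strings below are illustrative, not from the dataset:
#
#   sample = ["See `foo()` at https://example.org", "> quoted reply\nthanks @alice"]
#   sample_df = biberplus_labeler(sample)
#   print(sample_df.filter(like="normalized_").shape)  # (2, n_tags)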
if __name__ == "__main__":
    # load the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    # format the comments for the biberplus pipeline
    docs = first_discussion_df["comment_text"].astype(str).tolist()
    # run the labeler, then attach the normalized tag columns to the originals;
    # axis=1 concat aligns on the shared RangeIndex, i.e. by row position
    preds_df = biberplus_labeler(docs)
    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
    print(first_discussion_df)
    print(final_discussion_df)
    # assert that row order was preserved by the concat
    for _ in range(1000):
        random_index = random.randrange(len(final_discussion_df))
        assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
    # assert that first_discussion_df and final_discussion_df have the same number of rows
    assert len(first_discussion_df) == len(final_discussion_df)
    # comment_text already holds the raw message, so drop the duplicated column
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv", index=False)
    print('biberplus labeling pau')
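    # Optional sanity check (a sketch, not part of the original pipeline):
    # re-read the CSV we just wrote and confirm the row count survives the
    # round trip
    reread_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_biberplus_labels.csv")
    assert len(reread_df) == len(final_discussion_df)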