adding biberplus labels

2025-07-23 15:20:26 -05:00 · 2025-07-23 15:20:26 -05:00 · b0584ec1be
commit b0584ec1be
parent edd17d3269
3 changed files with 151731 additions and 15 deletions
--- a/p2/quest/072325_biberplus_labels.csv
+++ b/p2/quest/072325_biberplus_labels.csv
--- a/p2/quest/biberplus-categorization.log
+++ b/p2/quest/biberplus-categorization.log
@ -1,3 +1,36 @@
-starting the job at: Tue Jul 22 16:43:27 CDT 2025
+starting the job at: Wed Jul 23 14:49:04 CDT 2025
 setting up the environment
 running the biberplus labeling script
 26024
 26024
           id  ... http_flag
 0       56791  ...       NaN
 1      269631  ...       NaN
 2      269628  ...       NaN
 3      269622  ...       NaN
 4       56737  ...       NaN
 ...       ...  ...       ...
 26019  403186  ...      True
 26020   78646  ...      True
 26021  429163  ...      True
 26022  429137  ...      True
 26023  418783  ...      True
 [26024 rows x 22 columns]
           id  ...                                            message
 0       56791  ...  pawn character editing\n\nseen on master branc...
 1      269631  ...  Change 86685 merged by jenkins-bot:\nFollow-up...
 2      269628  ...  *** Bug 54785 has been marked as a duplicate o...
 3      269622  ...  Change 86685 had a related patch set uploaded ...
 4       56737  ...  **Author:** `Wikifram`\n\n**Description:**\nAf...
 ...       ...  ...                                                ...
 26019  403186  ...  Could you attach a screenshot please? Drag & d...
 26020   78646  ...  Hi,\n\nWe have a wiki which has a part which c...
 26021  429163  ...  Sorry for not reply-ing. I did a test and coul...
 26022  429137  ...                        @DikkieDick: Please answer.
 26023  418783  ...  I cannot replicate this. What's the name of th...
 [26024 rows x 121 columns]
 biberplus labeling pau
 job finished, cleaning up
 job pau at: Wed Jul 23 14:58:09 CDT 2025
--- a/p2/quest/python_scripts/biberplus_labeling.py
+++ b/p2/quest/python_scripts/biberplus_labeling.py
@ -39,7 +39,6 @@ def biberplus_labeler(text):
    config = load_config()
    config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
    pipeline = load_pipeline(config)
    #test =  ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
    features_list = []
    for message in text:
        message_label = calculate_tag_frequencies(message, pipeline, config)
@ -48,18 +47,11 @@ def biberplus_labeler(text):
        features_list.append(mean_row)
    print(len(features_list))
    frequencies_df = pd.DataFrame(features_list)
-    frequencies_df['comment_text'] = text
+    frequencies_df['message'] = text
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df
 if __name__ == "__main__":
    #https://huggingface.co/Blablablab/neurobiber
    '''
    docs = [
    "First text goes here.",
    "Second text, slightly different style."
    ]
    '''
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
@ -81,15 +73,18 @@ if __name__ == "__main__":
        how='inner'
    )
    '''
-    print(len(final_discussion_df))
+    print(first_discussion_df)
    print(final_discussion_df)
    #final_discussion_df["biberplus_preds"] = list(preds)
    #assert that row order has been preserved by spot-checking randomly chosen rows
-    for _ in range(10):
+    for _ in range(1000):
-        random_index = random.choice(first_discussion_df.index)
+        random_index = random.randrange(len(final_discussion_df))
-        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+        assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
        #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    #assert that there are the same number of rows in first_discussion_df and final_discussion_df
    assert len(first_discussion_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
-    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
    print('biberplus labeling pau')