adding biberplus labels

2025-07-23 15:20:26 -05:00 · 2025-07-23 15:20:26 -05:00 · b0584ec1be
commit b0584ec1be
parent edd17d3269
3 changed files with 151731 additions and 15 deletions
--- a/p2/quest/072325_biberplus_labels.csv
+++ b/p2/quest/072325_biberplus_labels.csv
--- a/p2/quest/biberplus-categorization.log
+++ b/p2/quest/biberplus-categorization.log
@@ -1,3 +1,36 @@
-starting the job at: Tue Jul 22 16:43:27 CDT 2025
+starting the job at: Wed Jul 23 14:49:04 CDT 2025
 setting up the environment
 running the biberplus labeling script
+26024
+26024
+           id  ... http_flag
+0       56791  ...       NaN
+1      269631  ...       NaN
+2      269628  ...       NaN
+3      269622  ...       NaN
+4       56737  ...       NaN
+...       ...  ...       ...
+26019  403186  ...      True
+26020   78646  ...      True
+26021  429163  ...      True
+26022  429137  ...      True
+26023  418783  ...      True
+
+[26024 rows x 22 columns]
+           id  ...                                            message
+0       56791  ...  pawn character editing\n\nseen on master branc...
+1      269631  ...  Change 86685 merged by jenkins-bot:\nFollow-up...
+2      269628  ...  *** Bug 54785 has been marked as a duplicate o...
+3      269622  ...  Change 86685 had a related patch set uploaded ...
+4       56737  ...  **Author:** `Wikifram`\n\n**Description:**\nAf...
+...       ...  ...                                                ...
+26019  403186  ...  Could you attach a screenshot please? Drag & d...
+26020   78646  ...  Hi,\n\nWe have a wiki which has a part which c...
+26021  429163  ...  Sorry for not reply-ing. I did a test and coul...
+26022  429137  ...                        @DikkieDick: Please answer.
+26023  418783  ...  I cannot replicate this. What's the name of th...
+
+[26024 rows x 121 columns]
+biberplus labeling pau
+job finished, cleaning up
+job pau at: Wed Jul 23 14:58:09 CDT 2025
--- a/p2/quest/python_scripts/biberplus_labeling.py
+++ b/p2/quest/python_scripts/biberplus_labeling.py
@@ -39,7 +39,6 @@ def biberplus_labeler(text):
    config = load_config()
    config.update({'use_gpu': False, 'biber': True, 'function_words': False, 'token_normalization': 100})
    pipeline = load_pipeline(config)
-    #test =  ['London-based DJ Imogen takes on the NTS airwaves, bouncing between fuzzy electro and punishing techno.', ' Built upon the spaCy library, it delivers fast part-of-speech tagging along with supplemental features such as a function word tagger, PCA, and factor analysis']
    features_list = []
    for message in text:
        message_label = calculate_tag_frequencies(message, pipeline, config)
@@ -48,18 +47,11 @@ def biberplus_labeler(text):
        features_list.append(mean_row)
    print(len(features_list))
    frequencies_df = pd.DataFrame(features_list)
-    frequencies_df['comment_text'] = text
+    frequencies_df['message'] = text
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df
 
 if __name__ == "__main__":
-    #https://huggingface.co/Blablablab/neurobiber
-    '''
-    docs = [
-    "First text goes here.",
-    "Second text, slightly different style."
-    ]
-    '''
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
@@ -81,15 +73,18 @@ if __name__ == "__main__":
        how='inner'
    )
    '''
-    print(len(final_discussion_df))
+    print(first_discussion_df)
+    print(final_discussion_df)
    #final_discussion_df["biberplus_preds"] = list(preds)
    #assert that order has been preserved 
-    for _ in range(10):
-        random_index = random.choice(first_discussion_df.index)
-        assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
+    for _ in range(1000):
+        random_index = random.randrange(len(final_discussion_df))
+        assert first_discussion_df.iloc[random_index]["id"] == final_discussion_df.iloc[random_index]["id"]
+        #assert first_discussion_df.loc[random_index, "comment_text"] == final_discussion_df.loc[random_index, "comment_text"]
    #assert that there are the same number of rows in first_discussion_df and second_discussion_df
    assert len(first_discussion_df) == len(final_discussion_df)
+    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
-    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072225_biberplus_labels.csv", index=False)
+    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072325_biberplus_labels.csv", index=False)
    print('biberplus labeling pau')