
updating library to account for re-running PCA

mgaughan 2025-09-23 16:41:32 -05:00
parent 032975c4f0
commit 265b930578
4 changed files with 244 additions and 42 deletions

View File

@ -1,36 +1,3 @@
starting the job at: Fri Jul 25 20:48:01 CDT 2025
starting the job at: Tue Sep 23 16:37:07 CDT 2025
setting up the environment
running the biberplus labeling script
26024
26024
id ... http_flag
0 56791 ... NaN
1 269631 ... NaN
2 269628 ... NaN
3 269622 ... NaN
4 56737 ... NaN
... ... ... ...
26019 403186 ... True
26020 78646 ... True
26021 429163 ... True
26022 429137 ... True
26023 418783 ... True
[26024 rows x 22 columns]
id ... cleaned_messages
0 56791 ... pawn character editing\n\nseen on master branc...
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
3 269622 ... Change 86685 had a related patch set uploaded ...
4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th...
... ... ... ...
26019 403186 ... Could you attach a screenshot please? Drag & d...
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
26022 429137 ... SCREEN_NAME: Please answer.
26023 418783 ... I cannot replicate this. What's the name of th...
[26024 rows x 122 columns]
biberplus labeling pau
job finished, cleaning up
job pau at: Fri Jul 25 20:55:26 CDT 2025

View File

@ -1,9 +1,207 @@
starting the job at: Thu Sep 4 15:41:55 CDT 2025
starting the job at: Tue Sep 23 16:37:06 CDT 2025
setting up the environment
running the neurobiber labeling script
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
PC1:
BIN_NNP: 0.760
BIN_CAP: 0.524
BIN_DET: -0.166
BIN_PREP: -0.157
BIN_PIN: -0.157
BIN_ART: -0.126
BIN_NN: -0.119
BIN_RB: -0.076
BIN_INF: -0.070
BIN_VPRT: -0.069
PC2:
BIN_PREP: 0.473
BIN_PIN: 0.473
BIN_NNP: 0.426
BIN_DET: 0.323
BIN_ART: 0.240
BIN_NOMZ: -0.233
BIN_VPRT: 0.142
BIN_RB: 0.132
BIN_SBJP: 0.119
BIN_PRP: 0.119
PC3:
BIN_CAP: 0.727
BIN_NN: 0.546
BIN_NNP: -0.363
BIN_PREP: 0.102
BIN_PIN: 0.102
BIN_DET: 0.058
BIN_ART: 0.056
BIN_SBJP: -0.048
BIN_PRP: -0.048
BIN_PRIV: 0.036
PC4:
BIN_NN: 0.659
BIN_CAP: -0.391
BIN_PRP: -0.260
BIN_SBJP: -0.260
BIN_NNP: 0.247
BIN_RB: -0.236
BIN_ART: 0.141
BIN_FPP1: -0.130
BIN_INF: -0.128
BIN_PREP: -0.127
PC5:
BIN_DET: 0.485
BIN_ART: 0.422
BIN_PIN: -0.421
BIN_PREP: -0.421
BIN_RB: 0.245
BIN_VPRT: 0.196
BIN_INDA: 0.142
BIN_NOMZ: -0.123
BIN_PRP: 0.108
BIN_SBJP: 0.108
PC6:
BIN_NOMZ: 0.368
BIN_NN: -0.345
BIN_DET: 0.344
BIN_RB: -0.339
BIN_ART: 0.326
BIN_JJ: 0.324
BIN_PRP: -0.262
BIN_SBJP: -0.262
BIN_FPP1: -0.144
BIN_INDA: 0.128
PC7:
BIN_JJ: 0.448
BIN_X: -0.439
BIN_QUOT: -0.375
BIN_NOMZ: 0.312
BIN_NN: 0.271
BIN_RB: 0.231
BIN_NUM: -0.179
BIN_VPRT: 0.179
BIN_INF: -0.169
BIN_NNP: 0.164
PC8:
BIN_RB: 0.623
BIN_PRP: -0.415
BIN_SBJP: -0.415
BIN_FPP1: -0.240
BIN_INF: 0.233
BIN_JJ: 0.150
BIN_AUXB: 0.147
BIN_NOMZ: -0.143
BIN_XX0: 0.110
BIN_SPAU: 0.103
PC9:
BIN_INF: 0.712
BIN_VPRT: -0.427
BIN_TO: 0.206
BIN_X: -0.190
BIN_AUXB: -0.179
BIN_NUM: -0.173
BIN_QUOT: -0.161
BIN_NOMZ: 0.159
BIN_CONJ: -0.122
BIN_PRIV: 0.102
PC10:
BIN_QUOT: 0.726
BIN_JJ: 0.496
BIN_CONT: 0.327
BIN_X: -0.170
BIN_NUM: -0.149
BIN_INF: 0.134
BIN_PASS: -0.080
BIN_NOMZ: -0.074
BIN_NN: 0.068
BIN_AUXB: -0.060
PC11:
BIN_X: 0.620
BIN_JJ: 0.575
BIN_NOMZ: -0.292
BIN_QUOT: -0.288
BIN_INF: 0.131
BIN_PRP: 0.125
BIN_SBJP: 0.125
BIN_CONT: -0.123
BIN_RB: -0.092
BIN_FPP1: 0.085
PC12:
BIN_VPRT: 0.529
BIN_AUXB: 0.431
BIN_RB: -0.404
BIN_INF: 0.364
BIN_TO: 0.187
BIN_ART: -0.186
BIN_PASS: 0.183
BIN_VBD: -0.158
BIN_BEMA: 0.128
BIN_DEMP: 0.110
PC13:
BIN_NUM: 0.554
BIN_X: -0.544
BIN_NOMZ: -0.509
BIN_JJ: 0.160
BIN_RB: -0.156
BIN_QUOT: -0.124
BIN_CONT: -0.109
BIN_NN: -0.103
BIN_VPRT: -0.081
BIN_NNP: -0.073
PC14:
BIN_NUM: 0.595
BIN_NOMZ: 0.366
BIN_VPRT: 0.348
BIN_AUXB: -0.332
BIN_VBD: -0.262
BIN_PASS: -0.188
BIN_CONT: 0.161
BIN_INF: 0.157
BIN_PGAS: -0.118
BIN_CONJ: -0.118
PC15:
BIN_AUXB: 0.484
BIN_NUM: 0.450
BIN_NOMZ: 0.315
BIN_VPRT: -0.307
BIN_VBD: 0.262
BIN_PASS: 0.207
BIN_BEMA: 0.194
BIN_CONJ: 0.170
BIN_PRIV: -0.162
BIN_QUOT: 0.159
PC16:
BIN_CONJ: 0.673
BIN_PGAS: -0.355
BIN_CCONJ: 0.324
BIN_SCONJ: -0.247
BIN_TO: -0.197
BIN_VBD: -0.185
BIN_WH: -0.164
BIN_FPP1: -0.128
BIN_PRIV: 0.113
BIN_DEMP: -0.096
PC17:
BIN_CCONJ: 0.471
BIN_CONT: 0.462
BIN_INDA: -0.260
BIN_XX0: 0.221
BIN_SCONJ: -0.216
BIN_CONJ: -0.210
BIN_SPAU: 0.199
BIN_DET: 0.197
BIN_FPP1: 0.196
BIN_QUOT: -0.185
PC18:
BIN_PGAS: 0.578
BIN_CCONJ: 0.564
BIN_CONT: -0.268
BIN_PRIV: -0.235
BIN_ANDC: 0.144
BIN_PASS: -0.143
BIN_QUOT: 0.138
BIN_SPAU: -0.125
BIN_VBD: -0.115
BIN_NOMZ: 0.114
Top 10 PC1 values:
PC1 PC2 ... priority closed_relevance
19873 125.128650 24.461032 ... Medium False
@ -63,4 +261,4 @@ Bottom 10 PC2 values:
[10 rows x 26 columns]
job finished, cleaning up
job pau at: Thu Sep 4 15:42:13 CDT 2025
job pau at: Tue Sep 23 16:37:56 CDT 2025
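
For reference, the "Variance of each PCA component" values above are the raw per-component variances of the projected vectors (np.var over axis 0, as computed in the script further down in this commit). A minimal, self-contained sketch of the same computation on illustrative random data, assuming scikit-learn; it also shows how the corresponding explained-variance quantities could be read off the fitted model:

import numpy as np
from sklearn.decomposition import PCA

# illustrative random binary matrix standing in for the 96 Biber features
rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(500, 96)).astype(float)

pca = PCA(n_components=18)
X_pca = pca.fit_transform(X)

# what the log prints: variance of each projected component (np.var uses ddof=0)
print(np.var(X_pca, axis=0))
# sklearn's own view of the same quantities (ddof=1) and their share of total variance
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)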

View File

@ -73,20 +73,28 @@ def biberplus_labeler(text):
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df
def make_text_for_analysis(row):
    if row['comment_type'] == "task_description":
        return f"{row['task_title']}.\n\n{row['comment_text']}"
    else:
        return row['comment_text']
if __name__ == "__main__":
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
    #docs = first_discussion_df["comment_text"].astype(str).tolist()
    task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
    docs = task_description_df['task_title'].astype(str).tolist()
    first_discussion_df['text_for_analysis'] = first_discussion_df.apply(make_text_for_analysis, axis=1)
    #task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
    docs = first_discussion_df['text_for_analysis'].astype(str).tolist()
    #load model and run
    #model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    #new columns in the df for the predicted neurobiber items
    #preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
    final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
    #print(type(preds))
    #assigning the predictions as a new column
    '''
@ -109,6 +117,6 @@ if __name__ == "__main__":
    assert len(task_description_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", index=False)
    print('biberplus labeling pau')

View File

@ -6,6 +6,28 @@ import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
"BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
"BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
"BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
"BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
"BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
"BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
"BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
"BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
"BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
"BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
"BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
"BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
def format_df_data(df):
@ -34,11 +56,18 @@ if __name__ == "__main__":
    '''
    pca = PCA(n_components=18)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    with open('092325_pca.pkl', 'wb') as f:
        pickle.dump(pca, f)
    selected_axis = "AuthorWMFAffil"
    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)
    for i, component in enumerate(pca.components_):
        print(f"PC{i+1}:")
        indices = np.argsort(np.abs(component))[::-1]
        for idx in indices[:10]: # Top 10
            print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}")
    #first looking at comment_type
    le = LabelEncoder()
@ -55,7 +84,7 @@ if __name__ == "__main__":
    pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
    plot_df = pd.DataFrame(pc_dict)
    plot_df.to_csv("090425_description_PCA_df.csv", index=False)
    plot_df.to_csv("092325_description_PCA_df.csv", index=False)
    print("Top 10 PC1 values:")
    print(plot_df.nlargest(10, "PC1"))
@ -92,5 +121,5 @@ if __name__ == "__main__":
    plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
    '''
    g.fig.tight_layout()
    g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300)
    g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300)
    plt.show()
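
The PCA script now pickles the fitted 18-component model to 092325_pca.pkl, presumably so later runs can reuse the projection instead of refitting it. The reload step is not part of this commit, so the following is only a hedged sketch, assuming the pickle sits in the working directory and that new inputs keep the same 96-feature column order as BIBER_FEATURES:

import pickle
import numpy as np

# load the PCA model saved above (path assumed relative to the job's working directory)
with open('092325_pca.pkl', 'rb') as f:
    pca = pickle.load(f)

# new_vecs: one row of 96 binary Biber features per document, in the column
# order used at fit time; random values here purely for illustration
new_vecs = np.random.randint(0, 2, size=(5, 96))
new_pcs = pca.transform(new_vecs)  # projects onto the 18 fitted components
print(np.var(new_pcs, axis=0))     # per-component variance, analogous to the log output above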