
updating library to account for re-running PCA

mgaughan 2025-09-23 16:41:32 -05:00
parent 032975c4f0
commit 265b930578
4 changed files with 244 additions and 42 deletions

View File

@ -1,36 +1,3 @@
starting the job at: Fri Jul 25 20:48:01 CDT 2025
starting the job at: Tue Sep 23 16:37:07 CDT 2025
setting up the environment
running the biberplus labeling script
26024
26024
id ... http_flag
0 56791 ... NaN
1 269631 ... NaN
2 269628 ... NaN
3 269622 ... NaN
4 56737 ... NaN
... ... ... ...
26019 403186 ... True
26020 78646 ... True
26021 429163 ... True
26022 429137 ... True
26023 418783 ... True
[26024 rows x 22 columns]
id ... cleaned_messages
0 56791 ... pawn character editing\n\nseen on master branc...
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
3 269622 ... Change 86685 had a related patch set uploaded ...
4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th...
... ... ... ...
26019 403186 ... Could you attach a screenshot please? Drag & d...
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
26022 429137 ... SCREEN_NAME: Please answer.
26023 418783 ... I cannot replicate this. What's the name of th...
[26024 rows x 122 columns]
biberplus labeling pau
job finished, cleaning up
job pau at: Fri Jul 25 20:55:26 CDT 2025

View File

@ -1,9 +1,207 @@
starting the job at: Thu Sep 4 15:41:55 CDT 2025
starting the job at: Tue Sep 23 16:37:06 CDT 2025
setting up the environment
running the neurobiber labeling script
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
PC1:
BIN_NNP: 0.760
BIN_CAP: 0.524
BIN_DET: -0.166
BIN_PREP: -0.157
BIN_PIN: -0.157
BIN_ART: -0.126
BIN_NN: -0.119
BIN_RB: -0.076
BIN_INF: -0.070
BIN_VPRT: -0.069
PC2:
BIN_PREP: 0.473
BIN_PIN: 0.473
BIN_NNP: 0.426
BIN_DET: 0.323
BIN_ART: 0.240
BIN_NOMZ: -0.233
BIN_VPRT: 0.142
BIN_RB: 0.132
BIN_SBJP: 0.119
BIN_PRP: 0.119
PC3:
BIN_CAP: 0.727
BIN_NN: 0.546
BIN_NNP: -0.363
BIN_PREP: 0.102
BIN_PIN: 0.102
BIN_DET: 0.058
BIN_ART: 0.056
BIN_SBJP: -0.048
BIN_PRP: -0.048
BIN_PRIV: 0.036
PC4:
BIN_NN: 0.659
BIN_CAP: -0.391
BIN_PRP: -0.260
BIN_SBJP: -0.260
BIN_NNP: 0.247
BIN_RB: -0.236
BIN_ART: 0.141
BIN_FPP1: -0.130
BIN_INF: -0.128
BIN_PREP: -0.127
PC5:
BIN_DET: 0.485
BIN_ART: 0.422
BIN_PIN: -0.421
BIN_PREP: -0.421
BIN_RB: 0.245
BIN_VPRT: 0.196
BIN_INDA: 0.142
BIN_NOMZ: -0.123
BIN_PRP: 0.108
BIN_SBJP: 0.108
PC6:
BIN_NOMZ: 0.368
BIN_NN: -0.345
BIN_DET: 0.344
BIN_RB: -0.339
BIN_ART: 0.326
BIN_JJ: 0.324
BIN_PRP: -0.262
BIN_SBJP: -0.262
BIN_FPP1: -0.144
BIN_INDA: 0.128
PC7:
BIN_JJ: 0.448
BIN_X: -0.439
BIN_QUOT: -0.375
BIN_NOMZ: 0.312
BIN_NN: 0.271
BIN_RB: 0.231
BIN_NUM: -0.179
BIN_VPRT: 0.179
BIN_INF: -0.169
BIN_NNP: 0.164
PC8:
BIN_RB: 0.623
BIN_PRP: -0.415
BIN_SBJP: -0.415
BIN_FPP1: -0.240
BIN_INF: 0.233
BIN_JJ: 0.150
BIN_AUXB: 0.147
BIN_NOMZ: -0.143
BIN_XX0: 0.110
BIN_SPAU: 0.103
PC9:
BIN_INF: 0.712
BIN_VPRT: -0.427
BIN_TO: 0.206
BIN_X: -0.190
BIN_AUXB: -0.179
BIN_NUM: -0.173
BIN_QUOT: -0.161
BIN_NOMZ: 0.159
BIN_CONJ: -0.122
BIN_PRIV: 0.102
PC10:
BIN_QUOT: 0.726
BIN_JJ: 0.496
BIN_CONT: 0.327
BIN_X: -0.170
BIN_NUM: -0.149
BIN_INF: 0.134
BIN_PASS: -0.080
BIN_NOMZ: -0.074
BIN_NN: 0.068
BIN_AUXB: -0.060
PC11:
BIN_X: 0.620
BIN_JJ: 0.575
BIN_NOMZ: -0.292
BIN_QUOT: -0.288
BIN_INF: 0.131
BIN_PRP: 0.125
BIN_SBJP: 0.125
BIN_CONT: -0.123
BIN_RB: -0.092
BIN_FPP1: 0.085
PC12:
BIN_VPRT: 0.529
BIN_AUXB: 0.431
BIN_RB: -0.404
BIN_INF: 0.364
BIN_TO: 0.187
BIN_ART: -0.186
BIN_PASS: 0.183
BIN_VBD: -0.158
BIN_BEMA: 0.128
BIN_DEMP: 0.110
PC13:
BIN_NUM: 0.554
BIN_X: -0.544
BIN_NOMZ: -0.509
BIN_JJ: 0.160
BIN_RB: -0.156
BIN_QUOT: -0.124
BIN_CONT: -0.109
BIN_NN: -0.103
BIN_VPRT: -0.081
BIN_NNP: -0.073
PC14:
BIN_NUM: 0.595
BIN_NOMZ: 0.366
BIN_VPRT: 0.348
BIN_AUXB: -0.332
BIN_VBD: -0.262
BIN_PASS: -0.188
BIN_CONT: 0.161
BIN_INF: 0.157
BIN_PGAS: -0.118
BIN_CONJ: -0.118
PC15:
BIN_AUXB: 0.484
BIN_NUM: 0.450
BIN_NOMZ: 0.315
BIN_VPRT: -0.307
BIN_VBD: 0.262
BIN_PASS: 0.207
BIN_BEMA: 0.194
BIN_CONJ: 0.170
BIN_PRIV: -0.162
BIN_QUOT: 0.159
PC16:
BIN_CONJ: 0.673
BIN_PGAS: -0.355
BIN_CCONJ: 0.324
BIN_SCONJ: -0.247
BIN_TO: -0.197
BIN_VBD: -0.185
BIN_WH: -0.164
BIN_FPP1: -0.128
BIN_PRIV: 0.113
BIN_DEMP: -0.096
PC17:
BIN_CCONJ: 0.471
BIN_CONT: 0.462
BIN_INDA: -0.260
BIN_XX0: 0.221
BIN_SCONJ: -0.216
BIN_CONJ: -0.210
BIN_SPAU: 0.199
BIN_DET: 0.197
BIN_FPP1: 0.196
BIN_QUOT: -0.185
PC18:
BIN_PGAS: 0.578
BIN_CCONJ: 0.564
BIN_CONT: -0.268
BIN_PRIV: -0.235
BIN_ANDC: 0.144
BIN_PASS: -0.143
BIN_QUOT: 0.138
BIN_SPAU: -0.125
BIN_VBD: -0.115
BIN_NOMZ: 0.114
Top 10 PC1 values:
PC1 PC2 ... priority closed_relevance
19873 125.128650 24.461032 ... Medium False
@ -63,4 +261,4 @@ Bottom 10 PC2 values:
[10 rows x 26 columns]
job finished, cleaning up
job pau at: Thu Sep 4 15:42:13 CDT 2025
job pau at: Tue Sep 23 16:37:56 CDT 2025
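
For reference, the "Variance of each PCA component" values above are the raw per-component variances of the projected vectors (np.var over axis 0, as computed in the script further down in this commit). A minimal, self-contained sketch of the same computation on illustrative random data, assuming scikit-learn; it also shows how the corresponding explained-variance quantities could be read off the fitted model:

import numpy as np
from sklearn.decomposition import PCA

# illustrative random binary matrix standing in for the 96 Biber features
rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(500, 96)).astype(float)

pca = PCA(n_components=18)
X_pca = pca.fit_transform(X)

# what the log prints: variance of each projected component (np.var uses ddof=0)
print(np.var(X_pca, axis=0))
# sklearn's own view of the same quantities (ddof=1) and their share of total variance
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)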

View File

@ -73,20 +73,28 @@ def biberplus_labeler(text):
    frequencies_df = frequencies_df.reset_index(drop=True)
    return frequencies_df
def make_text_for_analysis(row):
    if row['comment_type'] == "task_description":
        return f"{row['task_title']}.\n\n{row['comment_text']}"
    else:
        return row['comment_text']
if __name__ == "__main__":
    #loading in the discussion data from the universal CSV
    first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
    #formatting for the neurobiber model
    #docs = first_discussion_df["comment_text"].astype(str).tolist()
    task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
    docs = task_description_df['task_title'].astype(str).tolist()
    first_discussion_df['text_for_analysis'] = first_discussion_df.apply(make_text_for_analysis, axis=1)
    #task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
    docs = first_discussion_df['text_for_analysis'].astype(str).tolist()
    #load model and run
    #model, tokenizer = load_model_and_tokenizer()
    preds_df = biberplus_labeler(docs)
    #new columns in the df for the predicted neurobiber items
    #preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
    #preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
    final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
    final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
    #print(type(preds))
    #assigning the predictions as a new column
    '''
@ -109,6 +117,6 @@ if __name__ == "__main__":
    assert len(task_description_df) == len(final_discussion_df)
    final_discussion_df = final_discussion_df.drop(columns=["message"])
    # if passing the prior asserts, let's write to a csv
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
    final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", index=False)
    print('biberplus labeling pau')

View File

@ -6,6 +6,28 @@ import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# List of the 96 features that Neurobiber can predict
BIBER_FEATURES = [
"BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
"BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
"BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
"BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
"BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
"BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
"BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
"BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
"BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
"BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
"BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
"BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
]
def format_df_data(df):
@ -34,11 +56,18 @@ if __name__ == "__main__":
    '''
    pca = PCA(n_components=18)
    biber_vecs_pca = pca.fit_transform(biber_vecs)
    with open('092325_pca.pkl', 'wb') as f:
        pickle.dump(pca, f)
    selected_axis = "AuthorWMFAffil"
    component_variances = np.var(biber_vecs_pca, axis=0)
    print("Variance of each PCA component:", component_variances)
    for i, component in enumerate(pca.components_):
        print(f"PC{i+1}:")
        indices = np.argsort(np.abs(component))[::-1]
        for idx in indices[:10]: # Top 10
            print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}")
    #first looking at comment_type
    le = LabelEncoder()
@ -55,7 +84,7 @@ if __name__ == "__main__":
    pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
    plot_df = pd.DataFrame(pc_dict)
    plot_df.to_csv("090425_description_PCA_df.csv", index=False)
    plot_df.to_csv("092325_description_PCA_df.csv", index=False)
    print("Top 10 PC1 values:")
    print(plot_df.nlargest(10, "PC1"))
@ -92,5 +121,5 @@ if __name__ == "__main__":
    plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
    '''
    g.fig.tight_layout()
    g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300)
    g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300)
    plt.show()
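
The PCA script now pickles the fitted 18-component model to 092325_pca.pkl, presumably so later runs can reuse the projection instead of refitting it. The reload step is not part of this commit, so the following is only a hedged sketch, assuming the pickle sits in the working directory and that new inputs keep the same 96-feature column order as BIBER_FEATURES:

import pickle
import numpy as np

# load the PCA model saved above (path assumed relative to the job's working directory)
with open('092325_pca.pkl', 'rb') as f:
    pca = pickle.load(f)

# new_vecs: one row of 96 binary Biber features per document, in the column
# order used at fit time; random values here purely for illustration
new_vecs = np.random.randint(0, 2, size=(5, 96))
new_pcs = pca.transform(new_vecs)  # projects onto the 18 fitted components
print(np.var(new_pcs, axis=0))     # per-component variance, analogous to the log output above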