updating library to account for re-running PCA
This commit is contained in:
parent
032975c4f0
commit
265b930578
@ -1,36 +1,3 @@
|
||||
starting the job at: Fri Jul 25 20:48:01 CDT 2025
|
||||
starting the job at: Tue Sep 23 16:37:07 CDT 2025
|
||||
setting up the environment
|
||||
running the biberplus labeling script
|
||||
26024
|
||||
26024
|
||||
id ... http_flag
|
||||
0 56791 ... NaN
|
||||
1 269631 ... NaN
|
||||
2 269628 ... NaN
|
||||
3 269622 ... NaN
|
||||
4 56737 ... NaN
|
||||
... ... ... ...
|
||||
26019 403186 ... True
|
||||
26020 78646 ... True
|
||||
26021 429163 ... True
|
||||
26022 429137 ... True
|
||||
26023 418783 ... True
|
||||
|
||||
[26024 rows x 22 columns]
|
||||
id ... cleaned_messages
|
||||
0 56791 ... pawn character editing\n\nseen on master branc...
|
||||
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
|
||||
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
|
||||
3 269622 ... Change 86685 had a related patch set uploaded ...
|
||||
4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th...
|
||||
... ... ... ...
|
||||
26019 403186 ... Could you attach a screenshot please? Drag & d...
|
||||
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
|
||||
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
|
||||
26022 429137 ... SCREEN_NAME: Please answer.
|
||||
26023 418783 ... I cannot replicate this. What's the name of th...
|
||||
|
||||
[26024 rows x 122 columns]
|
||||
biberplus labeling pau
|
||||
job finished, cleaning up
|
||||
job pau at: Fri Jul 25 20:55:26 CDT 2025
|
||||
|
||||
@ -1,9 +1,207 @@
|
||||
starting the job at: Thu Sep 4 15:41:55 CDT 2025
|
||||
starting the job at: Tue Sep 23 16:37:06 CDT 2025
|
||||
setting up the environment
|
||||
running the neurobiber labeling script
|
||||
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
|
||||
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
|
||||
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
|
||||
PC1:
|
||||
BIN_NNP: 0.760
|
||||
BIN_CAP: 0.524
|
||||
BIN_DET: -0.166
|
||||
BIN_PREP: -0.157
|
||||
BIN_PIN: -0.157
|
||||
BIN_ART: -0.126
|
||||
BIN_NN: -0.119
|
||||
BIN_RB: -0.076
|
||||
BIN_INF: -0.070
|
||||
BIN_VPRT: -0.069
|
||||
PC2:
|
||||
BIN_PREP: 0.473
|
||||
BIN_PIN: 0.473
|
||||
BIN_NNP: 0.426
|
||||
BIN_DET: 0.323
|
||||
BIN_ART: 0.240
|
||||
BIN_NOMZ: -0.233
|
||||
BIN_VPRT: 0.142
|
||||
BIN_RB: 0.132
|
||||
BIN_SBJP: 0.119
|
||||
BIN_PRP: 0.119
|
||||
PC3:
|
||||
BIN_CAP: 0.727
|
||||
BIN_NN: 0.546
|
||||
BIN_NNP: -0.363
|
||||
BIN_PREP: 0.102
|
||||
BIN_PIN: 0.102
|
||||
BIN_DET: 0.058
|
||||
BIN_ART: 0.056
|
||||
BIN_SBJP: -0.048
|
||||
BIN_PRP: -0.048
|
||||
BIN_PRIV: 0.036
|
||||
PC4:
|
||||
BIN_NN: 0.659
|
||||
BIN_CAP: -0.391
|
||||
BIN_PRP: -0.260
|
||||
BIN_SBJP: -0.260
|
||||
BIN_NNP: 0.247
|
||||
BIN_RB: -0.236
|
||||
BIN_ART: 0.141
|
||||
BIN_FPP1: -0.130
|
||||
BIN_INF: -0.128
|
||||
BIN_PREP: -0.127
|
||||
PC5:
|
||||
BIN_DET: 0.485
|
||||
BIN_ART: 0.422
|
||||
BIN_PIN: -0.421
|
||||
BIN_PREP: -0.421
|
||||
BIN_RB: 0.245
|
||||
BIN_VPRT: 0.196
|
||||
BIN_INDA: 0.142
|
||||
BIN_NOMZ: -0.123
|
||||
BIN_PRP: 0.108
|
||||
BIN_SBJP: 0.108
|
||||
PC6:
|
||||
BIN_NOMZ: 0.368
|
||||
BIN_NN: -0.345
|
||||
BIN_DET: 0.344
|
||||
BIN_RB: -0.339
|
||||
BIN_ART: 0.326
|
||||
BIN_JJ: 0.324
|
||||
BIN_PRP: -0.262
|
||||
BIN_SBJP: -0.262
|
||||
BIN_FPP1: -0.144
|
||||
BIN_INDA: 0.128
|
||||
PC7:
|
||||
BIN_JJ: 0.448
|
||||
BIN_X: -0.439
|
||||
BIN_QUOT: -0.375
|
||||
BIN_NOMZ: 0.312
|
||||
BIN_NN: 0.271
|
||||
BIN_RB: 0.231
|
||||
BIN_NUM: -0.179
|
||||
BIN_VPRT: 0.179
|
||||
BIN_INF: -0.169
|
||||
BIN_NNP: 0.164
|
||||
PC8:
|
||||
BIN_RB: 0.623
|
||||
BIN_PRP: -0.415
|
||||
BIN_SBJP: -0.415
|
||||
BIN_FPP1: -0.240
|
||||
BIN_INF: 0.233
|
||||
BIN_JJ: 0.150
|
||||
BIN_AUXB: 0.147
|
||||
BIN_NOMZ: -0.143
|
||||
BIN_XX0: 0.110
|
||||
BIN_SPAU: 0.103
|
||||
PC9:
|
||||
BIN_INF: 0.712
|
||||
BIN_VPRT: -0.427
|
||||
BIN_TO: 0.206
|
||||
BIN_X: -0.190
|
||||
BIN_AUXB: -0.179
|
||||
BIN_NUM: -0.173
|
||||
BIN_QUOT: -0.161
|
||||
BIN_NOMZ: 0.159
|
||||
BIN_CONJ: -0.122
|
||||
BIN_PRIV: 0.102
|
||||
PC10:
|
||||
BIN_QUOT: 0.726
|
||||
BIN_JJ: 0.496
|
||||
BIN_CONT: 0.327
|
||||
BIN_X: -0.170
|
||||
BIN_NUM: -0.149
|
||||
BIN_INF: 0.134
|
||||
BIN_PASS: -0.080
|
||||
BIN_NOMZ: -0.074
|
||||
BIN_NN: 0.068
|
||||
BIN_AUXB: -0.060
|
||||
PC11:
|
||||
BIN_X: 0.620
|
||||
BIN_JJ: 0.575
|
||||
BIN_NOMZ: -0.292
|
||||
BIN_QUOT: -0.288
|
||||
BIN_INF: 0.131
|
||||
BIN_PRP: 0.125
|
||||
BIN_SBJP: 0.125
|
||||
BIN_CONT: -0.123
|
||||
BIN_RB: -0.092
|
||||
BIN_FPP1: 0.085
|
||||
PC12:
|
||||
BIN_VPRT: 0.529
|
||||
BIN_AUXB: 0.431
|
||||
BIN_RB: -0.404
|
||||
BIN_INF: 0.364
|
||||
BIN_TO: 0.187
|
||||
BIN_ART: -0.186
|
||||
BIN_PASS: 0.183
|
||||
BIN_VBD: -0.158
|
||||
BIN_BEMA: 0.128
|
||||
BIN_DEMP: 0.110
|
||||
PC13:
|
||||
BIN_NUM: 0.554
|
||||
BIN_X: -0.544
|
||||
BIN_NOMZ: -0.509
|
||||
BIN_JJ: 0.160
|
||||
BIN_RB: -0.156
|
||||
BIN_QUOT: -0.124
|
||||
BIN_CONT: -0.109
|
||||
BIN_NN: -0.103
|
||||
BIN_VPRT: -0.081
|
||||
BIN_NNP: -0.073
|
||||
PC14:
|
||||
BIN_NUM: 0.595
|
||||
BIN_NOMZ: 0.366
|
||||
BIN_VPRT: 0.348
|
||||
BIN_AUXB: -0.332
|
||||
BIN_VBD: -0.262
|
||||
BIN_PASS: -0.188
|
||||
BIN_CONT: 0.161
|
||||
BIN_INF: 0.157
|
||||
BIN_PGAS: -0.118
|
||||
BIN_CONJ: -0.118
|
||||
PC15:
|
||||
BIN_AUXB: 0.484
|
||||
BIN_NUM: 0.450
|
||||
BIN_NOMZ: 0.315
|
||||
BIN_VPRT: -0.307
|
||||
BIN_VBD: 0.262
|
||||
BIN_PASS: 0.207
|
||||
BIN_BEMA: 0.194
|
||||
BIN_CONJ: 0.170
|
||||
BIN_PRIV: -0.162
|
||||
BIN_QUOT: 0.159
|
||||
PC16:
|
||||
BIN_CONJ: 0.673
|
||||
BIN_PGAS: -0.355
|
||||
BIN_CCONJ: 0.324
|
||||
BIN_SCONJ: -0.247
|
||||
BIN_TO: -0.197
|
||||
BIN_VBD: -0.185
|
||||
BIN_WH: -0.164
|
||||
BIN_FPP1: -0.128
|
||||
BIN_PRIV: 0.113
|
||||
BIN_DEMP: -0.096
|
||||
PC17:
|
||||
BIN_CCONJ: 0.471
|
||||
BIN_CONT: 0.462
|
||||
BIN_INDA: -0.260
|
||||
BIN_XX0: 0.221
|
||||
BIN_SCONJ: -0.216
|
||||
BIN_CONJ: -0.210
|
||||
BIN_SPAU: 0.199
|
||||
BIN_DET: 0.197
|
||||
BIN_FPP1: 0.196
|
||||
BIN_QUOT: -0.185
|
||||
PC18:
|
||||
BIN_PGAS: 0.578
|
||||
BIN_CCONJ: 0.564
|
||||
BIN_CONT: -0.268
|
||||
BIN_PRIV: -0.235
|
||||
BIN_ANDC: 0.144
|
||||
BIN_PASS: -0.143
|
||||
BIN_QUOT: 0.138
|
||||
BIN_SPAU: -0.125
|
||||
BIN_VBD: -0.115
|
||||
BIN_NOMZ: 0.114
|
||||
Top 10 PC1 values:
|
||||
PC1 PC2 ... priority closed_relevance
|
||||
19873 125.128650 24.461032 ... Medium False
|
||||
@ -63,4 +261,4 @@ Bottom 10 PC2 values:
|
||||
|
||||
[10 rows x 26 columns]
|
||||
job finished, cleaning up
|
||||
job pau at: Thu Sep 4 15:42:13 CDT 2025
|
||||
job pau at: Tue Sep 23 16:37:56 CDT 2025
|
||||
|
||||
@ -73,20 +73,28 @@ def biberplus_labeler(text):
|
||||
frequencies_df = frequencies_df.reset_index(drop=True)
|
||||
return frequencies_df
|
||||
|
||||
def make_text_for_analysis(row):
    """Build the text to be labeled for one discussion row.

    Task descriptions are prefixed with their title so that the title is
    analysed together with the body; rows of any other comment type are
    returned unchanged.

    Args:
        row: Mapping-like record (e.g. the row passed by
            ``DataFrame.apply(..., axis=1)``) providing ``comment_type``,
            ``comment_text`` and, for task descriptions, ``task_title``.

    Returns:
        For ``task_description`` rows: the title, a period, a blank line
        and the comment text. Otherwise ``row['comment_text']`` as is.
    """
    if row['comment_type'] == "task_description":
        # Use a real paragraph break here; the previous literal "/n/n"
        # leaked the characters "/n/n" into the analysed text.
        return f"{row['task_title']}.\n\n{row['comment_text']}"
    return row['comment_text']
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
#loading in the discussion data from the universal CSV
|
||||
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||
#formatting for the neurobiber model
|
||||
#docs = first_discussion_df["comment_text"].astype(str).tolist()
|
||||
task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
|
||||
docs = task_description_df['task_title'].astype(str).tolist()
|
||||
first_discussion_df['text_for_analysis'] = first_discussion_df.apply(make_text_for_analysis, axis=1)
|
||||
#task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
|
||||
docs = first_discussion_df['text_for_analysis'].astype(str).tolist()
|
||||
#load model and run
|
||||
#model, tokenizer = load_model_and_tokenizer()
|
||||
preds_df = biberplus_labeler(docs)
|
||||
#new columns in the df for the predicted neurobiber items
|
||||
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
||||
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
||||
final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
|
||||
final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
|
||||
#print(type(preds))
|
||||
#assigning the predictions as a new column
|
||||
'''
|
||||
@ -109,6 +117,6 @@ if __name__ == "__main__":
|
||||
assert len(task_description_df) == len(final_discussion_df)
|
||||
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||
# if passing the prior asserts, let's write to a csv
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
|
||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", index=False)
|
||||
print('biberplus labeling pau')
|
||||
|
||||
|
||||
@ -6,6 +6,28 @@ import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pickle
|
||||
|
||||
# List of the 96 features that Neurobiber can predict
|
||||
BIBER_FEATURES = [
|
||||
"BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
|
||||
"BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
|
||||
"BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
|
||||
"BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
|
||||
"BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
|
||||
"BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
|
||||
"BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
|
||||
"BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
|
||||
"BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
|
||||
"BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
|
||||
"BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
|
||||
"BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
|
||||
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
|
||||
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
|
||||
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
|
||||
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
|
||||
]
|
||||
|
||||
|
||||
|
||||
def format_df_data(df):
|
||||
@ -34,11 +56,18 @@ if __name__ == "__main__":
|
||||
'''
|
||||
pca = PCA(n_components=18)
|
||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||
with open('092325_pca.pkl', 'wb') as f:
|
||||
pickle.dump(pca, f)
|
||||
selected_axis = "AuthorWMFAffil"
|
||||
|
||||
component_variances = np.var(biber_vecs_pca, axis=0)
|
||||
print("Variance of each PCA component:", component_variances)
|
||||
|
||||
for i, component in enumerate(pca.components_):
|
||||
print(f"PC{i+1}:")
|
||||
indices = np.argsort(np.abs(component))[::-1]
|
||||
for idx in indices[:10]: # Top 10
|
||||
print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}")
|
||||
|
||||
#first looking at comment_type
|
||||
le = LabelEncoder()
|
||||
@ -55,7 +84,7 @@ if __name__ == "__main__":
|
||||
pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
|
||||
|
||||
plot_df = pd.DataFrame(pc_dict)
|
||||
plot_df.to_csv("090425_description_PCA_df.csv", index=False)
|
||||
plot_df.to_csv("092325_description_PCA_df.csv", index=False)
|
||||
|
||||
print("Top 10 PC1 values:")
|
||||
print(plot_df.nlargest(10, "PC1"))
|
||||
@ -92,5 +121,5 @@ if __name__ == "__main__":
|
||||
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
||||
'''
|
||||
g.fig.tight_layout()
|
||||
g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300)
|
||||
g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300)
|
||||
plt.show()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user