updating library to account for re-running PCA
This commit is contained in:
parent
032975c4f0
commit
265b930578
@ -1,36 +1,3 @@
|
|||||||
starting the job at: Fri Jul 25 20:48:01 CDT 2025
|
starting the job at: Tue Sep 23 16:37:07 CDT 2025
|
||||||
setting up the environment
|
setting up the environment
|
||||||
running the biberplus labeling script
|
running the biberplus labeling script
|
||||||
26024
|
|
||||||
26024
|
|
||||||
id ... http_flag
|
|
||||||
0 56791 ... NaN
|
|
||||||
1 269631 ... NaN
|
|
||||||
2 269628 ... NaN
|
|
||||||
3 269622 ... NaN
|
|
||||||
4 56737 ... NaN
|
|
||||||
... ... ... ...
|
|
||||||
26019 403186 ... True
|
|
||||||
26020 78646 ... True
|
|
||||||
26021 429163 ... True
|
|
||||||
26022 429137 ... True
|
|
||||||
26023 418783 ... True
|
|
||||||
|
|
||||||
[26024 rows x 22 columns]
|
|
||||||
id ... cleaned_messages
|
|
||||||
0 56791 ... pawn character editing\n\nseen on master branc...
|
|
||||||
1 269631 ... Change 86685 merged by jenkins-bot:\nFollow-up...
|
|
||||||
2 269628 ... *** Bug 54785 has been marked as a duplicate o...
|
|
||||||
3 269622 ... Change 86685 had a related patch set uploaded ...
|
|
||||||
4 56737 ... **Author:** CODE\n\n**Description:**\nAfter th...
|
|
||||||
... ... ... ...
|
|
||||||
26019 403186 ... Could you attach a screenshot please? Drag & d...
|
|
||||||
26020 78646 ... Hi,\n\nWe have a wiki which has a part which c...
|
|
||||||
26021 429163 ... Sorry for not reply-ing. I did a test and coul...
|
|
||||||
26022 429137 ... SCREEN_NAME: Please answer.
|
|
||||||
26023 418783 ... I cannot replicate this. What's the name of th...
|
|
||||||
|
|
||||||
[26024 rows x 122 columns]
|
|
||||||
biberplus labeling pau
|
|
||||||
job finished, cleaning up
|
|
||||||
job pau at: Fri Jul 25 20:55:26 CDT 2025
|
|
||||||
|
|||||||
@ -1,9 +1,207 @@
|
|||||||
starting the job at: Thu Sep 4 15:41:55 CDT 2025
|
starting the job at: Tue Sep 23 16:37:06 CDT 2025
|
||||||
setting up the environment
|
setting up the environment
|
||||||
running the neurobiber labeling script
|
running the neurobiber labeling script
|
||||||
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
|
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
|
||||||
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
|
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
|
||||||
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
|
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
|
||||||
|
PC1:
|
||||||
|
BIN_NNP: 0.760
|
||||||
|
BIN_CAP: 0.524
|
||||||
|
BIN_DET: -0.166
|
||||||
|
BIN_PREP: -0.157
|
||||||
|
BIN_PIN: -0.157
|
||||||
|
BIN_ART: -0.126
|
||||||
|
BIN_NN: -0.119
|
||||||
|
BIN_RB: -0.076
|
||||||
|
BIN_INF: -0.070
|
||||||
|
BIN_VPRT: -0.069
|
||||||
|
PC2:
|
||||||
|
BIN_PREP: 0.473
|
||||||
|
BIN_PIN: 0.473
|
||||||
|
BIN_NNP: 0.426
|
||||||
|
BIN_DET: 0.323
|
||||||
|
BIN_ART: 0.240
|
||||||
|
BIN_NOMZ: -0.233
|
||||||
|
BIN_VPRT: 0.142
|
||||||
|
BIN_RB: 0.132
|
||||||
|
BIN_SBJP: 0.119
|
||||||
|
BIN_PRP: 0.119
|
||||||
|
PC3:
|
||||||
|
BIN_CAP: 0.727
|
||||||
|
BIN_NN: 0.546
|
||||||
|
BIN_NNP: -0.363
|
||||||
|
BIN_PREP: 0.102
|
||||||
|
BIN_PIN: 0.102
|
||||||
|
BIN_DET: 0.058
|
||||||
|
BIN_ART: 0.056
|
||||||
|
BIN_SBJP: -0.048
|
||||||
|
BIN_PRP: -0.048
|
||||||
|
BIN_PRIV: 0.036
|
||||||
|
PC4:
|
||||||
|
BIN_NN: 0.659
|
||||||
|
BIN_CAP: -0.391
|
||||||
|
BIN_PRP: -0.260
|
||||||
|
BIN_SBJP: -0.260
|
||||||
|
BIN_NNP: 0.247
|
||||||
|
BIN_RB: -0.236
|
||||||
|
BIN_ART: 0.141
|
||||||
|
BIN_FPP1: -0.130
|
||||||
|
BIN_INF: -0.128
|
||||||
|
BIN_PREP: -0.127
|
||||||
|
PC5:
|
||||||
|
BIN_DET: 0.485
|
||||||
|
BIN_ART: 0.422
|
||||||
|
BIN_PIN: -0.421
|
||||||
|
BIN_PREP: -0.421
|
||||||
|
BIN_RB: 0.245
|
||||||
|
BIN_VPRT: 0.196
|
||||||
|
BIN_INDA: 0.142
|
||||||
|
BIN_NOMZ: -0.123
|
||||||
|
BIN_PRP: 0.108
|
||||||
|
BIN_SBJP: 0.108
|
||||||
|
PC6:
|
||||||
|
BIN_NOMZ: 0.368
|
||||||
|
BIN_NN: -0.345
|
||||||
|
BIN_DET: 0.344
|
||||||
|
BIN_RB: -0.339
|
||||||
|
BIN_ART: 0.326
|
||||||
|
BIN_JJ: 0.324
|
||||||
|
BIN_PRP: -0.262
|
||||||
|
BIN_SBJP: -0.262
|
||||||
|
BIN_FPP1: -0.144
|
||||||
|
BIN_INDA: 0.128
|
||||||
|
PC7:
|
||||||
|
BIN_JJ: 0.448
|
||||||
|
BIN_X: -0.439
|
||||||
|
BIN_QUOT: -0.375
|
||||||
|
BIN_NOMZ: 0.312
|
||||||
|
BIN_NN: 0.271
|
||||||
|
BIN_RB: 0.231
|
||||||
|
BIN_NUM: -0.179
|
||||||
|
BIN_VPRT: 0.179
|
||||||
|
BIN_INF: -0.169
|
||||||
|
BIN_NNP: 0.164
|
||||||
|
PC8:
|
||||||
|
BIN_RB: 0.623
|
||||||
|
BIN_PRP: -0.415
|
||||||
|
BIN_SBJP: -0.415
|
||||||
|
BIN_FPP1: -0.240
|
||||||
|
BIN_INF: 0.233
|
||||||
|
BIN_JJ: 0.150
|
||||||
|
BIN_AUXB: 0.147
|
||||||
|
BIN_NOMZ: -0.143
|
||||||
|
BIN_XX0: 0.110
|
||||||
|
BIN_SPAU: 0.103
|
||||||
|
PC9:
|
||||||
|
BIN_INF: 0.712
|
||||||
|
BIN_VPRT: -0.427
|
||||||
|
BIN_TO: 0.206
|
||||||
|
BIN_X: -0.190
|
||||||
|
BIN_AUXB: -0.179
|
||||||
|
BIN_NUM: -0.173
|
||||||
|
BIN_QUOT: -0.161
|
||||||
|
BIN_NOMZ: 0.159
|
||||||
|
BIN_CONJ: -0.122
|
||||||
|
BIN_PRIV: 0.102
|
||||||
|
PC10:
|
||||||
|
BIN_QUOT: 0.726
|
||||||
|
BIN_JJ: 0.496
|
||||||
|
BIN_CONT: 0.327
|
||||||
|
BIN_X: -0.170
|
||||||
|
BIN_NUM: -0.149
|
||||||
|
BIN_INF: 0.134
|
||||||
|
BIN_PASS: -0.080
|
||||||
|
BIN_NOMZ: -0.074
|
||||||
|
BIN_NN: 0.068
|
||||||
|
BIN_AUXB: -0.060
|
||||||
|
PC11:
|
||||||
|
BIN_X: 0.620
|
||||||
|
BIN_JJ: 0.575
|
||||||
|
BIN_NOMZ: -0.292
|
||||||
|
BIN_QUOT: -0.288
|
||||||
|
BIN_INF: 0.131
|
||||||
|
BIN_PRP: 0.125
|
||||||
|
BIN_SBJP: 0.125
|
||||||
|
BIN_CONT: -0.123
|
||||||
|
BIN_RB: -0.092
|
||||||
|
BIN_FPP1: 0.085
|
||||||
|
PC12:
|
||||||
|
BIN_VPRT: 0.529
|
||||||
|
BIN_AUXB: 0.431
|
||||||
|
BIN_RB: -0.404
|
||||||
|
BIN_INF: 0.364
|
||||||
|
BIN_TO: 0.187
|
||||||
|
BIN_ART: -0.186
|
||||||
|
BIN_PASS: 0.183
|
||||||
|
BIN_VBD: -0.158
|
||||||
|
BIN_BEMA: 0.128
|
||||||
|
BIN_DEMP: 0.110
|
||||||
|
PC13:
|
||||||
|
BIN_NUM: 0.554
|
||||||
|
BIN_X: -0.544
|
||||||
|
BIN_NOMZ: -0.509
|
||||||
|
BIN_JJ: 0.160
|
||||||
|
BIN_RB: -0.156
|
||||||
|
BIN_QUOT: -0.124
|
||||||
|
BIN_CONT: -0.109
|
||||||
|
BIN_NN: -0.103
|
||||||
|
BIN_VPRT: -0.081
|
||||||
|
BIN_NNP: -0.073
|
||||||
|
PC14:
|
||||||
|
BIN_NUM: 0.595
|
||||||
|
BIN_NOMZ: 0.366
|
||||||
|
BIN_VPRT: 0.348
|
||||||
|
BIN_AUXB: -0.332
|
||||||
|
BIN_VBD: -0.262
|
||||||
|
BIN_PASS: -0.188
|
||||||
|
BIN_CONT: 0.161
|
||||||
|
BIN_INF: 0.157
|
||||||
|
BIN_PGAS: -0.118
|
||||||
|
BIN_CONJ: -0.118
|
||||||
|
PC15:
|
||||||
|
BIN_AUXB: 0.484
|
||||||
|
BIN_NUM: 0.450
|
||||||
|
BIN_NOMZ: 0.315
|
||||||
|
BIN_VPRT: -0.307
|
||||||
|
BIN_VBD: 0.262
|
||||||
|
BIN_PASS: 0.207
|
||||||
|
BIN_BEMA: 0.194
|
||||||
|
BIN_CONJ: 0.170
|
||||||
|
BIN_PRIV: -0.162
|
||||||
|
BIN_QUOT: 0.159
|
||||||
|
PC16:
|
||||||
|
BIN_CONJ: 0.673
|
||||||
|
BIN_PGAS: -0.355
|
||||||
|
BIN_CCONJ: 0.324
|
||||||
|
BIN_SCONJ: -0.247
|
||||||
|
BIN_TO: -0.197
|
||||||
|
BIN_VBD: -0.185
|
||||||
|
BIN_WH: -0.164
|
||||||
|
BIN_FPP1: -0.128
|
||||||
|
BIN_PRIV: 0.113
|
||||||
|
BIN_DEMP: -0.096
|
||||||
|
PC17:
|
||||||
|
BIN_CCONJ: 0.471
|
||||||
|
BIN_CONT: 0.462
|
||||||
|
BIN_INDA: -0.260
|
||||||
|
BIN_XX0: 0.221
|
||||||
|
BIN_SCONJ: -0.216
|
||||||
|
BIN_CONJ: -0.210
|
||||||
|
BIN_SPAU: 0.199
|
||||||
|
BIN_DET: 0.197
|
||||||
|
BIN_FPP1: 0.196
|
||||||
|
BIN_QUOT: -0.185
|
||||||
|
PC18:
|
||||||
|
BIN_PGAS: 0.578
|
||||||
|
BIN_CCONJ: 0.564
|
||||||
|
BIN_CONT: -0.268
|
||||||
|
BIN_PRIV: -0.235
|
||||||
|
BIN_ANDC: 0.144
|
||||||
|
BIN_PASS: -0.143
|
||||||
|
BIN_QUOT: 0.138
|
||||||
|
BIN_SPAU: -0.125
|
||||||
|
BIN_VBD: -0.115
|
||||||
|
BIN_NOMZ: 0.114
|
||||||
Top 10 PC1 values:
|
Top 10 PC1 values:
|
||||||
PC1 PC2 ... priority closed_relevance
|
PC1 PC2 ... priority closed_relevance
|
||||||
19873 125.128650 24.461032 ... Medium False
|
19873 125.128650 24.461032 ... Medium False
|
||||||
@ -63,4 +261,4 @@ Bottom 10 PC2 values:
|
|||||||
|
|
||||||
[10 rows x 26 columns]
|
[10 rows x 26 columns]
|
||||||
job finished, cleaning up
|
job finished, cleaning up
|
||||||
job pau at: Thu Sep 4 15:42:13 CDT 2025
|
job pau at: Tue Sep 23 16:37:56 CDT 2025
|
||||||
|
|||||||
@ -73,20 +73,28 @@ def biberplus_labeler(text):
|
|||||||
frequencies_df = frequencies_df.reset_index(drop=True)
|
frequencies_df = frequencies_df.reset_index(drop=True)
|
||||||
return frequencies_df
|
return frequencies_df
|
||||||
|
|
||||||
|
def make_text_for_analysis(row):
    """Build the text that is passed to the labeler for one discussion row.

    Args:
        row: A mapping-like record (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) exposing ``comment_type`` and
            ``comment_text``, plus ``task_title`` for task descriptions.

    Returns:
        For rows whose ``comment_type`` is ``"task_description"``, the task
        title followed by a period, a blank line, and the comment text.
        For every other row, the comment text unchanged.
    """
    if row['comment_type'] == "task_description":
        # Use real newline escapes here; the previous "/n/n" inserted the
        # literal characters "/n/n" between the title and the description.
        return f"{row['task_title']}.\n\n{row['comment_text']}"
    return row['comment_text']
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
#loading in the discussion data from the universal CSV
|
#loading in the discussion data from the universal CSV
|
||||||
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
first_discussion_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/071425_master_discussion_data.csv")
|
||||||
#formatting for the neurobiber model
|
#formatting for the neurobiber model
|
||||||
#docs = first_discussion_df["comment_text"].astype(str).tolist()
|
#docs = first_discussion_df["comment_text"].astype(str).tolist()
|
||||||
task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
|
first_discussion_df['text_for_analysis'] = first_discussion_df.apply(make_text_for_analysis, axis=1)
|
||||||
docs = task_description_df['task_title'].astype(str).tolist()
|
#task_description_df = first_discussion_df[first_discussion_df['comment_type'] == "task_description"]
|
||||||
|
docs = first_discussion_df['text_for_analysis'].astype(str).tolist()
|
||||||
#load model and run
|
#load model and run
|
||||||
#model, tokenizer = load_model_and_tokenizer()
|
#model, tokenizer = load_model_and_tokenizer()
|
||||||
preds_df = biberplus_labeler(docs)
|
preds_df = biberplus_labeler(docs)
|
||||||
#new columns in the df for the predicted neurobiber items
|
#new columns in the df for the predicted neurobiber items
|
||||||
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
#preds_cols = [f"neurobiber_{i+1}" for i in range(96)]
|
||||||
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
#preds_df = pd.DataFrame(preds, columns=preds_cols, index=first_discussion_df.index)
|
||||||
final_discussion_df = pd.concat([task_description_df, preds_df], axis=1)
|
final_discussion_df = pd.concat([first_discussion_df, preds_df], axis=1)
|
||||||
#print(type(preds))
|
#print(type(preds))
|
||||||
#assigning the predictions as a new column
|
#assigning the predictions as a new column
|
||||||
'''
|
'''
|
||||||
@ -109,6 +117,6 @@ if __name__ == "__main__":
|
|||||||
assert len(task_description_df) == len(final_discussion_df)
|
assert len(task_description_df) == len(final_discussion_df)
|
||||||
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
final_discussion_df = final_discussion_df.drop(columns=["message"])
|
||||||
# if passing the prior asserts, let's write to a csv
|
# if passing the prior asserts, let's write to a csv
|
||||||
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/090725_biberplus_title_labels.csv", index=False)
|
final_discussion_df.to_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", index=False)
|
||||||
print('biberplus labeling pau')
|
print('biberplus labeling pau')
|
||||||
|
|
||||||
|
|||||||
@ -6,6 +6,28 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
# List of the 96 features that Neurobiber can predict
|
||||||
|
BIBER_FEATURES = [
|
||||||
|
"BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
|
||||||
|
"BIN_BEMA","BIN_CAUS","BIN_CONC","BIN_COND","BIN_CONJ","BIN_CONT",
|
||||||
|
"BIN_DPAR","BIN_DWNT","BIN_EX","BIN_FPP1","BIN_GER","BIN_RB",
|
||||||
|
"BIN_PIN","BIN_INPR","BIN_TO","BIN_NEMD","BIN_OSUB","BIN_PASTP",
|
||||||
|
"BIN_VBD","BIN_PHC","BIN_PIRE","BIN_PLACE","BIN_POMD","BIN_PRMD",
|
||||||
|
"BIN_WZPRES","BIN_VPRT","BIN_PRIV","BIN_PIT","BIN_PUBV","BIN_SPP2",
|
||||||
|
"BIN_SMP","BIN_SERE","BIN_STPR","BIN_SUAV","BIN_SYNE","BIN_TPP3",
|
||||||
|
"BIN_TIME","BIN_NOMZ","BIN_BYPA","BIN_PRED","BIN_TOBJ","BIN_TSUB",
|
||||||
|
"BIN_THVC","BIN_NN","BIN_DEMP","BIN_DEMO","BIN_WHQU","BIN_EMPH",
|
||||||
|
"BIN_HDG","BIN_WZPAST","BIN_THAC","BIN_PEAS","BIN_ANDC","BIN_PRESP",
|
||||||
|
"BIN_PROD","BIN_SPAU","BIN_SPIN","BIN_THATD","BIN_WHOBJ","BIN_WHSUB",
|
||||||
|
"BIN_WHCL","BIN_ART","BIN_AUXB","BIN_CAP","BIN_SCONJ","BIN_CCONJ",
|
||||||
|
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
|
||||||
|
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
|
||||||
|
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
|
||||||
|
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def format_df_data(df):
|
def format_df_data(df):
|
||||||
@ -34,11 +56,18 @@ if __name__ == "__main__":
|
|||||||
'''
|
'''
|
||||||
pca = PCA(n_components=18)
|
pca = PCA(n_components=18)
|
||||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||||
|
with open('092325_pca.pkl', 'wb') as f:
|
||||||
|
pickle.dump(pca, f)
|
||||||
selected_axis = "AuthorWMFAffil"
|
selected_axis = "AuthorWMFAffil"
|
||||||
|
|
||||||
component_variances = np.var(biber_vecs_pca, axis=0)
|
component_variances = np.var(biber_vecs_pca, axis=0)
|
||||||
print("Variance of each PCA component:", component_variances)
|
print("Variance of each PCA component:", component_variances)
|
||||||
|
|
||||||
|
for i, component in enumerate(pca.components_):
|
||||||
|
print(f"PC{i+1}:")
|
||||||
|
indices = np.argsort(np.abs(component))[::-1]
|
||||||
|
for idx in indices[:10]: # Top 10
|
||||||
|
print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}")
|
||||||
|
|
||||||
#first looking at comment_type
|
#first looking at comment_type
|
||||||
le = LabelEncoder()
|
le = LabelEncoder()
|
||||||
@ -55,7 +84,7 @@ if __name__ == "__main__":
|
|||||||
pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
|
pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
|
||||||
|
|
||||||
plot_df = pd.DataFrame(pc_dict)
|
plot_df = pd.DataFrame(pc_dict)
|
||||||
plot_df.to_csv("090425_description_PCA_df.csv", index=False)
|
plot_df.to_csv("092325_description_PCA_df.csv", index=False)
|
||||||
|
|
||||||
print("Top 10 PC1 values:")
|
print("Top 10 PC1 values:")
|
||||||
print(plot_df.nlargest(10, "PC1"))
|
print(plot_df.nlargest(10, "PC1"))
|
||||||
@ -92,5 +121,5 @@ if __name__ == "__main__":
|
|||||||
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
||||||
'''
|
'''
|
||||||
g.fig.tight_layout()
|
g.fig.tight_layout()
|
||||||
g.savefig(f"description_{selected_axis}_090425_biber_pca_final.png", dpi=300)
|
g.savefig(f"description_{selected_axis}_092325_biber_pca_final.png", dpi=300)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user