1
0

updating neurobiber PCA values

This commit is contained in:
mgaughan 2025-12-16 11:14:04 -06:00
parent 0ded278e9e
commit 30a828fc56
5 changed files with 136228 additions and 18 deletions

View File

@ -0,0 +1,421 @@
starting the job at: Tue Dec 16 10:14:44 CST 2025
setting up the environment
running the neurobiber labeling script
0 [The #Cloud-Services project tag is not intend...
1 [Token is used for 2-factor auth., I'm surpris...
2 [Oh, of course it's visible since you /might/ ...
3 [Can this be closed?, We can now use the proxy...
4 [Can this be closed?, We can now use the proxy...
...
26018 [LocalSettings.php lacks wgSecureLogin, wgCook...
26019 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
26020 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
26021 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
26022 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
Name: olmo_cleaned_sentences, Length: 22172, dtype: object
[[11. ]
[15.5]
[10.5]
...
[ 5. ]
[ 5. ]
[ 5. ]]
Number of PCs explaining 90% variance: 25
Variance of each PCA component: [226.26016489 149.72080608 76.13824024 65.84659233 60.86229348
35.83902424 32.60176441 25.96606033 21.16336438 18.72865599
16.54468721 14.84254647 11.35629294 10.92463002 9.32732926
9.00220126 8.32992311 8.16541285 7.1981045 7.03685064
6.65889481 6.46214766 6.23468215 5.81151603 5.51271131]
PC1:
normalized_CAP: 0.605
normalized_NNP: 0.559
median_sentence_length: -0.491
normalized_DET: -0.122
normalized_PREP: -0.112
normalized_PIN: -0.112
normalized_NN: -0.088
normalized_ART: -0.088
normalized_VPRT: -0.055
normalized_JJ: -0.054
PC2:
median_sentence_length: 0.850
normalized_NNP: 0.448
normalized_CAP: 0.194
normalized_DET: -0.086
normalized_NN: -0.075
normalized_VPRT: -0.061
normalized_PREP: -0.055
normalized_PIN: -0.055
normalized_RB: -0.052
normalized_AUXB: -0.049
PC3:
normalized_NN: 0.674
normalized_CAP: 0.477
normalized_NNP: -0.318
normalized_PREP: 0.191
normalized_PIN: 0.191
normalized_RB: -0.178
normalized_PRP: -0.156
normalized_SBJP: -0.156
normalized_VPRT: -0.104
median_sentence_length: 0.103
PC4:
normalized_PREP: 0.452
normalized_PIN: 0.452
normalized_CAP: 0.384
normalized_NN: -0.367
normalized_PRP: 0.259
normalized_SBJP: 0.259
normalized_RB: 0.193
normalized_INF: 0.174
normalized_FPP1: 0.142
normalized_NNP: -0.129
PC5:
normalized_PREP: 0.453
normalized_PIN: 0.453
normalized_NNP: 0.452
normalized_CAP: -0.397
normalized_RB: -0.299
normalized_SBJP: -0.162
normalized_PRP: -0.162
median_sentence_length: -0.123
normalized_CONJ: 0.115
normalized_VPRT: -0.105
PC6:
normalized_DET: 0.679
normalized_ART: 0.404
normalized_VPRT: 0.301
normalized_AUXB: 0.246
normalized_NNP: 0.194
normalized_INDA: 0.178
normalized_NUM: -0.144
normalized_DEMP: 0.123
normalized_NN: 0.119
normalized_BEMA: 0.114
PC7:
normalized_PRP: 0.483
normalized_SBJP: 0.482
normalized_NN: 0.463
normalized_FPP1: 0.253
normalized_NNP: 0.236
normalized_RB: -0.191
normalized_AUXB: -0.168
normalized_PASS: -0.143
normalized_CAP: -0.137
normalized_PIT: 0.128
PC8:
normalized_RB: 0.807
normalized_NN: 0.341
normalized_NNP: 0.213
normalized_NUM: -0.155
normalized_CAP: -0.131
normalized_DET: -0.127
normalized_TIME: 0.121
normalized_XX0: 0.112
normalized_SPAU: 0.102
normalized_PRP: -0.098
PC9:
normalized_JJ: 0.564
normalized_INF: 0.376
normalized_VPRT: -0.317
sentence_count: 0.292
normalized_NUM: -0.259
normalized_ART: 0.235
normalized_PASS: -0.217
normalized_AUXB: -0.216
normalized_CONJ: -0.141
normalized_DET: 0.134
PC10:
normalized_JJ: 0.598
normalized_INF: -0.518
normalized_VPRT: 0.391
normalized_DET: -0.256
normalized_ART: -0.139
normalized_NUM: -0.131
normalized_AUXB: 0.125
sentence_count: 0.108
normalized_BEMA: 0.099
normalized_TO: -0.092
PC11:
sentence_count: 0.551
normalized_INF: -0.489
normalized_AUXB: -0.303
normalized_JJ: -0.264
normalized_VPRT: -0.214
normalized_ART: 0.198
normalized_RB: 0.176
normalized_TO: -0.157
normalized_PASS: -0.141
normalized_DET: 0.126
PC12:
sentence_count: 0.703
normalized_VPRT: 0.320
normalized_INF: 0.281
normalized_JJ: -0.256
normalized_AUXB: 0.242
normalized_VBD: -0.229
normalized_PASS: 0.141
normalized_PUBV: -0.125
normalized_ART: -0.120
normalized_DET: -0.116
PC13:
normalized_NUM: 0.616
normalized_VBD: -0.462
normalized_AUXB: -0.281
normalized_VPRT: 0.262
normalized_PASS: -0.231
normalized_ART: 0.158
normalized_CONJ: 0.156
normalized_INDA: 0.134
normalized_TIME: -0.129
normalized_DEMP: -0.124
PC14:
normalized_NUM: 0.422
normalized_QUOT: -0.408
normalized_VBD: 0.306
normalized_CONT: -0.291
normalized_JJ: 0.256
normalized_AUXB: 0.255
sentence_count: 0.232
normalized_VPRT: -0.223
normalized_PUBV: 0.201
normalized_UH: -0.174
PC15:
normalized_QUOT: 0.480
normalized_UH: -0.452
normalized_CONT: 0.362
normalized_VBD: 0.354
normalized_PUBV: 0.349
normalized_NUM: 0.225
normalized_CONJ: -0.169
normalized_STPR: 0.106
normalized_VPRT: 0.104
normalized_TO: 0.090
PC16:
normalized_PUBV: 0.449
normalized_CONJ: -0.400
normalized_QUOT: -0.371
normalized_CONT: -0.298
normalized_PGAS: 0.286
normalized_VPRT: 0.246
normalized_NOMZ: 0.230
normalized_PASS: -0.217
normalized_AUXB: -0.161
normalized_JJ: -0.125
PC17:
normalized_UH: 0.749
normalized_PGAS: -0.343
normalized_PUBV: 0.277
normalized_VBD: 0.195
normalized_NOMZ: -0.176
normalized_CCONJ: -0.136
normalized_VPRT: 0.129
normalized_X: -0.128
normalized_QUOT: 0.110
normalized_TIME: -0.106
PC18:
normalized_CONJ: 0.558
normalized_PUBV: 0.445
normalized_NUM: -0.342
normalized_ART: 0.236
normalized_NOMZ: 0.218
normalized_DEMP: -0.172
normalized_DET: -0.159
normalized_UH: -0.157
normalized_VPRT: 0.157
normalized_PGAS: -0.141
PC19:
normalized_ART: 0.438
normalized_CONJ: -0.353
normalized_DEMO: -0.299
normalized_DET: -0.284
normalized_AUXB: 0.267
normalized_PIT: 0.244
normalized_DEMP: -0.238
normalized_INDA: 0.228
normalized_FPP1: -0.227
normalized_PUBV: -0.161
PC20:
normalized_PGAS: 0.777
normalized_X: -0.317
normalized_UH: 0.244
normalized_AUXB: 0.184
normalized_CONJ: 0.178
normalized_NUM: 0.144
normalized_QUES: -0.140
normalized_CONT: 0.137
normalized_PUBV: 0.095
normalized_QUOT: 0.095
PC21:
normalized_CCONJ: 0.582
normalized_QUES: -0.386
normalized_X: -0.268
normalized_PRIV: 0.217
normalized_BEMA: -0.203
normalized_AUXB: -0.197
normalized_VPRT: 0.191
normalized_TIME: 0.173
normalized_FPP1: 0.170
normalized_CONJ: -0.141
PC22:
normalized_X: 0.639
normalized_PRIV: 0.451
normalized_QUES: -0.272
normalized_CCONJ: -0.216
normalized_VBD: 0.215
normalized_CONJ: 0.196
normalized_DEMO: -0.171
normalized_FPP1: 0.150
normalized_PGAS: 0.117
normalized_SPP2: -0.101
PC23:
normalized_NOMZ: 0.580
normalized_X: -0.394
normalized_CCONJ: -0.381
normalized_PRIV: 0.340
normalized_PUBV: -0.191
normalized_PASS: 0.140
normalized_FPP1: 0.130
normalized_PGAS: -0.127
normalized_TIME: -0.124
normalized_TO: -0.122
PC24:
normalized_QUES: 0.424
normalized_CCONJ: 0.408
normalized_CONJ: 0.286
normalized_PRIV: 0.278
normalized_VBD: 0.247
normalized_WH: 0.231
normalized_PUBV: -0.229
normalized_PASS: -0.203
normalized_SCONJ: 0.193
normalized_TIME: -0.161
PC25:
normalized_NOMZ: 0.451
normalized_DEMP: 0.404
normalized_DEMO: -0.354
normalized_PRIV: -0.226
normalized_VBD: -0.219
normalized_SPAU: 0.219
normalized_INDA: -0.201
normalized_UH: 0.169
normalized_PEAS: -0.167
normalized_NUM: 0.153
Top 10 PC1 values:
PC1 PC2 ... date_created comment_type
2575 117.753138 51.176752 ... 1424754141 task_subcomment
998 117.750117 51.170922 ... 1354316739 task_subcomment
1011 117.750117 51.170922 ... 1359160095 task_subcomment
1014 117.750117 51.170922 ... 1362102239 task_subcomment
1016 117.750117 51.170922 ... 1362441994 task_subcomment
1559 117.750117 51.170922 ... 1362478487 task_subcomment
3911 117.750117 51.170922 ... 1344625237 task_subcomment
3917 117.750117 51.170922 ... 1345813989 task_subcomment
3923 117.750117 51.170922 ... 1348771229 task_subcomment
3933 117.750117 51.170922 ... 1349619536 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC1 values:
PC1 PC2 ... date_created comment_type
25853 -263.814907 444.911878 ... 1350678600 task_description
4664 -227.278888 401.944364 ... 1463441072 task_subcomment
4663 -227.088961 402.095691 ... 1463441050 task_subcomment
18937 -147.873909 250.705670 ... 1380947348 task_subcomment
18938 -147.873909 250.705670 ... 1380947348 task_subcomment
11819 -146.279577 271.844540 ... 1354470131 task_subcomment
7137 -111.434239 178.022308 ... 1374730027 task_subcomment
14021 -105.059698 195.829065 ... 1440633395 task_subcomment
712 -102.589613 179.044415 ... 1379611711 task_subcomment
4661 -97.861683 158.793070 ... 1463439992 task_subcomment
[10 rows x 36 columns]
Top 10 PC2 values:
PC1 PC2 ... date_created comment_type
25853 -263.814907 444.911878 ... 1350678600 task_description
4663 -227.088961 402.095691 ... 1463441050 task_subcomment
4664 -227.278888 401.944364 ... 1463441072 task_subcomment
11819 -146.279577 271.844540 ... 1354470131 task_subcomment
18937 -147.873909 250.705670 ... 1380947348 task_subcomment
18938 -147.873909 250.705670 ... 1380947348 task_subcomment
14021 -105.059698 195.829065 ... 1440633395 task_subcomment
712 -102.589613 179.044415 ... 1379611711 task_subcomment
7137 -111.434239 178.022308 ... 1374730027 task_subcomment
47 -93.548429 177.346008 ... 1441031208 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC2 values:
PC1 PC2 ... date_created comment_type
12960 -7.559239 -20.524622 ... 1422554389 task_subcomment
23743 -7.559239 -20.524622 ... 1462375135 task_subcomment
20335 -7.550174 -20.507132 ... 1317586881 task_subcomment
25842 -7.547152 -20.501302 ... 1327978205 task_subcomment
19517 -10.124263 -19.546380 ... 1377104818 task_subcomment
1394 -4.163268 -19.107198 ... 1321220595 task_subcomment
1154 -5.357197 -18.669861 ... 1428954897 task_subcomment
616 -14.810933 -18.487368 ... 1384635692 task_subcomment
617 -14.810933 -18.487368 ... 1384635692 task_subcomment
15258 -5.606616 -17.903726 ... 1412324629 task_subcomment
[10 rows x 36 columns]
Top 10 PC3 values:
PC1 PC2 ... date_created comment_type
1718 53.006784 -1.020596 ... 1375331403 task_subcomment
3746 52.952393 -1.125535 ... 1456539439 task_subcomment
16384 52.952393 -1.125535 ... 1384994015 task_subcomment
13327 52.949371 -1.131365 ... 1442087854 task_subcomment
11596 22.205239 -9.975087 ... 1377544788 task_subcomment
11597 22.205239 -9.975087 ... 1377544792 task_subcomment
11599 22.205239 -9.975087 ... 1377545735 task_subcomment
12417 56.876889 3.447518 ... 1350946140 task_subcomment
13828 35.649201 -3.312173 ... 1434130529 task_subcomment
3665 14.096589 -10.651178 ... 1374012685 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC3 values:
PC1 PC2 ... date_created comment_type
106 57.740232 30.913077 ... 1453561068 task_subcomment
107 57.740232 30.913077 ... 1453561129 task_subcomment
172 57.740232 30.913077 ... 1420466644 task_subcomment
173 57.740232 30.913077 ... 1420473867 task_subcomment
524 57.740232 30.913077 ... 1491557250 task_subcomment
525 57.740232 30.913077 ... 1491557269 task_subcomment
1253 57.740232 30.913077 ... 1601272820 task_subcomment
1276 57.740232 30.913077 ... 1431790268 task_subcomment
1277 57.740232 30.913077 ... 1431790446 task_subcomment
1328 57.740232 30.913077 ... 1624208185 task_subcomment
[10 rows x 36 columns]
Top 10 PC4 values:
PC1 PC2 ... date_created comment_type
14805 78.055845 24.070094 ... 1676690655 task_subcomment
1160 30.338430 -4.905832 ... 1430255616 task_subcomment
14198 31.404810 -2.952579 ... 1399747303 task_subcomment
1571 14.767571 -9.509647 ... 1424206043 task_subcomment
16080 50.035111 11.152005 ... 1355446597 task_subcomment
14116 32.879352 -2.867424 ... 1354149956 task_subcomment
616 -14.810933 -18.487368 ... 1384635692 task_subcomment
617 -14.810933 -18.487368 ... 1384635692 task_subcomment
21375 16.306772 -8.785275 ... 1361832639 task_subcomment
13808 16.304506 -8.789647 ... 1438011707 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC4 values:
PC1 PC2 ... date_created comment_type
12960 -7.559239 -20.524622 ... 1422554389 task_subcomment
23743 -7.559239 -20.524622 ... 1462375135 task_subcomment
20335 -7.550174 -20.507132 ... 1317586881 task_subcomment
25842 -7.547152 -20.501302 ... 1327978205 task_subcomment
1154 -5.357197 -18.669861 ... 1428954897 task_subcomment
1399 -5.341802 -14.960571 ... 1328300138 task_subcomment
15258 -5.606616 -17.903726 ... 1412324629 task_subcomment
22066 8.796176 -12.108964 ... 1372042733 task_subcomment
15293 1.994613 -10.713217 ... 1434020520 task_subcomment
11596 22.205239 -9.975087 ... 1377544788 task_subcomment
[10 rows x 36 columns]
job finished, cleaning up
job pau at: Tue Dec 16 10:15:20 CST 2025

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -85,10 +85,10 @@ def format_df_data(df):
return x
if __name__ == "__main__":
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", low_memory=False)
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/120725_unified.csv", low_memory=False)
#biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
#biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
#biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
biber_vecs = format_df_data(biber_vec_df)
#handoff to PCA model
@ -103,7 +103,7 @@ if __name__ == "__main__":
pca = PCA(n_components=argmax_components)
biber_vecs_pca = pca.fit_transform(biber_vecs)
with open('102025_total_pca.pkl', 'wb') as f:
with open('121525_total_pca.pkl', 'wb') as f:
pickle.dump(pca, f)
selected_axis = "AuthorWMFAffil"
@ -136,7 +136,7 @@ if __name__ == "__main__":
plot_df = pd.DataFrame(pc_dict)
plot_df.to_csv("102025_total_pca_df.csv", index=False)
plot_df.to_csv("121525_total_pca_df.csv", index=False)
print("Top 10 PC1 values:")
print(plot_df.nlargest(10, "PC1"))
@ -148,17 +148,13 @@ if __name__ == "__main__":
print("\nBottom 10 PC2 values:")
print(plot_df.nsmallest(10, "PC2"))
print("Top 10 PC3 values:")
print(plot_df.nlargest(10, "PC3"))
print("\nBottom 10 PC3 values:")
print(plot_df.nsmallest(10, "PC3"))
#g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
#g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
#g.add_legend(title=selected_axis)
#g.set_axis_labels("PC1", "PC2")
#g.fig.subplots_adjust(top=0.9)
#g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")
print("Top 10 PC4 values:")
print(plot_df.nlargest(10, "PC4"))
print("\nBottom 10 PC4 values:")
print(plot_df.nsmallest(10, "PC4"))
#plt.savefig("090225_biber_pca_plot.png", dpi=300)
'''
#g.fig.tight_layout()
#g.savefig(f"subcomment_{selected_axis}_100125_biber_pca_final.png", dpi=300)
#plt.show()
'''

View File

@ -8,7 +8,7 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-pca
#SBATCH --output=102025_total_neurobiber-pca.log
#SBATCH --output=121525_total_neurobiber-pca.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu