1
0

updating with new PCA run

This commit is contained in:
mgaughan 2025-12-16 15:35:27 -06:00
parent e0cb055ff7
commit df1dcf1224
8 changed files with 131072 additions and 4 deletions

View File

@ -0,0 +1,421 @@
starting the job at: Tue Dec 16 15:26:31 CST 2025
setting up the environment
running the neurobiber labeling script
0 [The #Cloud-Services project tag is not intend...
1 [Token is used for 2-factor auth., I'm surpris...
2 [Oh, of course it's visible since you /might/ ...
3 [Can this be closed?, We can now use the proxy...
4 [I just now tried creating a new instance, and...
...
25022 [I think this should be low priority., Only po...
25023 [Go to some long article, scroll all the way d...
25024 [In Microsoft Word, both character-level styli...
25025 [LocalSettings.php lacks wgSecureLogin, wgCook...
25026 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
Name: olmo_cleaned_sentences, Length: 21302, dtype: object
[[11. ]
[15.5]
[10.5]
...
[26. ]
[18. ]
[ 5. ]]
Number of PCs explaining 90% variance: 25
Variance of each PCA component: [227.24126404 147.97706893 75.8294832 65.33178909 59.86158651
35.58328853 32.35739864 25.76291291 21.24897975 18.6584478
16.64923169 15.07461057 11.37754832 10.83761848 9.1933966
8.88840899 8.1956979 8.05178179 7.19167725 6.9942501
6.65183757 6.39961806 6.22347534 5.83078813 5.49524439]
PC1:
normalized_CAP: 0.613
normalized_NNP: 0.566
median_sentence_length: -0.473
normalized_DET: -0.123
normalized_PIN: -0.111
normalized_PREP: -0.111
normalized_ART: -0.088
normalized_NN: -0.088
normalized_VPRT: -0.056
normalized_JJ: -0.055
PC2:
median_sentence_length: 0.860
normalized_NNP: 0.436
normalized_CAP: 0.184
normalized_DET: -0.083
normalized_NN: -0.075
normalized_VPRT: -0.058
normalized_RB: -0.050
normalized_PIN: -0.049
normalized_PREP: -0.049
normalized_AUXB: -0.048
PC3:
normalized_NN: 0.692
normalized_CAP: 0.451
normalized_NNP: -0.303
normalized_RB: -0.190
normalized_PREP: 0.184
normalized_PIN: 0.184
normalized_PRP: -0.165
normalized_SBJP: -0.165
normalized_VPRT: -0.107
median_sentence_length: 0.100
PC4:
normalized_PREP: 0.484
normalized_PIN: 0.484
normalized_CAP: 0.373
normalized_NN: -0.340
normalized_PRP: 0.244
normalized_SBJP: 0.244
normalized_RB: 0.169
normalized_INF: 0.168
normalized_FPP1: 0.135
normalized_NNP: -0.118
PC5:
normalized_NNP: 0.463
normalized_CAP: -0.426
normalized_PIN: 0.421
normalized_PREP: 0.421
normalized_RB: -0.305
normalized_SBJP: -0.178
normalized_PRP: -0.178
median_sentence_length: -0.129
normalized_CONJ: 0.112
normalized_VPRT: -0.108
PC6:
normalized_DET: 0.682
normalized_ART: 0.406
normalized_VPRT: 0.300
normalized_AUXB: 0.244
normalized_NNP: 0.191
normalized_INDA: 0.178
normalized_NUM: -0.142
normalized_DEMP: 0.124
normalized_NN: 0.114
normalized_BEMA: 0.112
PC7:
normalized_PRP: 0.482
normalized_SBJP: 0.482
normalized_NN: 0.455
normalized_FPP1: 0.253
normalized_NNP: 0.242
normalized_RB: -0.201
normalized_AUXB: -0.168
normalized_PASS: -0.144
normalized_CAP: -0.143
normalized_PIT: 0.128
PC8:
normalized_RB: 0.804
normalized_NN: 0.344
normalized_NNP: 0.219
normalized_NUM: -0.157
normalized_CAP: -0.137
normalized_DET: -0.129
normalized_TIME: 0.125
normalized_XX0: 0.110
normalized_SPAU: 0.101
normalized_PRP: -0.092
PC9:
normalized_JJ: 0.578
normalized_INF: 0.353
sentence_count: 0.314
normalized_VPRT: -0.302
normalized_NUM: -0.263
normalized_ART: 0.229
normalized_PASS: -0.218
normalized_AUXB: -0.214
normalized_CONJ: -0.139
normalized_DET: 0.125
PC10:
normalized_JJ: 0.568
normalized_INF: -0.553
normalized_VPRT: 0.390
normalized_DET: -0.252
normalized_ART: -0.140
normalized_AUXB: 0.124
normalized_NUM: -0.124
sentence_count: 0.116
normalized_TO: -0.101
normalized_BEMA: 0.093
PC11:
sentence_count: 0.617
normalized_INF: -0.440
normalized_JJ: -0.321
normalized_AUXB: -0.271
normalized_VPRT: -0.190
normalized_ART: 0.184
normalized_RB: 0.170
normalized_TO: -0.140
normalized_PASS: -0.118
normalized_DET: 0.117
PC12:
sentence_count: 0.639
normalized_VPRT: 0.347
normalized_INF: 0.325
normalized_AUXB: 0.280
normalized_VBD: -0.240
normalized_JJ: -0.223
normalized_PASS: 0.157
normalized_ART: -0.145
normalized_DET: -0.130
normalized_PUBV: -0.124
PC13:
normalized_NUM: 0.592
normalized_VBD: -0.482
normalized_AUXB: -0.289
normalized_VPRT: 0.263
normalized_PASS: -0.235
normalized_ART: 0.157
normalized_CONJ: 0.150
normalized_INDA: 0.134
normalized_TIME: -0.133
normalized_PUBV: -0.130
PC14:
normalized_NUM: 0.479
normalized_QUOT: -0.361
normalized_VBD: 0.317
normalized_JJ: 0.276
normalized_AUXB: 0.246
normalized_CONT: -0.243
sentence_count: 0.226
normalized_PUBV: 0.204
normalized_VPRT: -0.204
normalized_CONJ: 0.173
PC15:
normalized_PUBV: 0.534
normalized_CONJ: -0.332
normalized_VBD: 0.308
normalized_UH: -0.306
normalized_QUOT: 0.294
normalized_VPRT: 0.236
normalized_CONT: 0.212
normalized_NUM: 0.196
normalized_PASS: -0.162
normalized_TO: 0.135
PC16:
normalized_QUOT: 0.572
normalized_CONT: 0.438
normalized_CONJ: 0.287
normalized_PGAS: -0.274
normalized_PUBV: -0.245
normalized_NOMZ: -0.204
normalized_UH: -0.164
normalized_PASS: 0.160
normalized_VBD: 0.155
normalized_VPRT: -0.151
PC17:
normalized_PUBV: 0.491
normalized_CONJ: 0.476
normalized_PGAS: -0.306
normalized_NUM: -0.295
normalized_ART: 0.246
normalized_UH: 0.198
normalized_VPRT: 0.196
normalized_DEMP: -0.178
normalized_DET: -0.163
normalized_INDA: 0.145
PC18:
normalized_UH: 0.757
normalized_PGAS: -0.319
normalized_CONJ: -0.290
normalized_NOMZ: -0.255
normalized_VBD: 0.167
normalized_CCONJ: -0.161
normalized_NUM: 0.139
normalized_SCONJ: 0.097
normalized_JJ: 0.091
sentence_count: 0.077
PC19:
normalized_ART: 0.442
normalized_CONJ: -0.352
normalized_DEMO: -0.294
normalized_DET: -0.286
normalized_AUXB: 0.279
normalized_DEMP: -0.242
normalized_PIT: 0.234
normalized_INDA: 0.226
normalized_FPP1: -0.211
normalized_NUM: 0.172
PC20:
normalized_PGAS: 0.741
normalized_UH: 0.330
normalized_X: -0.274
normalized_CONJ: 0.238
normalized_AUXB: 0.157
normalized_CONT: 0.149
normalized_QUES: -0.136
normalized_NUM: 0.116
normalized_PUBV: 0.115
normalized_NOMZ: -0.114
PC21:
normalized_CCONJ: 0.608
normalized_QUES: -0.369
normalized_X: -0.250
normalized_AUXB: -0.218
normalized_PRIV: 0.214
normalized_VPRT: 0.208
normalized_BEMA: -0.191
normalized_TIME: 0.185
normalized_FPP1: 0.171
normalized_SCONJ: -0.135
PC22:
normalized_X: 0.585
normalized_PRIV: 0.499
normalized_QUES: -0.278
normalized_CCONJ: -0.270
normalized_VBD: 0.232
normalized_DEMO: -0.162
normalized_PUBV: -0.154
normalized_FPP1: 0.153
normalized_CONJ: 0.139
normalized_ART: 0.101
PC23:
normalized_NOMZ: 0.568
normalized_X: -0.520
normalized_CCONJ: -0.308
normalized_PRIV: 0.276
normalized_PUBV: -0.162
normalized_PASS: 0.147
normalized_FPP1: 0.128
normalized_CONJ: -0.126
normalized_PGAS: -0.125
normalized_DEMO: 0.122
PC24:
normalized_QUES: 0.430
normalized_CCONJ: 0.385
normalized_CONJ: 0.280
normalized_PRIV: 0.278
normalized_VBD: 0.259
normalized_WH: 0.242
normalized_PUBV: -0.237
normalized_SCONJ: 0.209
normalized_PASS: -0.198
normalized_TIME: -0.191
PC25:
normalized_DEMP: 0.448
normalized_NOMZ: 0.443
normalized_DEMO: -0.420
normalized_INDA: -0.209
normalized_VBD: -0.199
normalized_SPAU: 0.194
normalized_PRIV: -0.186
normalized_UH: 0.186
normalized_PEAS: -0.182
normalized_NUM: 0.157
Top 10 PC1 values:
PC1 PC2 ... date_created comment_type
2471 118.825452 48.883675 ... 1424754141 task_subcomment
971 118.822061 48.877781 ... 1354316739 task_subcomment
984 118.822061 48.877781 ... 1359160095 task_subcomment
987 118.822061 48.877781 ... 1362102239 task_subcomment
989 118.822061 48.877781 ... 1362441994 task_subcomment
1486 118.822061 48.877781 ... 1362478487 task_subcomment
3708 118.822061 48.877781 ... 1344625237 task_subcomment
3714 118.822061 48.877781 ... 1345813989 task_subcomment
3720 118.822061 48.877781 ... 1348771229 task_subcomment
3730 118.822061 48.877781 ... 1349619536 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC1 values:
PC1 PC2 ... date_created comment_type
24881 -254.291673 450.247095 ... 1350678600 task_description
4413 -218.707675 406.488424 ... 1463441072 task_subcomment
4412 -218.516235 406.636956 ... 1463441050 task_subcomment
18176 -142.494770 253.636183 ... 1380947348 task_subcomment
11326 -140.543606 274.762080 ... 1354470131 task_subcomment
6778 -107.601033 180.118546 ... 1374730027 task_subcomment
13442 -100.902011 197.908102 ... 1440633395 task_subcomment
693 -98.717155 181.023968 ... 1379611711 task_subcomment
4410 -94.493897 160.641828 ... 1463439992 task_subcomment
46 -89.814365 179.163194 ... 1441031208 task_subcomment
[10 rows x 36 columns]
Top 10 PC2 values:
PC1 PC2 ... date_created comment_type
24881 -254.291673 450.247095 ... 1350678600 task_description
4412 -218.516235 406.636956 ... 1463441050 task_subcomment
4413 -218.707675 406.488424 ... 1463441072 task_subcomment
11326 -140.543606 274.762080 ... 1354470131 task_subcomment
18176 -142.494770 253.636183 ... 1380947348 task_subcomment
13442 -100.902011 197.908102 ... 1440633395 task_subcomment
693 -98.717155 181.023968 ... 1379611711 task_subcomment
6778 -107.601033 180.118546 ... 1374730027 task_subcomment
46 -89.814365 179.163194 ... 1441031208 task_subcomment
4410 -94.493897 160.641828 ... 1463439992 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC2 values:
PC1 PC2 ... date_created comment_type
12410 -7.877613 -20.755159 ... 1422554389 task_subcomment
22835 -7.877613 -20.755159 ... 1462375135 task_subcomment
19501 -7.867440 -20.737476 ... 1317586881 task_subcomment
24870 -7.864049 -20.731582 ... 1327978205 task_subcomment
18694 -10.567395 -19.469923 ... 1377104818 task_subcomment
1330 -4.665646 -19.045749 ... 1321220595 task_subcomment
1103 -5.695317 -18.884254 ... 1428954897 task_subcomment
14631 -5.958980 -18.052657 ... 1412324629 task_subcomment
598 -15.028547 -17.737022 ... 1384635692 task_subcomment
4154 -10.280136 -17.191996 ... 1380638194 task_subcomment
[10 rows x 36 columns]
Top 10 PC3 values:
PC1 PC2 ... date_created comment_type
1639 53.461173 -2.188626 ... 1375331403 task_subcomment
3549 53.400133 -2.294723 ... 1456539439 task_subcomment
15715 53.400133 -2.294723 ... 1384994015 task_subcomment
12766 53.396742 -2.300618 ... 1442087854 task_subcomment
11104 22.288240 -10.664704 ... 1377544788 task_subcomment
11105 22.288240 -10.664704 ... 1377544792 task_subcomment
11107 22.288240 -10.664704 ... 1377545735 task_subcomment
11911 57.304544 2.321048 ... 1350946140 task_subcomment
3474 14.292880 -10.641835 ... 1374012685 task_subcomment
13254 35.952938 -3.871137 ... 1434130529 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC3 values:
PC1 PC2 ... date_created comment_type
103 58.026764 29.547741 ... 1453561068 task_subcomment
104 58.026764 29.547741 ... 1453561129 task_subcomment
168 58.026764 29.547741 ... 1420466644 task_subcomment
169 58.026764 29.547741 ... 1420473867 task_subcomment
506 58.026764 29.547741 ... 1491557250 task_subcomment
507 58.026764 29.547741 ... 1491557269 task_subcomment
1202 58.026764 29.547741 ... 1601272820 task_subcomment
1225 58.026764 29.547741 ... 1431790268 task_subcomment
1226 58.026764 29.547741 ... 1431790446 task_subcomment
1276 58.026764 29.547741 ... 1624208185 task_subcomment
[10 rows x 36 columns]
Top 10 PC4 values:
PC1 PC2 ... date_created comment_type
14207 78.893820 23.026780 ... 1676690655 task_subcomment
1109 30.520772 -5.107689 ... 1430255616 task_subcomment
13611 31.636333 -3.165316 ... 1399747303 task_subcomment
1498 14.905192 -9.401637 ... 1424206043 task_subcomment
598 -15.028547 -17.737022 ... 1384635692 task_subcomment
15423 50.507754 10.487969 ... 1355446597 task_subcomment
13536 33.130617 -3.199009 ... 1354149956 task_subcomment
20484 16.484139 -8.759647 ... 1361832639 task_subcomment
13234 16.481595 -8.764067 ... 1438011707 task_subcomment
15790 16.797678 -8.172293 ... 1436224473 task_subcomment
[10 rows x 36 columns]
Bottom 10 PC4 values:
PC1 PC2 ... date_created comment_type
12410 -7.877613 -20.755159 ... 1422554389 task_subcomment
22835 -7.877613 -20.755159 ... 1462375135 task_subcomment
19501 -7.867440 -20.737476 ... 1317586881 task_subcomment
24870 -7.864049 -20.731582 ... 1327978205 task_subcomment
1103 -5.695317 -18.884254 ... 1428954897 task_subcomment
1335 -5.631462 -15.155290 ... 1328300138 task_subcomment
14631 -5.958980 -18.052657 ... 1412324629 task_subcomment
14666 1.866478 -10.987911 ... 1434020520 task_subcomment
21167 8.687041 -12.537389 ... 1372042733 task_subcomment
406 10.504327 -8.538062 ... 1374557457 task_subcomment
[10 rows x 36 columns]
job finished, cleaning up
job pau at: Tue Dec 16 15:27:19 CST 2025

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -85,7 +85,7 @@ def format_df_data(df):
return x
if __name__ == "__main__":
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/120725_unified.csv", low_memory=False)
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/forPCA_121625_unified.csv", low_memory=False)
#biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
@ -103,7 +103,7 @@ if __name__ == "__main__":
pca = PCA(n_components=argmax_components)
biber_vecs_pca = pca.fit_transform(biber_vecs)
with open('121525_total_pca.pkl', 'wb') as f:
with open('121625_total_pca.pkl', 'wb') as f:
pickle.dump(pca, f)
selected_axis = "AuthorWMFAffil"
@ -136,7 +136,7 @@ if __name__ == "__main__":
plot_df = pd.DataFrame(pc_dict)
plot_df.to_csv("121525_total_pca_df.csv", index=False)
plot_df.to_csv("121625_total_pca_df.csv", index=False)
print("Top 10 PC1 values:")
print(plot_df.nlargest(10, "PC1"))

View File

@ -8,7 +8,7 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-pca
#SBATCH --output=121525_total_neurobiber-pca.log
#SBATCH --output=121625_total_neurobiber-pca.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu