updating with new PCA run
This commit is contained in:
parent
e0cb055ff7
commit
df1dcf1224
421
p2/quest/121625_total_neurobiber-pca.log
Normal file
421
p2/quest/121625_total_neurobiber-pca.log
Normal file
@ -0,0 +1,421 @@
|
||||
starting the job at: Tue Dec 16 15:26:31 CST 2025
|
||||
setting up the environment
|
||||
running the neurobiber labeling script
|
||||
0 [The #Cloud-Services project tag is not intend...
|
||||
1 [Token is used for 2-factor auth., I'm surpris...
|
||||
2 [Oh, of course it's visible since you /might/ ...
|
||||
3 [Can this be closed?, We can now use the proxy...
|
||||
4 [I just now tried creating a new instance, and...
|
||||
...
|
||||
25022 [I think this should be low priority., Only po...
|
||||
25023 [Go to some long article, scroll all the way d...
|
||||
25024 [In Microsoft Word, both character-level styli...
|
||||
25025 [LocalSettings.php lacks wgSecureLogin, wgCook...
|
||||
25026 [``CODE`CODE$wgSecureLoginCODE$wgServerCODE$wg...
|
||||
Name: olmo_cleaned_sentences, Length: 21302, dtype: object
|
||||
[[11. ]
|
||||
[15.5]
|
||||
[10.5]
|
||||
...
|
||||
[26. ]
|
||||
[18. ]
|
||||
[ 5. ]]
|
||||
Number of PCs explaining 90% variance: 25
|
||||
Variance of each PCA component: [227.24126404 147.97706893 75.8294832 65.33178909 59.86158651
|
||||
35.58328853 32.35739864 25.76291291 21.24897975 18.6584478
|
||||
16.64923169 15.07461057 11.37754832 10.83761848 9.1933966
|
||||
8.88840899 8.1956979 8.05178179 7.19167725 6.9942501
|
||||
6.65183757 6.39961806 6.22347534 5.83078813 5.49524439]
|
||||
PC1:
|
||||
normalized_CAP: 0.613
|
||||
normalized_NNP: 0.566
|
||||
median_sentence_length: -0.473
|
||||
normalized_DET: -0.123
|
||||
normalized_PIN: -0.111
|
||||
normalized_PREP: -0.111
|
||||
normalized_ART: -0.088
|
||||
normalized_NN: -0.088
|
||||
normalized_VPRT: -0.056
|
||||
normalized_JJ: -0.055
|
||||
PC2:
|
||||
median_sentence_length: 0.860
|
||||
normalized_NNP: 0.436
|
||||
normalized_CAP: 0.184
|
||||
normalized_DET: -0.083
|
||||
normalized_NN: -0.075
|
||||
normalized_VPRT: -0.058
|
||||
normalized_RB: -0.050
|
||||
normalized_PIN: -0.049
|
||||
normalized_PREP: -0.049
|
||||
normalized_AUXB: -0.048
|
||||
PC3:
|
||||
normalized_NN: 0.692
|
||||
normalized_CAP: 0.451
|
||||
normalized_NNP: -0.303
|
||||
normalized_RB: -0.190
|
||||
normalized_PREP: 0.184
|
||||
normalized_PIN: 0.184
|
||||
normalized_PRP: -0.165
|
||||
normalized_SBJP: -0.165
|
||||
normalized_VPRT: -0.107
|
||||
median_sentence_length: 0.100
|
||||
PC4:
|
||||
normalized_PREP: 0.484
|
||||
normalized_PIN: 0.484
|
||||
normalized_CAP: 0.373
|
||||
normalized_NN: -0.340
|
||||
normalized_PRP: 0.244
|
||||
normalized_SBJP: 0.244
|
||||
normalized_RB: 0.169
|
||||
normalized_INF: 0.168
|
||||
normalized_FPP1: 0.135
|
||||
normalized_NNP: -0.118
|
||||
PC5:
|
||||
normalized_NNP: 0.463
|
||||
normalized_CAP: -0.426
|
||||
normalized_PIN: 0.421
|
||||
normalized_PREP: 0.421
|
||||
normalized_RB: -0.305
|
||||
normalized_SBJP: -0.178
|
||||
normalized_PRP: -0.178
|
||||
median_sentence_length: -0.129
|
||||
normalized_CONJ: 0.112
|
||||
normalized_VPRT: -0.108
|
||||
PC6:
|
||||
normalized_DET: 0.682
|
||||
normalized_ART: 0.406
|
||||
normalized_VPRT: 0.300
|
||||
normalized_AUXB: 0.244
|
||||
normalized_NNP: 0.191
|
||||
normalized_INDA: 0.178
|
||||
normalized_NUM: -0.142
|
||||
normalized_DEMP: 0.124
|
||||
normalized_NN: 0.114
|
||||
normalized_BEMA: 0.112
|
||||
PC7:
|
||||
normalized_PRP: 0.482
|
||||
normalized_SBJP: 0.482
|
||||
normalized_NN: 0.455
|
||||
normalized_FPP1: 0.253
|
||||
normalized_NNP: 0.242
|
||||
normalized_RB: -0.201
|
||||
normalized_AUXB: -0.168
|
||||
normalized_PASS: -0.144
|
||||
normalized_CAP: -0.143
|
||||
normalized_PIT: 0.128
|
||||
PC8:
|
||||
normalized_RB: 0.804
|
||||
normalized_NN: 0.344
|
||||
normalized_NNP: 0.219
|
||||
normalized_NUM: -0.157
|
||||
normalized_CAP: -0.137
|
||||
normalized_DET: -0.129
|
||||
normalized_TIME: 0.125
|
||||
normalized_XX0: 0.110
|
||||
normalized_SPAU: 0.101
|
||||
normalized_PRP: -0.092
|
||||
PC9:
|
||||
normalized_JJ: 0.578
|
||||
normalized_INF: 0.353
|
||||
sentence_count: 0.314
|
||||
normalized_VPRT: -0.302
|
||||
normalized_NUM: -0.263
|
||||
normalized_ART: 0.229
|
||||
normalized_PASS: -0.218
|
||||
normalized_AUXB: -0.214
|
||||
normalized_CONJ: -0.139
|
||||
normalized_DET: 0.125
|
||||
PC10:
|
||||
normalized_JJ: 0.568
|
||||
normalized_INF: -0.553
|
||||
normalized_VPRT: 0.390
|
||||
normalized_DET: -0.252
|
||||
normalized_ART: -0.140
|
||||
normalized_AUXB: 0.124
|
||||
normalized_NUM: -0.124
|
||||
sentence_count: 0.116
|
||||
normalized_TO: -0.101
|
||||
normalized_BEMA: 0.093
|
||||
PC11:
|
||||
sentence_count: 0.617
|
||||
normalized_INF: -0.440
|
||||
normalized_JJ: -0.321
|
||||
normalized_AUXB: -0.271
|
||||
normalized_VPRT: -0.190
|
||||
normalized_ART: 0.184
|
||||
normalized_RB: 0.170
|
||||
normalized_TO: -0.140
|
||||
normalized_PASS: -0.118
|
||||
normalized_DET: 0.117
|
||||
PC12:
|
||||
sentence_count: 0.639
|
||||
normalized_VPRT: 0.347
|
||||
normalized_INF: 0.325
|
||||
normalized_AUXB: 0.280
|
||||
normalized_VBD: -0.240
|
||||
normalized_JJ: -0.223
|
||||
normalized_PASS: 0.157
|
||||
normalized_ART: -0.145
|
||||
normalized_DET: -0.130
|
||||
normalized_PUBV: -0.124
|
||||
PC13:
|
||||
normalized_NUM: 0.592
|
||||
normalized_VBD: -0.482
|
||||
normalized_AUXB: -0.289
|
||||
normalized_VPRT: 0.263
|
||||
normalized_PASS: -0.235
|
||||
normalized_ART: 0.157
|
||||
normalized_CONJ: 0.150
|
||||
normalized_INDA: 0.134
|
||||
normalized_TIME: -0.133
|
||||
normalized_PUBV: -0.130
|
||||
PC14:
|
||||
normalized_NUM: 0.479
|
||||
normalized_QUOT: -0.361
|
||||
normalized_VBD: 0.317
|
||||
normalized_JJ: 0.276
|
||||
normalized_AUXB: 0.246
|
||||
normalized_CONT: -0.243
|
||||
sentence_count: 0.226
|
||||
normalized_PUBV: 0.204
|
||||
normalized_VPRT: -0.204
|
||||
normalized_CONJ: 0.173
|
||||
PC15:
|
||||
normalized_PUBV: 0.534
|
||||
normalized_CONJ: -0.332
|
||||
normalized_VBD: 0.308
|
||||
normalized_UH: -0.306
|
||||
normalized_QUOT: 0.294
|
||||
normalized_VPRT: 0.236
|
||||
normalized_CONT: 0.212
|
||||
normalized_NUM: 0.196
|
||||
normalized_PASS: -0.162
|
||||
normalized_TO: 0.135
|
||||
PC16:
|
||||
normalized_QUOT: 0.572
|
||||
normalized_CONT: 0.438
|
||||
normalized_CONJ: 0.287
|
||||
normalized_PGAS: -0.274
|
||||
normalized_PUBV: -0.245
|
||||
normalized_NOMZ: -0.204
|
||||
normalized_UH: -0.164
|
||||
normalized_PASS: 0.160
|
||||
normalized_VBD: 0.155
|
||||
normalized_VPRT: -0.151
|
||||
PC17:
|
||||
normalized_PUBV: 0.491
|
||||
normalized_CONJ: 0.476
|
||||
normalized_PGAS: -0.306
|
||||
normalized_NUM: -0.295
|
||||
normalized_ART: 0.246
|
||||
normalized_UH: 0.198
|
||||
normalized_VPRT: 0.196
|
||||
normalized_DEMP: -0.178
|
||||
normalized_DET: -0.163
|
||||
normalized_INDA: 0.145
|
||||
PC18:
|
||||
normalized_UH: 0.757
|
||||
normalized_PGAS: -0.319
|
||||
normalized_CONJ: -0.290
|
||||
normalized_NOMZ: -0.255
|
||||
normalized_VBD: 0.167
|
||||
normalized_CCONJ: -0.161
|
||||
normalized_NUM: 0.139
|
||||
normalized_SCONJ: 0.097
|
||||
normalized_JJ: 0.091
|
||||
sentence_count: 0.077
|
||||
PC19:
|
||||
normalized_ART: 0.442
|
||||
normalized_CONJ: -0.352
|
||||
normalized_DEMO: -0.294
|
||||
normalized_DET: -0.286
|
||||
normalized_AUXB: 0.279
|
||||
normalized_DEMP: -0.242
|
||||
normalized_PIT: 0.234
|
||||
normalized_INDA: 0.226
|
||||
normalized_FPP1: -0.211
|
||||
normalized_NUM: 0.172
|
||||
PC20:
|
||||
normalized_PGAS: 0.741
|
||||
normalized_UH: 0.330
|
||||
normalized_X: -0.274
|
||||
normalized_CONJ: 0.238
|
||||
normalized_AUXB: 0.157
|
||||
normalized_CONT: 0.149
|
||||
normalized_QUES: -0.136
|
||||
normalized_NUM: 0.116
|
||||
normalized_PUBV: 0.115
|
||||
normalized_NOMZ: -0.114
|
||||
PC21:
|
||||
normalized_CCONJ: 0.608
|
||||
normalized_QUES: -0.369
|
||||
normalized_X: -0.250
|
||||
normalized_AUXB: -0.218
|
||||
normalized_PRIV: 0.214
|
||||
normalized_VPRT: 0.208
|
||||
normalized_BEMA: -0.191
|
||||
normalized_TIME: 0.185
|
||||
normalized_FPP1: 0.171
|
||||
normalized_SCONJ: -0.135
|
||||
PC22:
|
||||
normalized_X: 0.585
|
||||
normalized_PRIV: 0.499
|
||||
normalized_QUES: -0.278
|
||||
normalized_CCONJ: -0.270
|
||||
normalized_VBD: 0.232
|
||||
normalized_DEMO: -0.162
|
||||
normalized_PUBV: -0.154
|
||||
normalized_FPP1: 0.153
|
||||
normalized_CONJ: 0.139
|
||||
normalized_ART: 0.101
|
||||
PC23:
|
||||
normalized_NOMZ: 0.568
|
||||
normalized_X: -0.520
|
||||
normalized_CCONJ: -0.308
|
||||
normalized_PRIV: 0.276
|
||||
normalized_PUBV: -0.162
|
||||
normalized_PASS: 0.147
|
||||
normalized_FPP1: 0.128
|
||||
normalized_CONJ: -0.126
|
||||
normalized_PGAS: -0.125
|
||||
normalized_DEMO: 0.122
|
||||
PC24:
|
||||
normalized_QUES: 0.430
|
||||
normalized_CCONJ: 0.385
|
||||
normalized_CONJ: 0.280
|
||||
normalized_PRIV: 0.278
|
||||
normalized_VBD: 0.259
|
||||
normalized_WH: 0.242
|
||||
normalized_PUBV: -0.237
|
||||
normalized_SCONJ: 0.209
|
||||
normalized_PASS: -0.198
|
||||
normalized_TIME: -0.191
|
||||
PC25:
|
||||
normalized_DEMP: 0.448
|
||||
normalized_NOMZ: 0.443
|
||||
normalized_DEMO: -0.420
|
||||
normalized_INDA: -0.209
|
||||
normalized_VBD: -0.199
|
||||
normalized_SPAU: 0.194
|
||||
normalized_PRIV: -0.186
|
||||
normalized_UH: 0.186
|
||||
normalized_PEAS: -0.182
|
||||
normalized_NUM: 0.157
|
||||
Top 10 PC1 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
2471 118.825452 48.883675 ... 1424754141 task_subcomment
|
||||
971 118.822061 48.877781 ... 1354316739 task_subcomment
|
||||
984 118.822061 48.877781 ... 1359160095 task_subcomment
|
||||
987 118.822061 48.877781 ... 1362102239 task_subcomment
|
||||
989 118.822061 48.877781 ... 1362441994 task_subcomment
|
||||
1486 118.822061 48.877781 ... 1362478487 task_subcomment
|
||||
3708 118.822061 48.877781 ... 1344625237 task_subcomment
|
||||
3714 118.822061 48.877781 ... 1345813989 task_subcomment
|
||||
3720 118.822061 48.877781 ... 1348771229 task_subcomment
|
||||
3730 118.822061 48.877781 ... 1349619536 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
|
||||
Bottom 10 PC1 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
24881 -254.291673 450.247095 ... 1350678600 task_description
|
||||
4413 -218.707675 406.488424 ... 1463441072 task_subcomment
|
||||
4412 -218.516235 406.636956 ... 1463441050 task_subcomment
|
||||
18176 -142.494770 253.636183 ... 1380947348 task_subcomment
|
||||
11326 -140.543606 274.762080 ... 1354470131 task_subcomment
|
||||
6778 -107.601033 180.118546 ... 1374730027 task_subcomment
|
||||
13442 -100.902011 197.908102 ... 1440633395 task_subcomment
|
||||
693 -98.717155 181.023968 ... 1379611711 task_subcomment
|
||||
4410 -94.493897 160.641828 ... 1463439992 task_subcomment
|
||||
46 -89.814365 179.163194 ... 1441031208 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
Top 10 PC2 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
24881 -254.291673 450.247095 ... 1350678600 task_description
|
||||
4412 -218.516235 406.636956 ... 1463441050 task_subcomment
|
||||
4413 -218.707675 406.488424 ... 1463441072 task_subcomment
|
||||
11326 -140.543606 274.762080 ... 1354470131 task_subcomment
|
||||
18176 -142.494770 253.636183 ... 1380947348 task_subcomment
|
||||
13442 -100.902011 197.908102 ... 1440633395 task_subcomment
|
||||
693 -98.717155 181.023968 ... 1379611711 task_subcomment
|
||||
6778 -107.601033 180.118546 ... 1374730027 task_subcomment
|
||||
46 -89.814365 179.163194 ... 1441031208 task_subcomment
|
||||
4410 -94.493897 160.641828 ... 1463439992 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
|
||||
Bottom 10 PC2 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
12410 -7.877613 -20.755159 ... 1422554389 task_subcomment
|
||||
22835 -7.877613 -20.755159 ... 1462375135 task_subcomment
|
||||
19501 -7.867440 -20.737476 ... 1317586881 task_subcomment
|
||||
24870 -7.864049 -20.731582 ... 1327978205 task_subcomment
|
||||
18694 -10.567395 -19.469923 ... 1377104818 task_subcomment
|
||||
1330 -4.665646 -19.045749 ... 1321220595 task_subcomment
|
||||
1103 -5.695317 -18.884254 ... 1428954897 task_subcomment
|
||||
14631 -5.958980 -18.052657 ... 1412324629 task_subcomment
|
||||
598 -15.028547 -17.737022 ... 1384635692 task_subcomment
|
||||
4154 -10.280136 -17.191996 ... 1380638194 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
Top 10 PC3 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
1639 53.461173 -2.188626 ... 1375331403 task_subcomment
|
||||
3549 53.400133 -2.294723 ... 1456539439 task_subcomment
|
||||
15715 53.400133 -2.294723 ... 1384994015 task_subcomment
|
||||
12766 53.396742 -2.300618 ... 1442087854 task_subcomment
|
||||
11104 22.288240 -10.664704 ... 1377544788 task_subcomment
|
||||
11105 22.288240 -10.664704 ... 1377544792 task_subcomment
|
||||
11107 22.288240 -10.664704 ... 1377545735 task_subcomment
|
||||
11911 57.304544 2.321048 ... 1350946140 task_subcomment
|
||||
3474 14.292880 -10.641835 ... 1374012685 task_subcomment
|
||||
13254 35.952938 -3.871137 ... 1434130529 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
|
||||
Bottom 10 PC3 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
103 58.026764 29.547741 ... 1453561068 task_subcomment
|
||||
104 58.026764 29.547741 ... 1453561129 task_subcomment
|
||||
168 58.026764 29.547741 ... 1420466644 task_subcomment
|
||||
169 58.026764 29.547741 ... 1420473867 task_subcomment
|
||||
506 58.026764 29.547741 ... 1491557250 task_subcomment
|
||||
507 58.026764 29.547741 ... 1491557269 task_subcomment
|
||||
1202 58.026764 29.547741 ... 1601272820 task_subcomment
|
||||
1225 58.026764 29.547741 ... 1431790268 task_subcomment
|
||||
1226 58.026764 29.547741 ... 1431790446 task_subcomment
|
||||
1276 58.026764 29.547741 ... 1624208185 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
Top 10 PC4 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
14207 78.893820 23.026780 ... 1676690655 task_subcomment
|
||||
1109 30.520772 -5.107689 ... 1430255616 task_subcomment
|
||||
13611 31.636333 -3.165316 ... 1399747303 task_subcomment
|
||||
1498 14.905192 -9.401637 ... 1424206043 task_subcomment
|
||||
598 -15.028547 -17.737022 ... 1384635692 task_subcomment
|
||||
15423 50.507754 10.487969 ... 1355446597 task_subcomment
|
||||
13536 33.130617 -3.199009 ... 1354149956 task_subcomment
|
||||
20484 16.484139 -8.759647 ... 1361832639 task_subcomment
|
||||
13234 16.481595 -8.764067 ... 1438011707 task_subcomment
|
||||
15790 16.797678 -8.172293 ... 1436224473 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
|
||||
Bottom 10 PC4 values:
|
||||
PC1 PC2 ... date_created comment_type
|
||||
12410 -7.877613 -20.755159 ... 1422554389 task_subcomment
|
||||
22835 -7.877613 -20.755159 ... 1462375135 task_subcomment
|
||||
19501 -7.867440 -20.737476 ... 1317586881 task_subcomment
|
||||
24870 -7.864049 -20.731582 ... 1327978205 task_subcomment
|
||||
1103 -5.695317 -18.884254 ... 1428954897 task_subcomment
|
||||
1335 -5.631462 -15.155290 ... 1328300138 task_subcomment
|
||||
14631 -5.958980 -18.052657 ... 1412324629 task_subcomment
|
||||
14666 1.866478 -10.987911 ... 1434020520 task_subcomment
|
||||
21167 8.687041 -12.537389 ... 1372042733 task_subcomment
|
||||
406 10.504327 -8.538062 ... 1374557457 task_subcomment
|
||||
|
||||
[10 rows x 36 columns]
|
||||
job finished, cleaning up
|
||||
job pau at: Tue Dec 16 15:27:19 CST 2025
|
||||
BIN
p2/quest/121625_total_pca.pkl
Normal file
BIN
p2/quest/121625_total_pca.pkl
Normal file
Binary file not shown.
130647
p2/quest/121625_total_pca_df.csv
Normal file
130647
p2/quest/121625_total_pca_df.csv
Normal file
File diff suppressed because one or more lines are too long
@ -85,7 +85,7 @@ def format_df_data(df):
|
||||
return x
|
||||
|
||||
if __name__ == "__main__":
|
||||
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/120725_unified.csv", low_memory=False)
|
||||
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/forPCA_121625_unified.csv", low_memory=False)
|
||||
#biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
|
||||
biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
|
||||
biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
|
||||
@ -103,7 +103,7 @@ if __name__ == "__main__":
|
||||
|
||||
pca = PCA(n_components=argmax_components)
|
||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||
with open('121525_total_pca.pkl', 'wb') as f:
|
||||
with open('121625_total_pca.pkl', 'wb') as f:
|
||||
pickle.dump(pca, f)
|
||||
selected_axis = "AuthorWMFAffil"
|
||||
|
||||
@ -136,7 +136,7 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
plot_df = pd.DataFrame(pc_dict)
|
||||
plot_df.to_csv("121525_total_pca_df.csv", index=False)
|
||||
plot_df.to_csv("121625_total_pca_df.csv", index=False)
|
||||
|
||||
print("Top 10 PC1 values:")
|
||||
print(plot_df.nlargest(10, "PC1"))
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
#SBATCH --mem=64G
|
||||
#SBATCH --cpus-per-task=4
|
||||
#SBATCH --job-name=neurobiber-pca
|
||||
#SBATCH --output=121525_total_neurobiber-pca.log
|
||||
#SBATCH --output=121625_total_neurobiber-pca.log
|
||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user