1
0

updated PCA results with dropped rows

This commit is contained in:
mgaughan 2025-10-01 21:28:12 -05:00
parent e61d3b6599
commit f636969541
9 changed files with 146680 additions and 19 deletions

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,299 @@
starting the job at: Wed Oct 1 20:55:40 CDT 2025
setting up the environment
running the neurobiber labeling script
Number of PCs explaining 90% variance: 21
Variance of each PCA component: [44.14465236 25.51079987 20.02977026 11.84052754 8.73144858 8.38589906
6.95245699 5.64852989 5.25245119 4.98015739 4.87640589 3.84009303
3.46134099 2.49633957 2.31075199 2.07408882 1.83990439 1.83715267
1.69163987 1.34972345 1.21923888]
PC1:
BIN_CAP: 0.575
BIN_NNP: 0.568
BIN_DET: -0.296
BIN_ART: -0.232
BIN_PREP: -0.226
BIN_PIN: -0.226
BIN_RB: -0.126
BIN_INF: -0.109
BIN_PRP: -0.105
BIN_SBJP: -0.105
PC2:
BIN_PREP: 0.498
BIN_PIN: 0.498
BIN_NN: 0.460
BIN_CAP: 0.334
BIN_NNP: 0.313
BIN_DET: 0.148
BIN_NOMZ: -0.112
BIN_ART: 0.111
BIN_INF: 0.097
BIN_CONJ: 0.075
PC3:
BIN_NN: 0.811
BIN_PIN: -0.235
BIN_PREP: -0.235
BIN_NNP: -0.223
BIN_PRP: -0.196
BIN_SBJP: -0.196
BIN_RB: -0.175
BIN_INF: -0.130
BIN_FPP1: -0.091
BIN_VPRT: -0.085
PC4:
BIN_DET: 0.587
BIN_ART: 0.528
BIN_PREP: -0.282
BIN_PIN: -0.282
BIN_CAP: 0.252
BIN_INDA: 0.183
BIN_VPRT: 0.178
BIN_JJ: -0.137
BIN_NOMZ: -0.130
BIN_NNP: 0.123
PC5:
BIN_RB: 0.439
BIN_CAP: 0.348
BIN_PRP: 0.313
BIN_SBJP: 0.313
BIN_NNP: -0.285
BIN_ART: -0.234
BIN_VPRT: 0.231
BIN_NN: 0.229
BIN_DET: -0.210
BIN_NOMZ: -0.160
PC6:
BIN_JJ: 0.552
BIN_CAP: 0.454
BIN_NNP: -0.397
BIN_NOMZ: 0.374
BIN_X: -0.208
BIN_QUOT: -0.184
BIN_NN: -0.160
BIN_NUM: -0.146
BIN_CONT: -0.117
BIN_ART: 0.088
PC7:
BIN_JJ: 0.552
BIN_NNP: 0.417
BIN_VPRT: 0.374
BIN_CAP: -0.333
BIN_RB: 0.258
BIN_QUOT: -0.224
BIN_X: -0.175
BIN_INF: -0.172
BIN_AUXB: 0.157
BIN_XX0: 0.091
PC8:
BIN_INF: 0.720
BIN_QUOT: -0.330
BIN_VPRT: -0.252
BIN_RB: 0.200
BIN_TO: 0.190
BIN_NOMZ: 0.159
BIN_NUM: -0.152
BIN_NNP: 0.147
BIN_PRP: -0.132
BIN_SBJP: -0.132
PC9:
BIN_QUOT: 0.681
BIN_JJ: 0.417
BIN_INF: 0.317
BIN_CONT: 0.281
BIN_NOMZ: -0.266
BIN_PRP: -0.139
BIN_SBJP: -0.139
BIN_X: 0.129
BIN_RB: 0.084
BIN_CAP: 0.072
PC10:
BIN_RB: 0.507
BIN_PRP: -0.411
BIN_SBJP: -0.411
BIN_NNP: -0.204
BIN_X: 0.202
BIN_FPP1: -0.195
BIN_INF: -0.193
BIN_NOMZ: -0.158
BIN_NUM: 0.156
BIN_JJ: -0.154
PC11:
BIN_X: 0.632
BIN_NOMZ: -0.436
BIN_QUOT: -0.379
BIN_JJ: 0.317
BIN_CONT: -0.171
BIN_NUM: 0.159
BIN_RB: -0.149
BIN_PRP: 0.119
BIN_SBJP: 0.119
BIN_INF: 0.106
PC12:
BIN_VPRT: 0.495
BIN_X: 0.445
BIN_AUXB: 0.381
BIN_NUM: -0.346
BIN_NOMZ: 0.291
BIN_RB: -0.234
BIN_PASS: 0.177
BIN_JJ: -0.159
BIN_VBD: -0.118
BIN_BEMA: 0.112
PC13:
BIN_NUM: 0.440
BIN_X: -0.437
BIN_RB: -0.347
BIN_NOMZ: -0.338
BIN_AUXB: 0.333
BIN_VPRT: 0.223
BIN_INF: 0.210
BIN_PASS: 0.141
BIN_TO: 0.132
BIN_BEMA: 0.127
PC14:
BIN_AUXB: 0.473
BIN_VPRT: -0.443
BIN_NUM: -0.405
BIN_VBD: 0.282
BIN_CONT: -0.239
BIN_NOMZ: -0.211
BIN_PASS: 0.209
BIN_BEMA: 0.165
BIN_INF: -0.156
BIN_CCONJ: 0.149
PC15:
BIN_NUM: 0.581
BIN_NOMZ: 0.428
BIN_AUXB: 0.327
BIN_VPRT: -0.213
BIN_PGAS: -0.197
BIN_X: 0.187
BIN_RB: 0.164
BIN_BEMA: 0.163
BIN_QUOT: 0.143
BIN_CCONJ: -0.143
PC16:
BIN_PGAS: 0.702
BIN_CONJ: -0.428
BIN_CCONJ: -0.371
BIN_SCONJ: 0.217
BIN_WH: 0.138
BIN_WZPRES: 0.132
BIN_TO: 0.132
BIN_GER: 0.090
BIN_VBD: 0.089
BIN_NUM: 0.088
PC17:
BIN_CCONJ: 0.462
BIN_PGAS: 0.459
BIN_CONJ: 0.395
BIN_CONT: -0.333
BIN_QUOT: 0.184
BIN_NUM: 0.180
BIN_VPRT: 0.177
BIN_VBD: -0.172
BIN_XX0: -0.170
BIN_SPAU: -0.139
PC18:
BIN_CCONJ: 0.691
BIN_CONJ: -0.502
BIN_CONT: 0.238
BIN_VBD: 0.152
BIN_NUM: 0.149
BIN_ANDC: 0.144
BIN_INDA: -0.122
BIN_XX0: 0.120
BIN_PRIV: -0.115
BIN_PHC: 0.101
PC19:
BIN_CONT: 0.563
BIN_CONJ: 0.459
BIN_PGAS: 0.332
BIN_SPAU: 0.255
BIN_XX0: 0.234
BIN_QUOT: -0.231
BIN_RB: -0.223
BIN_SCONJ: -0.172
BIN_AUXB: 0.163
BIN_PASS: 0.131
PC20:
BIN_INDA: 0.674
BIN_DET: -0.416
BIN_QUAN: -0.265
BIN_ART: 0.217
BIN_FPP1: -0.202
BIN_PGAS: -0.152
BIN_CONJ: -0.150
BIN_SCONJ: 0.141
BIN_CCONJ: 0.130
BIN_DEMO: -0.128
PC21:
BIN_SCONJ: 0.568
BIN_PRIV: 0.541
BIN_TO: -0.332
BIN_WH: 0.270
BIN_RB: -0.158
BIN_INDA: -0.139
BIN_COND: 0.134
BIN_VPRT: -0.130
BIN_CCONJ: 0.129
BIN_CONJ: 0.102
Top 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
19173 40.268860 26.736392 ... PHID-USER-doeppszazlm3r7xah4il 1416964345
23127 34.022257 7.573103 ... PHID-USER-myidf5vlkwvrgp2iwn76 1433839792
23533 33.055352 7.623438 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498718
24553 33.053151 7.621628 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498559
23532 33.050949 7.619818 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498772
22245 31.318686 5.617453 ... PHID-USER-v7vgzvvcw7v2umf737ri 1438377936
18500 29.657022 4.747496 ... PHID-USER-hbffue25ov3attlvclze 1387662960
22023 29.625085 9.081212 ... PHID-USER-a6p24cvyblhfzc7we7nc 1440568477
14809 28.210405 6.749195 ... PHID-USER-zjzhrhmn36icnzbckqy4 1379900100
22930 27.824399 14.949181 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1436249770
[10 rows x 28 columns]
Bottom 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
23485 -16.873824 13.740160 ... PHID-USER-u7udgblfyop6qd5wxot6 1425991276
22060 -16.135690 12.174259 ... PHID-USER-2nnm76h4ykalvvref2ye 1440412099
22845 -15.391146 13.319574 ... PHID-USER-2nnm76h4ykalvvref2ye 1440085454
24795 -15.084050 14.347308 ... PHID-USER-5dwuaigmkz2vzg65lape 1419297091
7451 -14.541432 5.740545 ... PHID-USER-ysftv67jxeaxdwcakvwo 1374347580
23471 -13.857781 7.962597 ... PHID-USER-2nnm76h4ykalvvref2ye 1426228927
22443 -13.803016 7.605012 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1435267334
23300 -13.605468 0.980452 ... PHID-USER-evd3wnvnlb66lrwulch4 1423322226
11814 -13.401241 7.881186 ... PHID-USER-5pyvkdz65d5h5vxebodc 1372684440
968 -13.313317 0.369182 ... PHID-USER-j5ma2nageni56xp567v5 1377621000
[10 rows x 28 columns]
Top 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
24610 6.265218 29.494190 ... PHID-USER-tafngdco2cilcyr7qhhg 1422645688
20963 27.578946 27.679075 ... PHID-USER-rooknayvbydy6sodz3lx 1436311793
24082 -4.360480 27.219954 ... PHID-USER-jcypqodpdpbcicgwgh7j 1419534643
19173 40.268860 26.736392 ... PHID-USER-doeppszazlm3r7xah4il 1416964345
24824 -2.967505 23.097004 ... PHID-USER-mdihg2tyzmlvyhn3h32y 1418230141
24818 20.182195 22.630740 ... PHID-USER-hbtlbu4zftxnz4i6f7yf 1418856731
13345 6.075708 22.048374 ... PHID-USER-ydswvwhh5pm4lshahjje 1371860160
21020 6.876811 21.888275 ... PHID-USER-zcsdm7lwcehnusyhh6xp 1435194938
20973 -7.021508 20.911008 ... PHID-USER-hxwwywcyzpooynxuo7a2 1435878993
22029 0.897428 20.736628 ... PHID-USER-a6p24cvyblhfzc7we7nc 1440568357
[10 rows x 28 columns]
Bottom 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
3134 5.691116 -12.652404 ... PHID-USER-ydswvwhh5pm4lshahjje 1374855900
654 -0.763875 -12.369520 ... PHID-USER-hbtlbu4zftxnz4i6f7yf 1366408980
16080 -0.816582 -12.352041 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
1207 4.758836 -12.101115 ... PHID-USER-slccyo5rqasgpljxny7g 1374857700
17982 6.571867 -11.954035 ... PHID-USER-kqibbfgfpgocyzwe32lv 1412196840
1885 15.905505 -11.884510 ... PHID-USER-hyfm4swq76s4j642w46x 1372088340
2934 0.131925 -11.738040 ... PHID-USER-it53o2f2kyryqyj33uzt 1375529520
2109 -2.111122 -11.398959 ... PHID-USER-p6hvqn5njgnxuagekh4b 1367215380
13276 15.471863 -11.316666 ... PHID-USER-z6nzrwuaij3spgyg23jt 1373035320
24126 -1.622360 -11.265986 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1430156915
[10 rows x 28 columns]
job finished, cleaning up
job pau at: Wed Oct 1 20:56:13 CDT 2025

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,344 @@
starting the job at: Wed Oct 1 21:22:40 CDT 2025
setting up the environment
running the neurobiber labeling script
Number of PCs explaining 90% variance: 25
Variance of each PCA component: [261.88760369 82.85870809 65.99452385 61.23806692 38.87318255
32.55896743 26.32760005 21.94878602 18.68985685 16.2308729
13.53042996 11.45815987 10.63830359 9.15456628 8.85934109
8.30582956 8.03192941 7.15336822 6.76087663 6.48544413
5.93048664 5.76580965 5.61052561 5.26965951 4.93708453]
PC1:
BIN_CAP: 0.683
BIN_NNP: 0.645
BIN_DET: -0.151
BIN_PIN: -0.126
BIN_PREP: -0.126
BIN_VPRT: -0.091
BIN_ART: -0.090
BIN_RB: -0.085
BIN_PRP: -0.076
BIN_SBJP: -0.076
PC2:
BIN_NN: 0.750
BIN_NNP: -0.310
BIN_RB: -0.259
BIN_PRP: -0.227
BIN_SBJP: -0.227
BIN_CAP: 0.227
BIN_VPRT: -0.164
BIN_FPP1: -0.114
BIN_NUM: 0.104
BIN_INF: -0.096
PC3:
BIN_CAP: 0.660
BIN_NNP: -0.479
BIN_RB: 0.251
BIN_PRP: 0.220
BIN_SBJP: 0.220
BIN_PREP: 0.165
BIN_PIN: 0.165
BIN_X: -0.139
BIN_VPRT: 0.133
BIN_FPP1: 0.123
PC4:
BIN_PIN: 0.641
BIN_PREP: 0.641
BIN_NNP: 0.280
BIN_RB: -0.173
BIN_CONJ: 0.159
BIN_NN: -0.077
BIN_TO: 0.076
BIN_X: -0.074
BIN_VPRT: -0.066
BIN_CAP: -0.059
PC5:
BIN_DET: 0.620
BIN_ART: 0.381
BIN_X: -0.276
BIN_NN: 0.265
BIN_VPRT: 0.262
BIN_NNP: 0.247
BIN_AUXB: 0.220
BIN_NUM: -0.186
BIN_INF: -0.166
BIN_INDA: 0.157
PC6:
BIN_NN: 0.481
BIN_PRP: 0.463
BIN_SBJP: 0.463
BIN_NNP: 0.243
BIN_FPP1: 0.239
BIN_DET: -0.184
BIN_AUXB: -0.175
BIN_PASS: -0.145
BIN_CAP: -0.142
BIN_PIT: 0.127
PC7:
BIN_RB: 0.780
BIN_NN: 0.265
BIN_DET: -0.189
BIN_PRP: -0.187
BIN_SBJP: -0.187
BIN_JJ: -0.169
BIN_NNP: 0.154
BIN_X: -0.154
BIN_TIME: 0.139
BIN_ART: -0.137
PC8:
BIN_JJ: 0.676
BIN_INF: 0.339
BIN_VPRT: -0.318
BIN_ART: 0.226
BIN_PASS: -0.221
BIN_AUXB: -0.218
BIN_NUM: -0.215
BIN_CONJ: -0.149
BIN_RB: 0.146
BIN_PEAS: -0.120
PC9:
BIN_INF: 0.655
BIN_JJ: -0.538
BIN_VPRT: -0.301
BIN_DET: 0.247
BIN_TO: 0.132
BIN_ART: 0.129
BIN_PRIV: 0.108
BIN_NUM: 0.083
BIN_RB: -0.079
BIN_POMD: 0.072
PC10:
BIN_INF: 0.423
BIN_AUXB: 0.375
BIN_VPRT: 0.375
BIN_JJ: 0.261
BIN_ART: -0.261
BIN_RB: -0.250
BIN_VBD: -0.246
BIN_X: -0.229
BIN_DET: -0.214
BIN_PASS: 0.171
PC11:
BIN_X: 0.786
BIN_PUBV: -0.276
BIN_VPRT: 0.266
BIN_VBD: -0.264
BIN_NUM: -0.197
BIN_CONJ: -0.151
BIN_JJ: -0.137
BIN_INF: 0.099
BIN_UH: -0.098
BIN_NOMZ: -0.080
PC12:
BIN_NUM: 0.765
BIN_VBD: -0.265
BIN_VPRT: 0.211
BIN_UH: -0.173
BIN_RB: 0.163
BIN_INDA: 0.145
BIN_PGAS: -0.144
BIN_JJ: 0.140
BIN_QUOT: -0.140
BIN_ART: 0.138
PC13:
BIN_VBD: 0.484
BIN_QUOT: -0.401
BIN_AUXB: 0.355
BIN_CONT: -0.283
BIN_PASS: 0.249
BIN_X: 0.241
BIN_UH: -0.200
BIN_VPRT: -0.182
BIN_NUM: 0.162
BIN_PGAS: -0.136
PC14:
BIN_PUBV: 0.472
BIN_CONJ: -0.395
BIN_UH: -0.342
BIN_VBD: 0.322
BIN_QUOT: 0.291
BIN_VPRT: 0.251
BIN_CONT: 0.221
BIN_NUM: 0.150
BIN_PASS: -0.133
BIN_TO: 0.128
PC15:
BIN_QUOT: 0.522
BIN_CONT: 0.421
BIN_PUBV: -0.315
BIN_PGAS: -0.292
BIN_CONJ: 0.238
BIN_UH: -0.231
BIN_NOMZ: -0.198
BIN_PASS: 0.193
BIN_VBD: 0.191
BIN_AUXB: 0.169
PC16:
BIN_CONJ: 0.633
BIN_PUBV: 0.516
BIN_NUM: -0.248
BIN_PGAS: -0.195
BIN_UH: -0.170
BIN_X: 0.160
BIN_VPRT: 0.158
BIN_ART: 0.157
BIN_DEMP: -0.128
BIN_TIME: -0.105
PC17:
BIN_UH: 0.685
BIN_PGAS: -0.508
BIN_VBD: 0.218
BIN_CCONJ: -0.206
BIN_VPRT: 0.160
BIN_NOMZ: -0.147
BIN_CONJ: -0.129
BIN_PUBV: 0.117
BIN_ART: 0.101
BIN_INDA: 0.099
PC18:
BIN_ART: 0.458
BIN_DET: -0.343
BIN_DEMO: -0.305
BIN_DEMP: -0.285
BIN_INDA: 0.275
BIN_CCONJ: 0.237
BIN_AUXB: 0.216
BIN_PIT: 0.216
BIN_CONJ: -0.212
BIN_FPP1: -0.211
PC19:
BIN_PGAS: 0.633
BIN_CCONJ: -0.324
BIN_UH: 0.322
BIN_AUXB: 0.268
BIN_CONJ: 0.251
BIN_PRIV: 0.213
BIN_BEMA: 0.153
BIN_TIME: -0.129
BIN_PROD: -0.128
BIN_NUM: 0.119
PC20:
BIN_PRIV: 0.446
BIN_QUES: -0.421
BIN_CCONJ: 0.397
BIN_VPRT: 0.238
BIN_FPP1: 0.228
BIN_AUXB: -0.206
BIN_VBD: 0.196
BIN_BEMA: -0.176
BIN_PIT: -0.157
BIN_SPP2: -0.149
PC21:
BIN_NOMZ: 0.493
BIN_PRIV: 0.470
BIN_CCONJ: -0.319
BIN_PUBV: -0.287
BIN_VBD: 0.191
BIN_SCONJ: 0.180
BIN_NUM: -0.174
BIN_PGAS: -0.161
BIN_UH: -0.157
BIN_DEMP: -0.150
PC22:
BIN_CCONJ: 0.511
BIN_QUES: 0.398
BIN_CONJ: 0.252
BIN_PASS: -0.240
BIN_BEMA: 0.221
BIN_WH: 0.204
BIN_DEMO: -0.183
BIN_PEAS: -0.172
BIN_VBD: 0.167
BIN_SCONJ: 0.161
PC23:
BIN_NOMZ: 0.623
BIN_VBD: -0.245
BIN_AUXB: 0.197
BIN_QUES: -0.193
BIN_INDA: -0.191
BIN_PGAS: -0.186
BIN_SPAU: 0.185
BIN_VPRT: -0.180
BIN_FPP1: 0.171
BIN_DEMP: 0.161
PC24:
BIN_DEMO: 0.550
BIN_TIME: -0.411
BIN_DEMP: -0.407
BIN_CCONJ: 0.320
BIN_XX0: 0.232
BIN_QUOT: -0.170
BIN_CONT: 0.169
BIN_PROD: -0.111
BIN_PEAS: 0.108
BIN_BEMA: 0.106
PC25:
BIN_TIME: 0.451
BIN_XX0: -0.339
BIN_SPAU: -0.327
BIN_QUOT: 0.318
BIN_DEMO: 0.293
BIN_DEMP: -0.288
BIN_PRIV: 0.214
BIN_BEMA: 0.171
BIN_CONT: -0.152
BIN_FPP1: 0.150
Top 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
23531 124.439666 -17.084926 ... PHID-USER-arjqb24x4oae7awzpfp6 1424754141
707 124.420877 -17.096490 ... PHID-USER-pun3sjvg3cemjzbgyo2t 1363132183
744 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353551242
749 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353384355
2243 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1356175107
5921 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353366778
5933 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353123761
5935 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353386649
10080 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1366298361
10418 124.420877 -17.096490 ... PHID-USER-fovtl67ew4l4cc3oeypc 1355363288
[10 rows x 28 columns]
Bottom 10 PC1 values:
PC1 PC2 ... AuthorPHID date_created
13752 -24.770207 2.744701 ... PHID-USER-43lnvui4hacyjrc2lflj 1384635692
14250 -24.494552 0.056822 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1383955246
24560 -23.004747 -10.092946 ... PHID-USER-lzhljhpbm3qfphvqyill 1439545382
22484 -22.583854 -1.262632 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1440034588
13907 -22.517418 10.551266 ... PHID-USER-kqibbfgfpgocyzwe32lv 1372691031
14784 -22.018336 -17.579192 ... PHID-USER-xezsyhikbr7hjrig2ofp 1644598152
23443 -21.721400 -10.425522 ... PHID-USER-ppytiem7rcsbnstfsrvq 1502483520
19108 -21.707016 26.037928 ... PHID-USER-xy6c3ul27f336aaedx2d 1417798935
19140 -21.620289 -0.871317 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1432914688
21658 -21.107540 10.458097 ... PHID-USER-x7ti5ksby4ubsabntlxa 1482280859
[10 rows x 28 columns]
Top 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
117 53.656282 89.055606 ... PHID-USER-7ey733eainlhx5xqp4d3 1375331403
2447 53.318077 88.847447 ... PHID-USER-dw53c5cb2qfhyemej57o 1456539439
2471 53.318077 88.847447 ... PHID-USER-r7wrkcx7j2vutqs6hr3g 1384994015
21231 53.299288 88.835882 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1442087854
2728 19.161791 77.490370 ... PHID-USER-it53o2f2kyryqyj33uzt 1377545735
5024 19.161791 77.490370 ... PHID-USER-it53o2f2kyryqyj33uzt 1377544792
5135 19.161791 77.490370 ... PHID-USER-it53o2f2kyryqyj33uzt 1377544788
17487 -14.919339 66.179552 ... PHID-USER-zjzhrhmn36icnzbckqy4 1327978205
17377 -14.938128 66.167987 ... PHID-USER-wrimmmr5w2zt7nk2t753 1317586881
23739 -14.994495 66.133294 ... PHID-USER-ydswvwhh5pm4lshahjje 1462375135
[10 rows x 28 columns]
Bottom 10 PC2 values:
PC1 PC2 ... AuthorPHID date_created
14532 55.995569 -39.880029 ... PHID-USER-z3kqk2bjnqneldcznht6 1384007851
6321 56.014358 -39.868465 ... PHID-USER-my5s6nat437le6q5fq7d 1555396632
6322 56.014358 -39.868465 ... PHID-USER-my5s6nat437le6q5fq7d 1457021044
6770 56.014358 -39.868465 ... PHID-USER-qduasitr62ffvc5eiivd 1445420618
6771 56.014358 -39.868465 ... PHID-USER-qduasitr62ffvc5eiivd 1445420596
10442 56.014358 -39.868465 ... PHID-USER-unpoeiyj52rmcfqi5rbw 1604534063
10443 56.014358 -39.868465 ... PHID-USER-unpoeiyj52rmcfqi5rbw 1604532057
10528 56.014358 -39.868465 ... PHID-USER-wkpnidxoctuhawexig5p 1429475491
10529 56.014358 -39.868465 ... PHID-USER-wkpnidxoctuhawexig5p 1429475153
11837 56.014358 -39.868465 ... PHID-USER-25bxvdt2svnidzfzjpk7 1453561129
[10 rows x 28 columns]
job finished, cleaning up
job pau at: Wed Oct 1 21:23:06 CDT 2025

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.6 MiB

View File

@@ -38,25 +38,24 @@ def format_df_data(df):
return x
if __name__ == "__main__":
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/092325_biberplus_complete_labels.csv", low_memory=False)
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/092925_unified_phab.csv", low_memory=False)
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
#biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
#biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
biber_vecs = format_df_data(biber_vec_df)
#handoff to PCA model
'''
pca_trial = PCA()
biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)
explained_variance = pca_trial.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
n_components = np.argmax(cumulative_variance >= 0.90) + 1
print(f"Number of PCs explaining 90% variance: {n_components}")
'''
pca = PCA(n_components=18)
argmax_components = np.argmax(cumulative_variance >= 0.90) + 1
print(f"Number of PCs explaining 90% variance: {argmax_components}")
pca = PCA(n_components=argmax_components)
biber_vecs_pca = pca.fit_transform(biber_vecs)
with open('092525_description_pca.pkl', 'wb') as f:
with open('100125_subcomment_pca.pkl', 'wb') as f:
pickle.dump(pca, f)
selected_axis = "closed_relevance"
@@ -82,9 +81,13 @@ if __name__ == "__main__":
pc_dict['week_index'] = biber_vec_df['week_index']
pc_dict['priority'] = biber_vec_df['priority']
pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
pc_dict['TaskPHID'] = biber_vec_df['TaskPHID']
pc_dict['AuthorPHID'] = biber_vec_df['AuthorPHID']
pc_dict['date_created'] = biber_vec_df['date_created']
plot_df = pd.DataFrame(pc_dict)
#plot_df.to_csv("092325_subcomment_PCA_df.csv", index=False)
plot_df.to_csv("100125_subcomment_PCA_df.csv", index=False)
print("Top 10 PC1 values:")
print(plot_df.nlargest(10, "PC1"))
@@ -97,12 +100,12 @@ if __name__ == "__main__":
print(plot_df.nsmallest(10, "PC2"))
g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
g.add_legend(title=selected_axis)
g.set_axis_labels("PC1", "PC2")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")
#g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
#g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
#g.add_legend(title=selected_axis)
#g.set_axis_labels("PC1", "PC2")
#g.fig.subplots_adjust(top=0.9)
#g.fig.suptitle(f"PCA by {selected_axis}, faceted by source")
#plt.savefig("090225_biber_pca_plot.png", dpi=300)
'''
@@ -120,6 +123,6 @@ if __name__ == "__main__":
plt.ylabel('component 2')
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
'''
g.fig.tight_layout()
g.savefig(f"description_{selected_axis}_092525_biber_pca_final.png", dpi=300)
plt.show()
#g.fig.tight_layout()
#g.savefig(f"subcomment_{selected_axis}_100125_biber_pca_final.png", dpi=300)
#plt.show()

View File

@@ -8,7 +8,7 @@
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=neurobiber-pca
#SBATCH --output=092525_neurobiber-pca.log
#SBATCH --output=100125_subcomment_neurobiber-pca.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu