updating PCA to account for sentence count and median length
This commit is contained in:
parent
cb2fe737cd
commit
f60f3ef120
@ -1,15 +1,16 @@
|
|||||||
setting up the environment by loading in conda environment at Sat Oct 11 00:24:37 CDT 2025
|
setting up the environment by loading in conda environment at Sat Oct 11 07:52:03 CDT 2025
|
||||||
running the batched olmo categorization job at Sat Oct 11 00:24:37 CDT 2025
|
running the batched olmo categorization job at Sat Oct 11 07:52:03 CDT 2025
|
||||||
[nltk_data] Downloading package punkt_tab to
|
[nltk_data] Downloading package punkt_tab to
|
||||||
[nltk_data] /home/nws8519/nltk_data...
|
[nltk_data] /home/nws8519/nltk_data...
|
||||||
[nltk_data] Package punkt_tab is already up-to-date!
|
[nltk_data] Package punkt_tab is already up-to-date!
|
||||||
cuda
|
cuda
|
||||||
NVIDIA A100-SXM4-80GB
|
NVIDIA A100-SXM4-80GB
|
||||||
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=393ab5c3-2bcb-e4c6-52ad-eb4896a9d4fe, L2_cache_size=40MB)
|
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=ee0bd2a7-af54-5f2e-c2d3-fcd3f57270c9, L2_cache_size=40MB)
|
||||||
Loading checkpoint shards: 0%| | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards: 8%|▊ | 1/12 [00:00<00:03, 2.77it/s]
Loading checkpoint shards: 17%|█▋ | 2/12 [00:00<00:04, 2.16it/s]
Loading checkpoint shards: 25%|██▌ | 3/12 [00:01<00:04, 1.93it/s]
Loading checkpoint shards: 33%|███▎ | 4/12 [00:02<00:04, 1.79it/s]
Loading checkpoint shards: 42%|████▏ | 5/12 [00:02<00:03, 1.77it/s]
Loading checkpoint shards: 50%|█████ | 6/12 [00:03<00:03, 1.80it/s]
Loading checkpoint shards: 58%|█████▊ | 7/12 [00:03<00:02, 1.80it/s]
Loading checkpoint shards: 67%|██████▋ | 8/12 [00:04<00:02, 1.76it/s]
Loading checkpoint shards: 75%|███████▌ | 9/12 [00:04<00:01, 1.77it/s]
Loading checkpoint shards: 83%|████████▎ | 10/12 [00:05<00:01, 1.82it/s]
Loading checkpoint shards: 92%|█████████▏| 11/12 [00:05<00:00, 1.92it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:05<00:00, 2.02it/s]
|
Loading checkpoint shards: 0%| | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards: 8%|▊ | 1/12 [00:00<00:04, 2.72it/s]
Loading checkpoint shards: 17%|█▋ | 2/12 [00:00<00:04, 2.27it/s]
Loading checkpoint shards: 25%|██▌ | 3/12 [00:01<00:04, 2.12it/s]
Loading checkpoint shards: 33%|███▎ | 4/12 [00:01<00:03, 2.12it/s]
Loading checkpoint shards: 42%|████▏ | 5/12 [00:02<00:03, 1.96it/s]
Loading checkpoint shards: 50%|█████ | 6/12 [00:02<00:03, 1.98it/s]
Loading checkpoint shards: 58%|█████▊ | 7/12 [00:03<00:02, 1.87it/s]
Loading checkpoint shards: 67%|██████▋ | 8/12 [00:03<00:02, 1.94it/s]
Loading checkpoint shards: 75%|███████▌ | 9/12 [00:04<00:01, 1.91it/s]
Loading checkpoint shards: 83%|████████▎ | 10/12 [00:05<00:01, 1.87it/s]
Loading checkpoint shards: 92%|█████████▏| 11/12 [00:05<00:00, 2.05it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:05<00:00, 2.18it/s]
|
||||||
|
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
|
||||||
|
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
|
||||||
Traceback (most recent call last):
|
Traceback (most recent call last):
|
||||||
File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py", line 62, in <module>
|
File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py", line 66, in <module>
|
||||||
with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
|
for row in reader:
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
_csv.Error: field larger than field limit (131072)
|
||||||
FileNotFoundError: [Errno 2] No such file or directory: '/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv'
|
unsupervised batched olmo categorization pau at Sun Oct 12 14:11:43 CDT 2025
|
||||||
unsupervised batched olmo categorization pau at Sat Oct 11 00:27:22 CDT 2025
|
|
||||||
|
|||||||
11
p2/quest/101325-batched-mw-olmo-info-cat.log
Normal file
11
p2/quest/101325-batched-mw-olmo-info-cat.log
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
setting up the environment by loading in conda environment at Mon Oct 13 09:25:24 CDT 2025
|
||||||
|
running the batched olmo categorization job at Mon Oct 13 09:25:24 CDT 2025
|
||||||
|
[nltk_data] Downloading package punkt_tab to
|
||||||
|
[nltk_data] /home/nws8519/nltk_data...
|
||||||
|
[nltk_data] Package punkt_tab is already up-to-date!
|
||||||
|
cuda
|
||||||
|
NVIDIA A100-SXM4-80GB
|
||||||
|
_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=19efa4d6-01cd-d825-4cd9-637cc23cebd3, L2_cache_size=40MB)
|
||||||
|
Loading checkpoint shards: 0%| | 0/12 [00:00<?, ?it/s]
Loading checkpoint shards: 8%|▊ | 1/12 [00:00<00:03, 3.50it/s]
Loading checkpoint shards: 17%|█▋ | 2/12 [00:00<00:03, 2.64it/s]
Loading checkpoint shards: 25%|██▌ | 3/12 [00:01<00:03, 2.39it/s]
Loading checkpoint shards: 33%|███▎ | 4/12 [00:01<00:03, 2.19it/s]
Loading checkpoint shards: 42%|████▏ | 5/12 [00:02<00:03, 2.19it/s]
Loading checkpoint shards: 50%|█████ | 6/12 [00:02<00:02, 2.08it/s]
Loading checkpoint shards: 58%|█████▊ | 7/12 [00:03<00:02, 2.04it/s]
Loading checkpoint shards: 67%|██████▋ | 8/12 [00:03<00:01, 2.01it/s]
Loading checkpoint shards: 75%|███████▌ | 9/12 [00:04<00:01, 2.03it/s]
Loading checkpoint shards: 83%|████████▎ | 10/12 [00:04<00:00, 2.07it/s]
Loading checkpoint shards: 92%|█████████▏| 11/12 [00:05<00:00, 2.19it/s]
Loading checkpoint shards: 100%|██████████| 12/12 [00:05<00:00, 2.36it/s]
|
||||||
|
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
|
||||||
|
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
|
||||||
37338
p2/quest/101325_description_PCA_df.csv
Normal file
37338
p2/quest/101325_description_PCA_df.csv
Normal file
File diff suppressed because one or more lines are too long
239
p2/quest/101325_description_neurobiber-pca.log
Normal file
239
p2/quest/101325_description_neurobiber-pca.log
Normal file
@ -0,0 +1,239 @@
|
|||||||
|
starting the job at: Tue Oct 14 15:08:48 CDT 2025
|
||||||
|
setting up the environment
|
||||||
|
running the neurobiber labeling script
|
||||||
|
[[13. ]
|
||||||
|
[14. ]
|
||||||
|
[11. ]
|
||||||
|
...
|
||||||
|
[10. ]
|
||||||
|
[14. ]
|
||||||
|
[12.5]]
|
||||||
|
Number of PCs explaining 90% variance: 15
|
||||||
|
Variance of each PCA component: [138.60156907 44.29951603 25.63179594 21.39857213 14.99271754
|
||||||
|
10.88014877 8.72969328 8.11497994 6.78712318 5.50912497
|
||||||
|
5.25006184 4.96444801 4.62359041 3.68257699 3.28506433]
|
||||||
|
PC1:
|
||||||
|
median_sentence_length: 0.994
|
||||||
|
normalized_CAP: -0.069
|
||||||
|
normalized_NNP: -0.050
|
||||||
|
normalized_NOMZ: -0.029
|
||||||
|
normalized_NUM: 0.026
|
||||||
|
normalized_DET: 0.024
|
||||||
|
normalized_ART: 0.020
|
||||||
|
normalized_PREP: 0.019
|
||||||
|
normalized_PIN: 0.019
|
||||||
|
normalized_RB: 0.016
|
||||||
|
PC2:
|
||||||
|
normalized_CAP: 0.555
|
||||||
|
normalized_NNP: 0.554
|
||||||
|
normalized_DET: -0.298
|
||||||
|
normalized_ART: -0.232
|
||||||
|
normalized_PREP: -0.220
|
||||||
|
normalized_PIN: -0.220
|
||||||
|
sentence_count: -0.189
|
||||||
|
normalized_RB: -0.125
|
||||||
|
normalized_PRP: -0.110
|
||||||
|
normalized_SBJP: -0.110
|
||||||
|
PC3:
|
||||||
|
normalized_NN: 0.509
|
||||||
|
normalized_PREP: 0.491
|
||||||
|
normalized_PIN: 0.491
|
||||||
|
normalized_CAP: 0.304
|
||||||
|
normalized_NNP: 0.279
|
||||||
|
normalized_DET: 0.143
|
||||||
|
sentence_count: -0.115
|
||||||
|
normalized_ART: 0.109
|
||||||
|
normalized_NOMZ: -0.098
|
||||||
|
normalized_INF: 0.095
|
||||||
|
PC4:
|
||||||
|
normalized_NN: 0.683
|
||||||
|
sentence_count: -0.412
|
||||||
|
normalized_NNP: -0.295
|
||||||
|
normalized_PIN: -0.217
|
||||||
|
normalized_PREP: -0.217
|
||||||
|
normalized_CAP: -0.174
|
||||||
|
normalized_PRP: -0.173
|
||||||
|
normalized_SBJP: -0.173
|
||||||
|
normalized_RB: -0.142
|
||||||
|
normalized_JJ: 0.117
|
||||||
|
PC5:
|
||||||
|
sentence_count: 0.718
|
||||||
|
normalized_NN: 0.358
|
||||||
|
normalized_DET: 0.228
|
||||||
|
normalized_PIN: -0.223
|
||||||
|
normalized_PREP: -0.223
|
||||||
|
normalized_ART: 0.221
|
||||||
|
normalized_NOMZ: -0.190
|
||||||
|
normalized_CAP: 0.186
|
||||||
|
normalized_INF: -0.137
|
||||||
|
normalized_JJ: -0.123
|
||||||
|
PC6:
|
||||||
|
normalized_DET: 0.538
|
||||||
|
normalized_ART: 0.483
|
||||||
|
sentence_count: -0.398
|
||||||
|
normalized_PREP: -0.216
|
||||||
|
normalized_PIN: -0.216
|
||||||
|
normalized_CAP: 0.206
|
||||||
|
normalized_VPRT: 0.204
|
||||||
|
normalized_INDA: 0.186
|
||||||
|
normalized_NN: -0.142
|
||||||
|
normalized_X: -0.132
|
||||||
|
PC7:
|
||||||
|
normalized_RB: 0.442
|
||||||
|
normalized_CAP: 0.343
|
||||||
|
normalized_PRP: 0.313
|
||||||
|
normalized_SBJP: 0.313
|
||||||
|
normalized_NNP: -0.278
|
||||||
|
normalized_VPRT: 0.234
|
||||||
|
normalized_ART: -0.232
|
||||||
|
normalized_NN: 0.229
|
||||||
|
normalized_DET: -0.208
|
||||||
|
normalized_NOMZ: -0.164
|
||||||
|
PC8:
|
||||||
|
normalized_JJ: 0.504
|
||||||
|
normalized_CAP: 0.502
|
||||||
|
normalized_NNP: -0.468
|
||||||
|
normalized_NOMZ: 0.296
|
||||||
|
sentence_count: 0.150
|
||||||
|
normalized_X: -0.146
|
||||||
|
normalized_QUOT: -0.145
|
||||||
|
normalized_NN: -0.142
|
||||||
|
normalized_VPRT: -0.131
|
||||||
|
normalized_RB: -0.128
|
||||||
|
PC9:
|
||||||
|
normalized_JJ: 0.637
|
||||||
|
normalized_VPRT: 0.357
|
||||||
|
normalized_NNP: 0.337
|
||||||
|
normalized_CAP: -0.265
|
||||||
|
normalized_INF: -0.258
|
||||||
|
normalized_QUOT: -0.224
|
||||||
|
normalized_RB: 0.204
|
||||||
|
normalized_X: -0.145
|
||||||
|
normalized_AUXB: 0.143
|
||||||
|
sentence_count: 0.135
|
||||||
|
PC10:
|
||||||
|
normalized_INF: 0.691
|
||||||
|
normalized_QUOT: -0.415
|
||||||
|
normalized_VPRT: -0.263
|
||||||
|
normalized_RB: 0.222
|
||||||
|
normalized_TO: 0.184
|
||||||
|
normalized_PRP: -0.155
|
||||||
|
normalized_SBJP: -0.155
|
||||||
|
normalized_CONT: -0.138
|
||||||
|
normalized_NNP: 0.122
|
||||||
|
normalized_PIN: -0.120
|
||||||
|
PC11:
|
||||||
|
normalized_QUOT: 0.714
|
||||||
|
normalized_JJ: 0.402
|
||||||
|
normalized_CONT: 0.295
|
||||||
|
normalized_INF: 0.294
|
||||||
|
normalized_NOMZ: -0.246
|
||||||
|
normalized_PRP: -0.126
|
||||||
|
normalized_SBJP: -0.126
|
||||||
|
normalized_X: 0.107
|
||||||
|
normalized_NUM: -0.076
|
||||||
|
normalized_CAP: 0.067
|
||||||
|
PC12:
|
||||||
|
normalized_RB: 0.521
|
||||||
|
normalized_PRP: -0.426
|
||||||
|
normalized_SBJP: -0.426
|
||||||
|
normalized_JJ: -0.236
|
||||||
|
normalized_INF: -0.229
|
||||||
|
normalized_FPP1: -0.211
|
||||||
|
normalized_NNP: -0.184
|
||||||
|
normalized_CONJ: 0.129
|
||||||
|
normalized_XX0: 0.125
|
||||||
|
normalized_TO: -0.124
|
||||||
|
PC13:
|
||||||
|
normalized_X: 0.808
|
||||||
|
normalized_NOMZ: -0.391
|
||||||
|
normalized_QUOT: -0.249
|
||||||
|
normalized_JJ: 0.163
|
||||||
|
sentence_count: -0.146
|
||||||
|
normalized_NNP: -0.126
|
||||||
|
normalized_CAP: 0.107
|
||||||
|
normalized_CONT: -0.097
|
||||||
|
normalized_VPRT: 0.096
|
||||||
|
normalized_RB: -0.071
|
||||||
|
PC14:
|
||||||
|
normalized_VPRT: 0.514
|
||||||
|
normalized_AUXB: 0.496
|
||||||
|
normalized_RB: -0.346
|
||||||
|
normalized_PASS: 0.221
|
||||||
|
normalized_INF: 0.218
|
||||||
|
normalized_NOMZ: 0.215
|
||||||
|
normalized_BEMA: 0.161
|
||||||
|
normalized_JJ: -0.161
|
||||||
|
normalized_VBD: -0.137
|
||||||
|
normalized_NUM: -0.136
|
||||||
|
PC15:
|
||||||
|
normalized_NOMZ: 0.554
|
||||||
|
normalized_NUM: -0.544
|
||||||
|
normalized_X: 0.438
|
||||||
|
normalized_RB: 0.239
|
||||||
|
sentence_count: 0.146
|
||||||
|
normalized_NNP: 0.135
|
||||||
|
normalized_AUXB: -0.116
|
||||||
|
normalized_NN: 0.105
|
||||||
|
normalized_CONT: 0.104
|
||||||
|
normalized_INF: -0.101
|
||||||
|
Top 10 PC1 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
16080 525.102703 48.280630 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
|
||||||
|
18859 77.466344 3.706703 ... PHID-USER-ll6tmaogat2b5q7tnqas 1405358040
|
||||||
|
20378 69.473292 -5.921977 ... PHID-USER-ynivjflmc2dcl6w5ut5v 1407551580
|
||||||
|
8874 67.305410 6.587019 ... PHID-USER-ydswvwhh5pm4lshahjje 1371667800
|
||||||
|
6468 52.113083 12.698065 ... PHID-USER-azy72hrp3tpetr52aob6 1378208100
|
||||||
|
18692 43.220624 8.230008 ... PHID-USER-arjqb24x4oae7awzpfp6 1411431840
|
||||||
|
5607 42.720768 1.581160 ... PHID-USER-ynivjflmc2dcl6w5ut5v 1360124400
|
||||||
|
19479 41.065047 8.286151 ... PHID-USER-ynivjflmc2dcl6w5ut5v 1406854860
|
||||||
|
13751 38.405351 6.445956 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947640
|
||||||
|
6503 37.060191 -2.635433 ... PHID-USER-qgqq35kbi5wss2tlgmhg 1377865740
|
||||||
|
|
||||||
|
[10 rows x 25 columns]
|
||||||
|
|
||||||
|
Bottom 10 PC1 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
19173 -14.819594 38.839843 ... PHID-USER-doeppszazlm3r7xah4il 1416964345
|
||||||
|
23533 -14.098760 31.956092 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498718
|
||||||
|
24553 -14.098553 31.953701 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498559
|
||||||
|
23532 -14.098346 31.951309 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498772
|
||||||
|
129 -13.767257 2.442547 ... PHID-USER-hyfm4swq76s4j642w46x 1375120080
|
||||||
|
22245 -12.327433 30.418183 ... PHID-USER-v7vgzvvcw7v2umf737ri 1438377936
|
||||||
|
752 -12.170613 17.171274 ... PHID-USER-sx63fwaih5kjt7bz4u6z 1380590700
|
||||||
|
2120 -11.607147 -10.509373 ... PHID-USER-xfe43w2lb5gpvglf4coa 1367008080
|
||||||
|
22153 -11.098587 7.351805 ... PHID-USER-a6p24cvyblhfzc7we7nc 1438982860
|
||||||
|
24847 -10.908633 15.377024 ... PHID-USER-srhlj2447vmpmrfhqnfa 1417632210
|
||||||
|
|
||||||
|
[10 rows x 25 columns]
|
||||||
|
Top 10 PC2 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
16080 525.102703 48.280630 ... PHID-USER-zjzhrhmn36icnzbckqy4 1350678600
|
||||||
|
19173 -14.819594 38.839843 ... PHID-USER-doeppszazlm3r7xah4il 1416964345
|
||||||
|
23127 -1.787399 32.727692 ... PHID-USER-myidf5vlkwvrgp2iwn76 1433839792
|
||||||
|
23533 -14.098760 31.956092 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498718
|
||||||
|
24553 -14.098553 31.953701 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498559
|
||||||
|
23532 -14.098346 31.951309 ... PHID-USER-sai77mtxmpqnm6pycyvz 1424498772
|
||||||
|
18500 13.647382 30.709395 ... PHID-USER-hbffue25ov3attlvclze 1387662960
|
||||||
|
22245 -12.327433 30.418183 ... PHID-USER-v7vgzvvcw7v2umf737ri 1438377936
|
||||||
|
22023 -7.400000 29.037196 ... PHID-USER-a6p24cvyblhfzc7we7nc 1440568477
|
||||||
|
14809 -2.186555 28.072103 ... PHID-USER-zjzhrhmn36icnzbckqy4 1379900100
|
||||||
|
|
||||||
|
[10 rows x 25 columns]
|
||||||
|
|
||||||
|
Bottom 10 PC2 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
23485 -1.065773 -15.903250 ... PHID-USER-u7udgblfyop6qd5wxot6 1425991276
|
||||||
|
22060 4.042133 -15.132236 ... PHID-USER-2nnm76h4ykalvvref2ye 1440412099
|
||||||
|
5792 -2.696513 -15.036399 ... PHID-USER-grpjkpfolt5gz4ljlbfg 1355334540
|
||||||
|
1436 -0.480107 -15.016999 ... PHID-USER-tyjmn7xcw6s2b6rqagj7 1373878680
|
||||||
|
22799 -5.139569 -14.977697 ... PHID-USER-fjve3gq5wsmaaccti7pb 1430752987
|
||||||
|
22845 -0.877723 -14.762675 ... PHID-USER-2nnm76h4ykalvvref2ye 1440085454
|
||||||
|
7451 9.897529 -14.392291 ... PHID-USER-ysftv67jxeaxdwcakvwo 1374347580
|
||||||
|
9423 10.013728 -14.381035 ... PHID-USER-zzvqlvm6i6kml4tfnqvq 1369411380
|
||||||
|
1228 -2.448487 -13.906291 ... PHID-USER-ysftv67jxeaxdwcakvwo 1374765240
|
||||||
|
2775 3.664323 -13.485623 ... PHID-USER-dw53c5cb2qfhyemej57o 1377068880
|
||||||
|
|
||||||
|
[10 rows x 25 columns]
|
||||||
|
job finished, cleaning up
|
||||||
|
job pau at: Tue Oct 14 15:09:18 CDT 2025
|
||||||
BIN
p2/quest/101325_description_pca.pkl
Normal file
BIN
p2/quest/101325_description_pca.pkl
Normal file
Binary file not shown.
108677
p2/quest/101325_subcomment_PCA_df.csv
Normal file
108677
p2/quest/101325_subcomment_PCA_df.csv
Normal file
File diff suppressed because one or more lines are too long
352
p2/quest/101325_subcomment_neurobiber-pca.log
Normal file
352
p2/quest/101325_subcomment_neurobiber-pca.log
Normal file
@ -0,0 +1,352 @@
|
|||||||
|
starting the job at: Tue Oct 14 15:54:24 CDT 2025
|
||||||
|
setting up the environment
|
||||||
|
running the neurobiber labeling script
|
||||||
|
1 [Change 86685 merged by jenkins-bot:\nFollow-u...
|
||||||
|
2 [*** Bug 54785 has been marked as a duplicate ...
|
||||||
|
3 [Change 86685 had a related patch set uploaded...
|
||||||
|
5 [**Wikifram** wrote:\n\nAllright, thanks to bo...
|
||||||
|
6 [(In reply to comment #4)\nQUOTE\n\nVE product...
|
||||||
|
...
|
||||||
|
25022 [Er... drag and drop from what?, Is there no n...
|
||||||
|
25023 [Could you attach a screenshot please?, Drag &...
|
||||||
|
25025 [Sorry for not reply-ing., I did a test and co...
|
||||||
|
25026 [SCREEN_NAME: Please answer.]
|
||||||
|
25027 [I cannot replicate this., What's the name of ...
|
||||||
|
Name: olmo_cleaned_sentences, Length: 21901, dtype: object
|
||||||
|
[[18. ]
|
||||||
|
[ 6.5]
|
||||||
|
[23. ]
|
||||||
|
...
|
||||||
|
[ 5.5]
|
||||||
|
[ 3. ]
|
||||||
|
[ 6. ]]
|
||||||
|
Number of PCs explaining 90% variance: 24
|
||||||
|
Variance of each PCA component: [273.55786883 135.16197459 82.94008657 63.12754897 60.39119505
|
||||||
|
38.84258991 32.35268417 26.32979149 21.57186105 18.691479
|
||||||
|
16.21404524 13.63887204 13.3960516 11.40372708 10.25820109
|
||||||
|
9.13513531 8.8549811 8.29863619 7.99933399 7.06165956
|
||||||
|
6.73377968 6.4742109 5.92152116 5.75533066]
|
||||||
|
PC1:
|
||||||
|
normalized_CAP: 0.670
|
||||||
|
normalized_NNP: 0.604
|
||||||
|
median_sentence_length: -0.283
|
||||||
|
normalized_DET: -0.142
|
||||||
|
normalized_PREP: -0.122
|
||||||
|
normalized_PIN: -0.122
|
||||||
|
normalized_ART: -0.089
|
||||||
|
normalized_VPRT: -0.082
|
||||||
|
normalized_RB: -0.077
|
||||||
|
normalized_PRP: -0.071
|
||||||
|
PC2:
|
||||||
|
median_sentence_length: 0.929
|
||||||
|
normalized_NNP: 0.319
|
||||||
|
normalized_RB: -0.074
|
||||||
|
normalized_VPRT: -0.070
|
||||||
|
normalized_DET: -0.066
|
||||||
|
normalized_AUXB: -0.055
|
||||||
|
normalized_PRP: -0.045
|
||||||
|
normalized_SBJP: -0.045
|
||||||
|
normalized_X: 0.038
|
||||||
|
normalized_CAP: 0.035
|
||||||
|
PC3:
|
||||||
|
normalized_NN: 0.750
|
||||||
|
normalized_NNP: -0.291
|
||||||
|
normalized_RB: -0.266
|
||||||
|
normalized_PRP: -0.232
|
||||||
|
normalized_SBJP: -0.232
|
||||||
|
normalized_CAP: 0.211
|
||||||
|
normalized_VPRT: -0.169
|
||||||
|
normalized_FPP1: -0.117
|
||||||
|
normalized_NUM: 0.106
|
||||||
|
normalized_INF: -0.097
|
||||||
|
PC4:
|
||||||
|
normalized_CAP: 0.577
|
||||||
|
normalized_PREP: 0.426
|
||||||
|
normalized_PIN: 0.426
|
||||||
|
normalized_NNP: -0.281
|
||||||
|
normalized_PRP: 0.187
|
||||||
|
normalized_SBJP: 0.187
|
||||||
|
median_sentence_length: 0.159
|
||||||
|
normalized_X: -0.148
|
||||||
|
normalized_RB: 0.141
|
||||||
|
normalized_INF: 0.128
|
||||||
|
PC5:
|
||||||
|
normalized_PIN: 0.507
|
||||||
|
normalized_PREP: 0.507
|
||||||
|
normalized_NNP: 0.435
|
||||||
|
normalized_CAP: -0.349
|
||||||
|
normalized_RB: -0.256
|
||||||
|
median_sentence_length: -0.147
|
||||||
|
normalized_CONJ: 0.125
|
||||||
|
normalized_SBJP: -0.120
|
||||||
|
normalized_PRP: -0.120
|
||||||
|
normalized_VPRT: -0.100
|
||||||
|
PC6:
|
||||||
|
normalized_DET: 0.618
|
||||||
|
normalized_ART: 0.383
|
||||||
|
normalized_X: -0.278
|
||||||
|
normalized_NN: 0.273
|
||||||
|
normalized_VPRT: 0.261
|
||||||
|
normalized_NNP: 0.246
|
||||||
|
normalized_AUXB: 0.215
|
||||||
|
normalized_NUM: -0.191
|
||||||
|
normalized_INF: -0.163
|
||||||
|
normalized_INDA: 0.156
|
||||||
|
PC7:
|
||||||
|
normalized_NN: 0.477
|
||||||
|
normalized_PRP: 0.459
|
||||||
|
normalized_SBJP: 0.459
|
||||||
|
normalized_NNP: 0.247
|
||||||
|
normalized_FPP1: 0.236
|
||||||
|
normalized_DET: -0.196
|
||||||
|
normalized_AUXB: -0.171
|
||||||
|
normalized_CAP: -0.163
|
||||||
|
normalized_PASS: -0.138
|
||||||
|
normalized_PIT: 0.126
|
||||||
|
PC8:
|
||||||
|
normalized_RB: 0.781
|
||||||
|
normalized_NN: 0.265
|
||||||
|
normalized_DET: -0.188
|
||||||
|
normalized_PRP: -0.187
|
||||||
|
normalized_SBJP: -0.186
|
||||||
|
normalized_JJ: -0.169
|
||||||
|
normalized_NNP: 0.154
|
||||||
|
normalized_X: -0.153
|
||||||
|
normalized_TIME: 0.139
|
||||||
|
normalized_ART: -0.136
|
||||||
|
PC9:
|
||||||
|
normalized_JJ: 0.672
|
||||||
|
normalized_INF: 0.353
|
||||||
|
normalized_VPRT: -0.324
|
||||||
|
normalized_PASS: -0.219
|
||||||
|
normalized_AUXB: -0.218
|
||||||
|
normalized_NUM: -0.214
|
||||||
|
normalized_ART: 0.214
|
||||||
|
normalized_CONJ: -0.147
|
||||||
|
normalized_RB: 0.132
|
||||||
|
normalized_PEAS: -0.117
|
||||||
|
PC10:
|
||||||
|
normalized_INF: 0.652
|
||||||
|
normalized_JJ: -0.543
|
||||||
|
normalized_VPRT: -0.298
|
||||||
|
normalized_DET: 0.248
|
||||||
|
normalized_TO: 0.131
|
||||||
|
normalized_ART: 0.128
|
||||||
|
normalized_PRIV: 0.108
|
||||||
|
normalized_NUM: 0.086
|
||||||
|
normalized_RB: -0.077
|
||||||
|
normalized_POMD: 0.072
|
||||||
|
PC11:
|
||||||
|
normalized_INF: 0.420
|
||||||
|
normalized_VPRT: 0.383
|
||||||
|
normalized_AUXB: 0.379
|
||||||
|
normalized_ART: -0.261
|
||||||
|
normalized_JJ: 0.251
|
||||||
|
normalized_RB: -0.249
|
||||||
|
normalized_VBD: -0.247
|
||||||
|
normalized_X: -0.223
|
||||||
|
normalized_DET: -0.212
|
||||||
|
normalized_PASS: 0.174
|
||||||
|
PC12:
|
||||||
|
sentence_count: 0.651
|
||||||
|
normalized_X: -0.619
|
||||||
|
normalized_VPRT: -0.180
|
||||||
|
normalized_PUBV: 0.169
|
||||||
|
normalized_RB: -0.115
|
||||||
|
normalized_CONJ: 0.114
|
||||||
|
normalized_INF: -0.104
|
||||||
|
normalized_CCONJ: 0.100
|
||||||
|
normalized_QUOT: 0.099
|
||||||
|
normalized_DET: -0.091
|
||||||
|
PC13:
|
||||||
|
sentence_count: 0.637
|
||||||
|
normalized_X: 0.496
|
||||||
|
normalized_VBD: -0.299
|
||||||
|
normalized_NUM: -0.287
|
||||||
|
normalized_PUBV: -0.223
|
||||||
|
normalized_JJ: -0.198
|
||||||
|
normalized_VPRT: 0.186
|
||||||
|
normalized_CONJ: -0.099
|
||||||
|
normalized_QUOT: 0.067
|
||||||
|
normalized_PASS: -0.061
|
||||||
|
PC14:
|
||||||
|
normalized_NUM: 0.714
|
||||||
|
normalized_VBD: -0.354
|
||||||
|
normalized_VPRT: 0.233
|
||||||
|
normalized_AUXB: -0.186
|
||||||
|
normalized_PASS: -0.171
|
||||||
|
normalized_ART: 0.153
|
||||||
|
normalized_UH: -0.150
|
||||||
|
normalized_RB: 0.141
|
||||||
|
normalized_INDA: 0.138
|
||||||
|
normalized_PUBV: -0.134
|
||||||
|
PC15:
|
||||||
|
normalized_QUOT: 0.422
|
||||||
|
normalized_VBD: -0.380
|
||||||
|
normalized_AUXB: -0.331
|
||||||
|
sentence_count: -0.322
|
||||||
|
normalized_CONT: 0.315
|
||||||
|
normalized_UH: 0.255
|
||||||
|
normalized_NUM: -0.221
|
||||||
|
normalized_PASS: -0.221
|
||||||
|
normalized_X: -0.206
|
||||||
|
normalized_VPRT: 0.154
|
||||||
|
PC16:
|
||||||
|
normalized_PUBV: 0.481
|
||||||
|
normalized_CONJ: -0.394
|
||||||
|
normalized_UH: -0.360
|
||||||
|
normalized_VBD: 0.317
|
||||||
|
normalized_QUOT: 0.267
|
||||||
|
normalized_VPRT: 0.248
|
||||||
|
normalized_CONT: 0.201
|
||||||
|
normalized_NUM: 0.151
|
||||||
|
normalized_PASS: -0.137
|
||||||
|
normalized_TO: 0.128
|
||||||
|
PC17:
|
||||||
|
normalized_QUOT: 0.520
|
||||||
|
normalized_CONT: 0.417
|
||||||
|
normalized_PUBV: -0.301
|
||||||
|
normalized_PGAS: -0.290
|
||||||
|
normalized_UH: -0.260
|
||||||
|
normalized_CONJ: 0.234
|
||||||
|
normalized_VBD: 0.200
|
||||||
|
normalized_NOMZ: -0.194
|
||||||
|
normalized_PASS: 0.193
|
||||||
|
normalized_AUXB: 0.175
|
||||||
|
PC18:
|
||||||
|
normalized_CONJ: 0.631
|
||||||
|
normalized_PUBV: 0.523
|
||||||
|
normalized_NUM: -0.253
|
||||||
|
normalized_PGAS: -0.211
|
||||||
|
normalized_VPRT: 0.168
|
||||||
|
normalized_X: 0.160
|
||||||
|
normalized_ART: 0.155
|
||||||
|
normalized_DEMP: -0.126
|
||||||
|
normalized_UH: -0.118
|
||||||
|
normalized_TIME: -0.106
|
||||||
|
PC19:
|
||||||
|
normalized_UH: 0.659
|
||||||
|
normalized_PGAS: -0.517
|
||||||
|
normalized_VBD: 0.237
|
||||||
|
normalized_CCONJ: -0.196
|
||||||
|
normalized_CONJ: -0.175
|
||||||
|
normalized_NOMZ: -0.153
|
||||||
|
normalized_VPRT: 0.149
|
||||||
|
normalized_ART: 0.109
|
||||||
|
normalized_INDA: 0.101
|
||||||
|
normalized_RB: 0.099
|
||||||
|
PC20:
|
||||||
|
normalized_ART: 0.461
|
||||||
|
normalized_DET: -0.342
|
||||||
|
normalized_DEMO: -0.294
|
||||||
|
normalized_INDA: 0.293
|
||||||
|
normalized_DEMP: -0.288
|
||||||
|
normalized_AUXB: 0.230
|
||||||
|
normalized_PIT: 0.222
|
||||||
|
normalized_FPP1: -0.215
|
||||||
|
normalized_PGAS: 0.208
|
||||||
|
normalized_CCONJ: 0.185
|
||||||
|
PC21:
|
||||||
|
normalized_PGAS: 0.594
|
||||||
|
normalized_CCONJ: -0.353
|
||||||
|
normalized_UH: 0.330
|
||||||
|
normalized_CONJ: 0.272
|
||||||
|
normalized_AUXB: 0.250
|
||||||
|
normalized_PRIV: 0.241
|
||||||
|
normalized_BEMA: 0.153
|
||||||
|
normalized_TIME: -0.141
|
||||||
|
normalized_PROD: -0.130
|
||||||
|
normalized_NUM: 0.125
|
||||||
|
PC22:
|
||||||
|
normalized_PRIV: 0.445
|
||||||
|
normalized_QUES: -0.422
|
||||||
|
normalized_CCONJ: 0.395
|
||||||
|
normalized_VPRT: 0.242
|
||||||
|
normalized_FPP1: 0.221
|
||||||
|
normalized_AUXB: -0.207
|
||||||
|
normalized_VBD: 0.200
|
||||||
|
normalized_BEMA: -0.178
|
||||||
|
normalized_PIT: -0.151
|
||||||
|
normalized_SPP2: -0.148
|
||||||
|
PC23:
|
||||||
|
normalized_NOMZ: 0.504
|
||||||
|
normalized_PRIV: 0.457
|
||||||
|
normalized_CCONJ: -0.327
|
||||||
|
normalized_PUBV: -0.283
|
||||||
|
normalized_NUM: -0.184
|
||||||
|
normalized_VBD: 0.180
|
||||||
|
normalized_SCONJ: 0.170
|
||||||
|
normalized_UH: -0.168
|
||||||
|
normalized_DEMP: -0.161
|
||||||
|
normalized_PGAS: -0.161
|
||||||
|
PC24:
|
||||||
|
normalized_CCONJ: 0.506
|
||||||
|
normalized_QUES: 0.414
|
||||||
|
normalized_CONJ: 0.251
|
||||||
|
normalized_PASS: -0.238
|
||||||
|
normalized_BEMA: 0.207
|
||||||
|
normalized_WH: 0.207
|
||||||
|
normalized_VBD: 0.186
|
||||||
|
normalized_DEMO: -0.180
|
||||||
|
normalized_PEAS: -0.164
|
||||||
|
normalized_SCONJ: 0.161
|
||||||
|
Top 10 PC1 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
23531 123.243897 22.112164 ... PHID-USER-arjqb24x4oae7awzpfp6 1424754141
|
||||||
|
707 123.226678 22.102265 ... PHID-USER-pun3sjvg3cemjzbgyo2t 1363132183
|
||||||
|
744 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353551242
|
||||||
|
749 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353384355
|
||||||
|
2243 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1356175107
|
||||||
|
5921 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353366778
|
||||||
|
5933 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353123761
|
||||||
|
5935 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1353386649
|
||||||
|
10080 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1366298361
|
||||||
|
10418 123.226678 22.102265 ... PHID-USER-fovtl67ew4l4cc3oeypc 1355363288
|
||||||
|
|
||||||
|
[10 rows x 34 columns]
|
||||||
|
|
||||||
|
Bottom 10 PC1 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
24812 -131.318535 438.637876 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
|
||||||
|
24813 -131.130989 438.728132 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
|
||||||
|
13983 -88.027511 274.892016 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
|
||||||
|
16510 -82.500013 294.909402 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
|
||||||
|
161 -68.446710 197.206426 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
|
||||||
|
24815 -60.440128 175.352637 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
|
||||||
|
6163 -59.523505 195.679514 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
|
||||||
|
22005 -59.492044 211.972278 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
|
||||||
|
24010 -53.793798 153.114760 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1428469742
|
||||||
|
24009 -53.614161 153.284397 ... PHID-USER-lhtlnmkdbzlz6pbxaqdd 1428538077
|
||||||
|
|
||||||
|
[10 rows x 34 columns]
|
||||||
|
Top 10 PC2 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
24813 -131.130989 438.728132 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441050
|
||||||
|
24812 -131.318535 438.637876 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463441072
|
||||||
|
16510 -82.500013 294.909402 ... PHID-USER-izojihzr4ja3jsgzn5wv 1354470131
|
||||||
|
13983 -88.027511 274.892016 ... PHID-USER-v7vgzvvcw7v2umf737ri 1380947348
|
||||||
|
22005 -59.492044 211.972278 ... PHID-USER-maceogqtxg4qfaefx7wd 1440633395
|
||||||
|
161 -68.446710 197.206426 ... PHID-USER-hyfm4swq76s4j642w46x 1374730027
|
||||||
|
6163 -59.523505 195.679514 ... PHID-USER-4bjsher5mqcoikeqnnec 1379611711
|
||||||
|
20858 -52.549327 192.146265 ... PHID-USER-22bsa5u75jz3ci3wnplu 1441031208
|
||||||
|
24815 -60.440128 175.352637 ... PHID-USER-fo56wm4wxiwpoofn2xdu 1463439992
|
||||||
|
18294 -43.267655 159.973982 ... PHID-USER-vk6mlmacfhx77egryy5i 1394419981
|
||||||
|
|
||||||
|
[10 rows x 34 columns]
|
||||||
|
|
||||||
|
Bottom 10 PC2 values:
|
||||||
|
PC1 PC2 ... AuthorPHID date_created
|
||||||
|
17259 -12.413915 -20.310670 ... PHID-USER-6vzzsmi22zem6yttr6vp 1321220595
|
||||||
|
22246 2.436022 -19.030642 ... PHID-USER-2nnm76h4ykalvvref2ye 1461480989
|
||||||
|
24780 -8.420485 -18.295879 ... PHID-USER-lsveyqlsb4acoowxr5yj 1420344576
|
||||||
|
7427 12.144652 -18.033451 ... PHID-USER-wz5bw3q6zykhqbbeohzq 1375791780
|
||||||
|
7055 -1.553566 -17.924389 ... PHID-USER-cfsvvgbtlqnbt2yokfjf 1377020909
|
||||||
|
23122 9.656987 -17.642747 ... PHID-USER-2nnm76h4ykalvvref2ye 1467721812
|
||||||
|
16776 6.551795 -17.537527 ... PHID-USER-6vzzsmi22zem6yttr6vp 1317838205
|
||||||
|
7471 -0.812161 -17.516875 ... PHID-USER-wkpnidxoctuhawexig5p 1386166246
|
||||||
|
13670 3.270330 -17.516754 ... PHID-USER-5dwuaigmkz2vzg65lape 1401902866
|
||||||
|
20682 3.694061 -17.391146 ... PHID-USER-uciss2jl2e4ifxqqk7wk 1440083315
|
||||||
|
|
||||||
|
[10 rows x 34 columns]
|
||||||
|
job finished, cleaning up
|
||||||
|
job pau at: Tue Oct 14 15:54:56 CDT 2025
|
||||||
BIN
p2/quest/101325_subcomment_pca.pkl
Normal file
BIN
p2/quest/101325_subcomment_pca.pkl
Normal file
Binary file not shown.
@ -2,7 +2,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM
|
|||||||
import torch
|
import torch
|
||||||
import csv
|
import csv
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
import nltk
|
import nltk
|
||||||
nltk.download('punkt_tab')
|
nltk.download('punkt_tab')
|
||||||
@ -18,7 +19,7 @@ print(torch.cuda.get_device_properties(0))
|
|||||||
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device)
|
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device)
|
||||||
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", padding_side='left')
|
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", padding_side='left')
|
||||||
|
|
||||||
priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the sentence category as your output. Do not provide any text beyond the category name."
|
priming = "You will be provided with a sentence from a software engineering task discussions. For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the sentence category as your output. Do not provide any text beyond the category name."
|
||||||
|
|
||||||
typology = """
|
typology = """
|
||||||
TYPOLOGY:
|
TYPOLOGY:
|
||||||
@ -59,6 +60,7 @@ TYPOLOGY:
|
|||||||
"""
|
"""
|
||||||
instructions="The sentence's category is: "
|
instructions="The sentence's category is: "
|
||||||
|
|
||||||
|
csv.field_size_limit(sys.maxsize)
|
||||||
with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
|
with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", mode='r', newline='') as file:
|
||||||
reader = csv.reader(file)
|
reader = csv.reader(file)
|
||||||
array_of_categorizations = []
|
array_of_categorizations = []
|
||||||
@ -107,7 +109,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_
|
|||||||
batch = comment_sentences[i:i+batch_size]
|
batch = comment_sentences[i:i+batch_size]
|
||||||
prompts = []
|
prompts = []
|
||||||
for sent in batch:
|
for sent in batch:
|
||||||
given_data = f"**GIVEN SENTENCE: \n ' Type -text_dict['task_title'] \n Text -{sent}**'\n"
|
given_data = f"**GIVEN SENTENCE: \n ' Task Title -text_dict['task_title'] \n Text -{sent}**'\n"
|
||||||
prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
|
prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
|
||||||
prompts.append(prompt)
|
prompts.append(prompt)
|
||||||
inputs = tokenizer(prompts, return_tensors='pt', return_token_type_ids=False, padding=True, truncation=True).to(device)
|
inputs = tokenizer(prompts, return_tensors='pt', return_token_type_ids=False, padding=True, truncation=True).to(device)
|
||||||
@ -127,7 +129,7 @@ with open("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_
|
|||||||
array_of_categorizations.append(text_dict)
|
array_of_categorizations.append(text_dict)
|
||||||
df = pd.DataFrame(array_of_categorizations)
|
df = pd.DataFrame(array_of_categorizations)
|
||||||
#print(df.head())
|
#print(df.head())
|
||||||
df.to_csv('all_101025_olmo_batched_categorized.csv', index=False)
|
df.to_csv('all_101325_olmo_batched_categorized.csv', index=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import pandas as pd
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import pickle
|
import pickle
|
||||||
|
import ast
|
||||||
|
|
||||||
# List of the 96 features that Neurobiber can predict
|
# List of the 96 features that Neurobiber can predict
|
||||||
BIBER_FEATURES = [
|
BIBER_FEATURES = [
|
||||||
@ -25,25 +26,72 @@ BIBER_FEATURES = [
|
|||||||
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
|
"BIN_DET","BIN_EMOJ","BIN_EMOT","BIN_EXCL","BIN_HASH","BIN_INF",
|
||||||
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
|
"BIN_UH","BIN_NUM","BIN_LAUGH","BIN_PRP","BIN_PREP","BIN_NNP",
|
||||||
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
|
"BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
|
||||||
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
|
"BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X",
|
||||||
|
"sentence_count", "median_sentence_length"
|
||||||
|
]
|
||||||
|
|
||||||
|
selected_cols = [
|
||||||
|
"normalized_QUAN","normalized_QUPR","normalized_AMP","normalized_PASS","normalized_XX0","normalized_JJ",
|
||||||
|
"normalized_BEMA","normalized_CAUS","normalized_CONC","normalized_COND","normalized_CONJ","normalized_CONT",
|
||||||
|
"normalized_DPAR","normalized_DWNT","normalized_EX","normalized_FPP1","normalized_GER","normalized_RB",
|
||||||
|
"normalized_PIN","normalized_INPR","normalized_TO","normalized_NEMD","normalized_OSUB","normalized_PASTP",
|
||||||
|
"normalized_VBD","normalized_PHC","normalized_PIRE","normalized_PLACE","normalized_POMD","normalized_PRMD",
|
||||||
|
"normalized_WZPRES","normalized_VPRT","normalized_PRIV","normalized_PIT","normalized_PUBV","normalized_SPP2",
|
||||||
|
"normalized_SMP","normalized_SERE","normalized_STPR","normalized_SUAV","normalized_SYNE","normalized_TPP3",
|
||||||
|
"normalized_TIME","normalized_NOMZ","normalized_BYPA","normalized_PRED","normalized_TOBJ","normalized_TSUB",
|
||||||
|
"normalized_THVC","normalized_NN","normalized_DEMP","normalized_DEMO","normalized_WHQU","normalized_EMPH",
|
||||||
|
"normalized_HDG","normalized_WZPAST","normalized_THAC","normalized_PEAS","normalized_ANDC","normalized_PRESP",
|
||||||
|
"normalized_PROD","normalized_SPAU","normalized_SPIN","normalized_THATD","normalized_WHOBJ","normalized_WHSUB",
|
||||||
|
"normalized_WHCL","normalized_ART","normalized_AUXB","normalized_CAP","normalized_SCONJ","normalized_CCONJ",
|
||||||
|
"normalized_DET","normalized_EMOJ","normalized_EMOT","normalized_EXCL","normalized_HASH","normalized_INF",
|
||||||
|
"normalized_UH","normalized_NUM","normalized_LAUGH","normalized_PRP","normalized_PREP","normalized_NNP",
|
||||||
|
"normalized_QUES","normalized_QUOT","normalized_AT","normalized_SBJP","normalized_URL","normalized_WH",
|
||||||
|
"normalized_INDA","normalized_ACCU","normalized_PGAS","normalized_CMADJ","normalized_SPADJ","normalized_X",
|
||||||
|
"normalized_AWL", "normalized_TTR","sentence_count", "median_sentence_length"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_parse(x):
    """Coerce one stored sentence-list cell into a real Python list.

    Cells may hold an actual list, the string repr of a list (parsed with
    ``ast.literal_eval``), or something else entirely such as NaN.

    Args:
        x: The raw cell value.

    Returns:
        The list itself when *x* is already a list; the parsed list when *x*
        is a string literal of a list (a tuple literal is converted to a
        list); otherwise an empty list. This includes NaN/None, unparseable
        strings, and strings that parse to a non-sequence such as a number
        or a bare string.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        # NaN, None, numbers, ...: nothing usable, treat as "no sentences".
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions literal_eval raises on malformed input;
        # parsing stays best-effort, so fall back to an empty list.
        return []
    # A literal such as "5" or "'text'" parses fine but is not a sentence
    # list; callers take len() of the result and iterate it as strings.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []
|
||||||
|
|
||||||
def format_df_data(df):
    """Build the feature matrix that is handed to PCA.

    The returned array has one row per row of *df* and these columns, in
    order:

    1. every column of *df* whose name starts with ``normalized_``, in the
       frame's own column order, cast to float;
    2. the number of sentences in ``df['olmo_cleaned_sentences']``;
    3. the median whitespace-token count of those sentences (0 when the row
       has no sentences).

    The sentence column is parsed with ``safe_parse`` into a local Series;
    *df* itself is left unmodified, so passing a filtered slice of a larger
    frame neither mutates it nor triggers pandas' chained-assignment warning.

    Args:
        df: Frame with ``normalized_*`` feature columns and an
            ``olmo_cleaned_sentences`` column.

    Returns:
        A 2-D NumPy array of shape ``(len(df), n_normalized + 2)``.
    """
    # this accounts for the somewhat idiosyncratic way that I saved my data
    normalized_cols = [col for col in df.columns if col.startswith('normalized_')]
    # NOTE(review): feature order follows df.columns, not any hand-written
    # name list -- confirm that whatever labels the PCA loadings uses the
    # same order.
    x = df[normalized_cols].astype(float).values

    # 101325 additions to account for length
    sentences = df['olmo_cleaned_sentences'].apply(safe_parse)
    print(sentences)
    sentence_count = sentences.apply(len).values.reshape(-1, 1)

    def _median_words(sents):
        # Median number of whitespace-separated tokens per sentence.
        if not sents:
            return 0
        return np.median([len(sent.split()) for sent in sents])

    median_sentence_length = sentences.apply(_median_words).values.reshape(-1, 1)
    print(median_sentence_length)

    x = np.hstack([x, sentence_count, median_sentence_length])
    return x
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/092925_unified_phab.csv", low_memory=False)
|
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/analysis_data/100325_unified_phab.csv", low_memory=False)
|
||||||
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
|
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] != 'task_description']
|
||||||
#biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
|
#biber_vec_df = biber_vec_df[biber_vec_df['AuthorPHID'] != "PHID-USER-idceizaw6elwiwm5xshb"]
|
||||||
#biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
|
#biber_vec_df = biber_vec_df[biber_vec_df['comment_text'] != 'nan']
|
||||||
biber_vecs = format_df_data(biber_vec_df)
|
biber_vecs = format_df_data(biber_vec_df)
|
||||||
#handoff to PCA model
|
#handoff to PCA model
|
||||||
|
|
||||||
pca_trial = PCA()
|
pca_trial = PCA()
|
||||||
biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)
|
biber_vecs_pca_trial = pca_trial.fit_transform(biber_vecs)
|
||||||
|
|
||||||
@ -55,9 +103,9 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
pca = PCA(n_components=argmax_components)
|
pca = PCA(n_components=argmax_components)
|
||||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||||
with open('100125_subcomment_pca.pkl', 'wb') as f:
|
with open('101325_subcomment_pca.pkl', 'wb') as f:
|
||||||
pickle.dump(pca, f)
|
pickle.dump(pca, f)
|
||||||
selected_axis = "closed_relevance"
|
selected_axis = "AuthorWMFAffil"
|
||||||
|
|
||||||
component_variances = np.var(biber_vecs_pca, axis=0)
|
component_variances = np.var(biber_vecs_pca, axis=0)
|
||||||
print("Variance of each PCA component:", component_variances)
|
print("Variance of each PCA component:", component_variances)
|
||||||
@ -66,28 +114,28 @@ if __name__ == "__main__":
|
|||||||
print(f"PC{i+1}:")
|
print(f"PC{i+1}:")
|
||||||
indices = np.argsort(np.abs(component))[::-1]
|
indices = np.argsort(np.abs(component))[::-1]
|
||||||
for idx in indices[:10]: # Top 10
|
for idx in indices[:10]: # Top 10
|
||||||
print(f" {BIBER_FEATURES[idx]}: {component[idx]:.3f}")
|
print(f"{selected_cols[idx]}: {component[idx]:.3f}")
|
||||||
|
|
||||||
#first looking at comment_type
|
#first looking at comment_type
|
||||||
le = LabelEncoder()
|
#le = LabelEncoder()
|
||||||
colors = le.fit_transform(biber_vec_df[selected_axis])
|
#colors = le.fit_transform(biber_vec_df[selected_axis])
|
||||||
|
|
||||||
pc_dict = {f"PC{i+1}": biber_vecs_pca[:, i] for i in range(18)}
|
pc_dict = {f"PC{i+1}": biber_vecs_pca[:, i] for i in range(argmax_components)}
|
||||||
pc_dict[selected_axis] = biber_vec_df[selected_axis].astype(str)
|
#pc_dict[selected_axis] = biber_vec_df[selected_axis].astype(str)
|
||||||
pc_dict["source"] = biber_vec_df['source'].astype(str)
|
pc_dict["source"] = biber_vec_df['source'].astype(str)
|
||||||
pc_dict["phase"] = biber_vec_df['phase'].astype(str)
|
pc_dict["phase"] = biber_vec_df['phase'].astype(str)
|
||||||
pc_dict["text"] = biber_vec_df['comment_text'].astype(str)
|
pc_dict["text"] = biber_vec_df['comment_text'].astype(str)
|
||||||
pc_dict['id'] = biber_vec_df['id']
|
pc_dict['id'] = biber_vec_df['id']
|
||||||
pc_dict['week_index'] = biber_vec_df['week_index']
|
pc_dict['week_index'] = biber_vec_df['week_index']
|
||||||
pc_dict['priority'] = biber_vec_df['priority']
|
pc_dict['priority'] = biber_vec_df['priority']
|
||||||
pc_dict['closed_relevance'] = biber_vec_df['closed_relevance']
|
pc_dict['resolution_outcome'] = biber_vec_df['resolution_outcome']
|
||||||
pc_dict['TaskPHID'] = biber_vec_df['TaskPHID']
|
pc_dict['TaskPHID'] = biber_vec_df['TaskPHID']
|
||||||
pc_dict['AuthorPHID'] = biber_vec_df['AuthorPHID']
|
pc_dict['AuthorPHID'] = biber_vec_df['AuthorPHID']
|
||||||
pc_dict['date_created'] = biber_vec_df['date_created']
|
pc_dict['date_created'] = biber_vec_df['date_created']
|
||||||
|
|
||||||
|
|
||||||
plot_df = pd.DataFrame(pc_dict)
|
plot_df = pd.DataFrame(pc_dict)
|
||||||
plot_df.to_csv("100125_subcomment_PCA_df.csv", index=False)
|
plot_df.to_csv("101325_subcomment_PCA_df.csv", index=False)
|
||||||
|
|
||||||
print("Top 10 PC1 values:")
|
print("Top 10 PC1 values:")
|
||||||
print(plot_df.nlargest(10, "PC1"))
|
print(plot_df.nlargest(10, "PC1"))
|
||||||
@ -109,20 +157,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
#plt.savefig("090225_biber_pca_plot.png", dpi=300)
|
#plt.savefig("090225_biber_pca_plot.png", dpi=300)
|
||||||
'''
|
'''
|
||||||
plot_df = pd.DataFrame({
|
|
||||||
"PC1": biber_vecs_pca[:, 0],
|
|
||||||
"PC2": biber_vecs_pca[:, 1],
|
|
||||||
selected_axis: biber_vec_df[selected_axis].astype(str)
|
|
||||||
})
|
|
||||||
plt.figure(figsize=(8,6))
|
|
||||||
sns.scatterplot(
|
|
||||||
data=plot_df, x="PC1", y="PC2", hue="source",
|
|
||||||
palette="tab10", s=40, alpha=0.7, edgecolor=None
|
|
||||||
)
|
|
||||||
plt.xlabel('component 1')
|
|
||||||
plt.ylabel('component 2')
|
|
||||||
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
|
||||||
'''
|
|
||||||
#g.fig.tight_layout()
|
#g.fig.tight_layout()
|
||||||
#g.savefig(f"subcomment_{selected_axis}_100125_biber_pca_final.png", dpi=300)
|
#g.savefig(f"subcomment_{selected_axis}_100125_biber_pca_final.png", dpi=300)
|
||||||
#plt.show()
|
#plt.show()
|
||||||
|
'''
|
||||||
|
|||||||
@ -9,7 +9,7 @@
|
|||||||
#SBATCH --mem=64G
|
#SBATCH --mem=64G
|
||||||
#SBATCH --cpus-per-task=4
|
#SBATCH --cpus-per-task=4
|
||||||
#SBATCH --job-name=batched-MW-info-typology
|
#SBATCH --job-name=batched-MW-info-typology
|
||||||
#SBATCH --output=101025-batched-mw-olmo-info-cat.log
|
#SBATCH --output=101325-batched-mw-olmo-info-cat.log
|
||||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@
|
|||||||
#SBATCH --mem=64G
|
#SBATCH --mem=64G
|
||||||
#SBATCH --cpus-per-task=4
|
#SBATCH --cpus-per-task=4
|
||||||
#SBATCH --job-name=neurobiber-pca
|
#SBATCH --job-name=neurobiber-pca
|
||||||
#SBATCH --output=100125_subcomment_neurobiber-pca.log
|
#SBATCH --output=101325_subcomment_neurobiber-pca.log
|
||||||
#SBATCH --mail-type=BEGIN,END,FAIL
|
#SBATCH --mail-type=BEGIN,END,FAIL
|
||||||
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
#SBATCH --mail-user=gaughan@u.northwestern.edu
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user