backing up the morning' before taking a few meetings
This commit is contained in:
parent
6a5f07872d
commit
5d4df28f94
@ -1,8 +1,9 @@
|
|||||||
starting the job at: Thu Sep 4 10:23:23 CDT 2025
|
starting the job at: Thu Sep 4 11:02:03 CDT 2025
|
||||||
setting up the environment
|
setting up the environment
|
||||||
running the neurobiber labeling script
|
running the neurobiber labeling script
|
||||||
Variance of each PCA component: [88.92832185 39.46471687 32.34601523 20.19544345 14.0083261 11.5837521
|
Variance of each PCA component: [259.38215213 83.11803664 67.16301107 61.78747188 38.94875996
|
||||||
7.82584723 6.89064989 6.07988254 5.80726367 5.49782354 4.50587747
|
32.78688889 26.45592105 21.9280629 18.734197 16.29485568
|
||||||
4.31482409 2.81997326 2.62989708 2.27205352 2.09396341 2.00076119]
|
13.48304855 11.50594609 10.77855857 9.30674176 8.96113511
|
||||||
|
8.35521401 8.17815209 7.13194427]
|
||||||
job finished, cleaning up
|
job finished, cleaning up
|
||||||
job pau at: Thu Sep 4 10:23:47 CDT 2025
|
job pau at: Thu Sep 4 11:02:32 CDT 2025
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
setting up the environment by loading in conda environment at Thu Sep 4 10:04:55 CDT 2025
|
setting up the environment by loading in conda environment at Thu Sep 4 11:14:26 CDT 2025
|
||||||
running the bertopic job at Thu Sep 4 10:04:55 CDT 2025
|
running the olmo labeling job at Thu Sep 4 11:14:26 CDT 2025
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
srun job start: Thu Sep 4 10:04:55 CDT 2025
|
srun job start: Thu Sep 4 11:14:27 CDT 2025
|
||||||
Job ID: 3272179
|
Job ID: 3273582
|
||||||
Username: nws8519
|
Username: nws8519
|
||||||
Queue: gengpu
|
Queue: gengpu
|
||||||
Account: p32852
|
Account: p32852
|
||||||
@ -14,328 +14,60 @@ prologue and the job run script
|
|||||||
PATH (in prologue) : /home/nws8519/.conda/envs/olmo/bin:/software/miniconda3/4.12.0/condabin:/home/nws8519/.local/bin:/home/nws8519/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/lpp/mmfs/bin:/hpc/usertools
|
PATH (in prologue) : /home/nws8519/.conda/envs/olmo/bin:/software/miniconda3/4.12.0/condabin:/home/nws8519/.local/bin:/home/nws8519/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/lpp/mmfs/bin:/hpc/usertools
|
||||||
WORKDIR is: /home/nws8519
|
WORKDIR is: /home/nws8519
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
W0904 10:05:10.900000 1845275 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
||||||
W0904 10:05:10.900000 1845275 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.900000 1845275 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
||||||
W0904 10:05:10.900000 1845275 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.900000 1845276 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
||||||
W0904 10:05:10.900000 1845276 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.900000 1845276 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
||||||
W0904 10:05:10.900000 1845276 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.906000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
||||||
W0904 10:05:10.906000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.906000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
||||||
W0904 10:05:10.906000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.907000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
|
||||||
W0904 10:05:10.907000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
W0904 10:05:10.907000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
||||||
W0904 10:05:10.907000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:117: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
[nltk_data] Downloading package punkt to /home/nws8519/nltk_data...
|
||||||
|
[nltk_data] Downloading package punkt to /home/nws8519/nltk_data...
|
||||||
|
[nltk_data] Downloading package punkt to /home/nws8519/nltk_data...
|
||||||
|
[nltk_data] Downloading package punkt to /home/nws8519/nltk_data...
|
||||||
|
[nltk_data] Package punkt is already up-to-date![nltk_data] Package punkt is already up-to-date!
|
||||||
|
|
||||||
|
[nltk_data] Package punkt is already up-to-date![nltk_data] Package punkt is already up-to-date!
|
||||||
|
|
||||||
|
[nltk_data] Downloading package punkt_tab to
|
||||||
|
[nltk_data] /home/nws8519/nltk_data...[nltk_data] Downloading package punkt_tab to
|
||||||
|
[nltk_data] /home/nws8519/nltk_data...
|
||||||
|
|
||||||
|
[nltk_data] Downloading package punkt_tab to
|
||||||
|
[nltk_data] /home/nws8519/nltk_data...[nltk_data] Downloading package punkt_tab to
|
||||||
|
[nltk_data] /home/nws8519/nltk_data...
|
||||||
|
|
||||||
|
[nltk_data] Package punkt_tab is already up-to-date![nltk_data] Package punkt_tab is already up-to-date!
|
||||||
|
|
||||||
|
[nltk_data] Package punkt_tab is already up-to-date![nltk_data] Package punkt_tab is already up-to-date!
|
||||||
|
|
||||||
|
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
||||||
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:117: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
||||||
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
||||||
[rank0]: Traceback (most recent call last):
|
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
||||||
[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 178, in <module>
|
|
||||||
[rank0]: main()
|
|
||||||
[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 122, in main
|
|
||||||
[rank0]: dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 76, in __init__
|
|
||||||
[rank0]: sentences = split_to_sentences(cleaned_comment)
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 106, in split_to_sentences
|
|
||||||
[rank0]: return nltk.sent_tokenize(text)
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
|
|
||||||
[rank0]: tokenizer = _get_punkt_tokenizer(language)
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
|
|
||||||
[rank0]: return PunktTokenizer(language)
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1744, in __init__
|
|
||||||
[rank0]: self.load_lang(lang)
|
|
||||||
[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1749, in load_lang
|
|
||||||
[rank0]: lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
|
|
||||||
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/data.py", line 579, in find
|
|
||||||
[rank0]: raise LookupError(resource_not_found)
|
|
||||||
[rank0]: LookupError:
|
|
||||||
[rank0]: **********************************************************************
|
|
||||||
[rank0]: Resource [93mpunkt_tab[0m not found.
|
|
||||||
[rank0]: Please use the NLTK Downloader to obtain the resource:
|
|
||||||
|
|
||||||
[rank0]: [31m>>> import nltk
|
|
||||||
[rank0]: >>> nltk.download('punkt_tab')
|
|
||||||
[rank0]: [0m
|
|
||||||
[rank0]: For more information see: https://www.nltk.org/data.html
|
|
||||||
|
|
||||||
[rank0]: Attempted to load [93mtokenizers/punkt_tab/english/[0m
|
|
||||||
|
|
||||||
[rank0]: Searched in:
|
|
||||||
[rank0]: - '/home/nws8519/nltk_data'
|
|
||||||
[rank0]: - '/home/nws8519/.conda/envs/olmo/nltk_data'
|
|
||||||
[rank0]: - '/home/nws8519/.conda/envs/olmo/share/nltk_data'
|
|
||||||
[rank0]: - '/home/nws8519/.conda/envs/olmo/lib/nltk_data'
|
|
||||||
[rank0]: - '/usr/share/nltk_data'
|
|
||||||
[rank0]: - '/usr/local/share/nltk_data'
|
|
||||||
[rank0]: - '/usr/lib/nltk_data'
|
|
||||||
[rank0]: - '/usr/local/lib/nltk_data'
|
|
||||||
[rank0]: **********************************************************************
|
|
||||||
|
|
||||||
[rank2]: Traceback (most recent call last):
|
|
||||||
[rank2]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 178, in <module>
|
|
||||||
[rank2]: main()
|
|
||||||
[rank2]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 122, in main
|
|
||||||
[rank2]: dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 76, in __init__
|
|
||||||
[rank2]: sentences = split_to_sentences(cleaned_comment)
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 106, in split_to_sentences
|
|
||||||
[rank2]: return nltk.sent_tokenize(text)
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
|
|
||||||
[rank2]: tokenizer = _get_punkt_tokenizer(language)
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
|
|
||||||
[rank2]: return PunktTokenizer(language)
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1744, in __init__
|
|
||||||
[rank2]: self.load_lang(lang)
|
|
||||||
[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1749, in load_lang
|
|
||||||
[rank2]: lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
|
|
||||||
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/data.py", line 579, in find
|
|
||||||
[rank2]: raise LookupError(resource_not_found)
|
|
||||||
[rank2]: LookupError:
|
|
||||||
[rank2]: **********************************************************************
|
|
||||||
[rank2]: Resource [93mpunkt_tab[0m not found.
|
|
||||||
[rank2]: Please use the NLTK Downloader to obtain the resource:
|
|
||||||
|
|
||||||
[rank2]: [31m>>> import nltk
|
|
||||||
[rank2]: >>> nltk.download('punkt_tab')
|
|
||||||
[rank2]: [0m
|
|
||||||
[rank2]: For more information see: https://www.nltk.org/data.html
|
|
||||||
|
|
||||||
[rank2]: Attempted to load [93mtokenizers/punkt_tab/english/[0m
|
|
||||||
|
|
||||||
[rank2]: Searched in:
|
|
||||||
[rank2]: - '/home/nws8519/nltk_data'
|
|
||||||
[rank2]: - '/home/nws8519/.conda/envs/olmo/nltk_data'
|
|
||||||
[rank2]: - '/home/nws8519/.conda/envs/olmo/share/nltk_data'
|
|
||||||
[rank2]: - '/home/nws8519/.conda/envs/olmo/lib/nltk_data'
|
|
||||||
[rank2]: - '/usr/share/nltk_data'
|
|
||||||
[rank2]: - '/usr/local/share/nltk_data'
|
|
||||||
[rank2]: - '/usr/lib/nltk_data'
|
|
||||||
[rank2]: - '/usr/local/lib/nltk_data'
|
|
||||||
[rank2]: **********************************************************************
|
|
||||||
|
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:117: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
|
||||||
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:117: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
[rank3]:[W904 11:15:22.374478896 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 3] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device.
|
||||||
|
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
|
||||||
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
||||||
[rank1]: Traceback (most recent call last):
|
[rank1]:[W904 11:15:22.049509730 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device.
|
||||||
[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 178, in <module>
|
[rank2]:[W904 11:15:22.461549051 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device.
|
||||||
[rank1]: main()
|
Fetching 12 files: 0%| | 0/12 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 122, in main
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 76, in __init__
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: sentences = split_to_sentences(cleaned_comment)
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 106, in split_to_sentences
|
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: return nltk.sent_tokenize(text)
|
Fetching 12 files: 8%|▊ | 1/12 [03:13<35:25, 193.27s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
Fetching 12 files: 17%|█▋ | 2/12 [04:23<20:11, 121.10s/it]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
|
||||||
[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
|
|
||||||
[rank1]: tokenizer = _get_punkt_tokenizer(language)
|
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
|
|
||||||
[rank1]: return PunktTokenizer(language)
|
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1744, in __init__
|
|
||||||
[rank1]: self.load_lang(lang)
|
|
||||||
[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1749, in load_lang
|
|
||||||
[rank1]: lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
|
|
||||||
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/data.py", line 579, in find
|
|
||||||
[rank1]: raise LookupError(resource_not_found)
|
|
||||||
[rank1]: LookupError:
|
|
||||||
[rank1]: **********************************************************************
|
|
||||||
[rank1]: Resource [93mpunkt_tab[0m not found.
|
|
||||||
[rank1]: Please use the NLTK Downloader to obtain the resource:
|
|
||||||
|
|
||||||
[rank1]: [31m>>> import nltk
|
|
||||||
[rank1]: >>> nltk.download('punkt_tab')
|
|
||||||
[rank1]: [0m
|
|
||||||
[rank1]: For more information see: https://www.nltk.org/data.html
|
|
||||||
|
|
||||||
[rank1]: Attempted to load [93mtokenizers/punkt_tab/english/[0m
|
|
||||||
|
|
||||||
[rank1]: Searched in:
|
|
||||||
[rank1]: - '/home/nws8519/nltk_data'
|
|
||||||
[rank1]: - '/home/nws8519/.conda/envs/olmo/nltk_data'
|
|
||||||
[rank1]: - '/home/nws8519/.conda/envs/olmo/share/nltk_data'
|
|
||||||
[rank1]: - '/home/nws8519/.conda/envs/olmo/lib/nltk_data'
|
|
||||||
[rank1]: - '/usr/share/nltk_data'
|
|
||||||
[rank1]: - '/usr/local/share/nltk_data'
|
|
||||||
[rank1]: - '/usr/lib/nltk_data'
|
|
||||||
[rank1]: - '/usr/local/lib/nltk_data'
|
|
||||||
[rank1]: **********************************************************************
|
|
||||||
|
|
||||||
[rank3]: Traceback (most recent call last):
|
|
||||||
[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 178, in <module>
|
|
||||||
[rank3]: main()
|
|
||||||
[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 122, in main
|
|
||||||
[rank3]: dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 76, in __init__
|
|
||||||
[rank3]: sentences = split_to_sentences(cleaned_comment)
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 106, in split_to_sentences
|
|
||||||
[rank3]: return nltk.sent_tokenize(text)
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
|
|
||||||
[rank3]: tokenizer = _get_punkt_tokenizer(language)
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
|
|
||||||
[rank3]: return PunktTokenizer(language)
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1744, in __init__
|
|
||||||
[rank3]: self.load_lang(lang)
|
|
||||||
[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/tokenize/punkt.py", line 1749, in load_lang
|
|
||||||
[rank3]: lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
|
|
||||||
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/nltk/data.py", line 579, in find
|
|
||||||
[rank3]: raise LookupError(resource_not_found)
|
|
||||||
[rank3]: LookupError:
|
|
||||||
[rank3]: **********************************************************************
|
|
||||||
[rank3]: Resource [93mpunkt_tab[0m not found.
|
|
||||||
[rank3]: Please use the NLTK Downloader to obtain the resource:
|
|
||||||
|
|
||||||
[rank3]: [31m>>> import nltk
|
|
||||||
[rank3]: >>> nltk.download('punkt_tab')
|
|
||||||
[rank3]: [0m
|
|
||||||
[rank3]: For more information see: https://www.nltk.org/data.html
|
|
||||||
|
|
||||||
[rank3]: Attempted to load [93mtokenizers/punkt_tab/english/[0m
|
|
||||||
|
|
||||||
[rank3]: Searched in:
|
|
||||||
[rank3]: - '/home/nws8519/nltk_data'
|
|
||||||
[rank3]: - '/home/nws8519/.conda/envs/olmo/nltk_data'
|
|
||||||
[rank3]: - '/home/nws8519/.conda/envs/olmo/share/nltk_data'
|
|
||||||
[rank3]: - '/home/nws8519/.conda/envs/olmo/lib/nltk_data'
|
|
||||||
[rank3]: - '/usr/share/nltk_data'
|
|
||||||
[rank3]: - '/usr/local/share/nltk_data'
|
|
||||||
[rank3]: - '/usr/lib/nltk_data'
|
|
||||||
[rank3]: - '/usr/local/lib/nltk_data'
|
|
||||||
[rank3]: **********************************************************************
|
|
||||||
|
|
||||||
[rank2]:[W904 10:05:56.100290280 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
|
||||||
[rank0]:[W904 10:05:56.107999460 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
|
|
||||||
W0904 10:05:57.705000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1400332 closing signal SIGTERM
|
|
||||||
W0904 10:05:57.720000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1400334 closing signal SIGTERM
|
|
||||||
E0904 10:05:57.770000 1400307 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1400331) of binary: /home/nws8519/.conda/envs/olmo/bin/python3.11
|
|
||||||
Traceback (most recent call last):
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in <module>
|
|
||||||
sys.exit(main())
|
|
||||||
^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
|
||||||
return f(*args, **kwargs)
|
|
||||||
^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
|
|
||||||
run(args)
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
|
|
||||||
elastic_launch(
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
|
||||||
return launch_agent(self._config, self._entrypoint, list(args))
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
|
||||||
raise ChildFailedError(
|
|
||||||
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
|
||||||
============================================================
|
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py FAILED
|
|
||||||
------------------------------------------------------------
|
|
||||||
Failures:
|
|
||||||
<NO_OTHER_FAILURES>
|
|
||||||
------------------------------------------------------------
|
|
||||||
Root Cause (first observed failure):
|
|
||||||
[0]:
|
|
||||||
time : 2025-09-04_10:05:57
|
|
||||||
host : qgpu0203
|
|
||||||
rank : 0 (local_rank: 0)
|
|
||||||
exitcode : 1 (pid: 1400331)
|
|
||||||
error_file: <N/A>
|
|
||||||
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
|
||||||
============================================================
|
|
||||||
E0904 10:05:57.885000 1400308 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 1400333) of binary: /home/nws8519/.conda/envs/olmo/bin/python3.11
|
|
||||||
Traceback (most recent call last):
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in <module>
|
|
||||||
sys.exit(main())
|
|
||||||
^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
|
||||||
return f(*args, **kwargs)
|
|
||||||
^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
|
|
||||||
run(args)
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
|
|
||||||
elastic_launch(
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
|
||||||
return launch_agent(self._config, self._entrypoint, list(args))
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
|
|
||||||
raise ChildFailedError(
|
|
||||||
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
|
|
||||||
============================================================
|
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py FAILED
|
|
||||||
------------------------------------------------------------
|
|
||||||
Failures:
|
|
||||||
<NO_OTHER_FAILURES>
|
|
||||||
------------------------------------------------------------
|
|
||||||
Root Cause (first observed failure):
|
|
||||||
[0]:
|
|
||||||
time : 2025-09-04_10:05:57
|
|
||||||
host : qgpu0203
|
|
||||||
rank : 2 (local_rank: 0)
|
|
||||||
exitcode : 1 (pid: 1400333)
|
|
||||||
error_file: <N/A>
|
|
||||||
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
|
|
||||||
============================================================
|
|
||||||
Traceback (most recent call last):
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in <module>
|
|
||||||
sys.exit(main())
|
|
||||||
^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
|
||||||
return f(*args, **kwargs)
|
|
||||||
^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
|
|
||||||
run(args)
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
|
|
||||||
elastic_launch(
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
|
||||||
return launch_agent(self._config, self._entrypoint, list(args))
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 265, in launch_agent
|
|
||||||
if result.is_failed():
|
|
||||||
^^^^^^^^^^^^^^^^
|
|
||||||
AttributeError: 'NoneType' object has no attribute 'is_failed'
|
|
||||||
Traceback (most recent call last):
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in <module>
|
|
||||||
sys.exit(main())
|
|
||||||
^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
|
||||||
return f(*args, **kwargs)
|
|
||||||
^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main
|
|
||||||
run(args)
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run
|
|
||||||
elastic_launch(
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__
|
|
||||||
return launch_agent(self._config, self._entrypoint, list(args))
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 265, in launch_agent
|
|
||||||
if result.is_failed():
|
|
||||||
^^^^^^^^^^^^^^^^
|
|
||||||
AttributeError: 'NoneType' object has no attribute 'is_failed'
|
|
||||||
srun: error: qgpu0203: tasks 2-3: Exited with exit code 1
|
|
||||||
srun: error: qgpu0202: tasks 0-1: Exited with exit code 1
|
|
||||||
unsupervised olmo categorization pau at Thu Sep 4 10:05:58 CDT 2025
|
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 797 KiB |
@ -17,7 +17,7 @@ def format_df_data(df):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", low_memory=False)
|
biber_vec_df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", low_memory=False)
|
||||||
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_description']
|
biber_vec_df = biber_vec_df[biber_vec_df['comment_type'] == 'task_subcomment']
|
||||||
biber_vecs = format_df_data(biber_vec_df)
|
biber_vecs = format_df_data(biber_vec_df)
|
||||||
#handoff to PCA model
|
#handoff to PCA model
|
||||||
'''
|
'''
|
||||||
@ -32,7 +32,7 @@ if __name__ == "__main__":
|
|||||||
'''
|
'''
|
||||||
pca = PCA(n_components=18)
|
pca = PCA(n_components=18)
|
||||||
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
biber_vecs_pca = pca.fit_transform(biber_vecs)
|
||||||
selected_axis = "phase"
|
selected_axis = "AuthorWMFAffil"
|
||||||
|
|
||||||
component_variances = np.var(biber_vecs_pca, axis=0)
|
component_variances = np.var(biber_vecs_pca, axis=0)
|
||||||
print("Variance of each PCA component:", component_variances)
|
print("Variance of each PCA component:", component_variances)
|
||||||
@ -45,11 +45,12 @@ if __name__ == "__main__":
|
|||||||
"PC1": biber_vecs_pca[:, 0],
|
"PC1": biber_vecs_pca[:, 0],
|
||||||
"PC2": biber_vecs_pca[:, 1],
|
"PC2": biber_vecs_pca[:, 1],
|
||||||
selected_axis: biber_vec_df[selected_axis].astype(str),
|
selected_axis: biber_vec_df[selected_axis].astype(str),
|
||||||
"source":biber_vec_df['source'].astype(str)
|
"source":biber_vec_df['source'].astype(str),
|
||||||
|
"phase":biber_vec_df['phase'].astype(str)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
g = sns.FacetGrid(plot_df, col="source", col_wrap=4, hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
|
g = sns.FacetGrid(plot_df, col="source", row="phase", hue=selected_axis, palette="tab10", height=4, sharex=False, sharey=False)
|
||||||
g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
|
g.map_dataframe(sns.scatterplot, x="PC1", y="PC2", alpha=0.7, s=40)
|
||||||
g.add_legend(title=selected_axis)
|
g.add_legend(title=selected_axis)
|
||||||
g.set_axis_labels("PC1", "PC2")
|
g.set_axis_labels("PC1", "PC2")
|
||||||
@ -73,5 +74,5 @@ if __name__ == "__main__":
|
|||||||
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
plt.legend(title=selected_axis, bbox_to_anchor=(1.05, 1), loc=2)
|
||||||
'''
|
'''
|
||||||
g.fig.tight_layout()
|
g.fig.tight_layout()
|
||||||
g.savefig(f"{selected_axis}_090425_biber_kernelpca_affil.png", dpi=300)
|
g.savefig(f"subcomment_{selected_axis}_090425_biber_pca.png", dpi=300)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
@ -17,6 +17,7 @@ import re
|
|||||||
|
|
||||||
import nltk
|
import nltk
|
||||||
nltk.download('punkt')
|
nltk.download('punkt')
|
||||||
|
nltk.download('punkt_tab')
|
||||||
# ----------------- prompts for LLM
|
# ----------------- prompts for LLM
|
||||||
priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference. Your task is to match the**GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
|
priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference. Your task is to match the**GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
|
||||||
|
|
||||||
@ -77,7 +78,7 @@ class SentenceDataset(Dataset):
|
|||||||
cleaned_comment = preprocess_comment(comment)
|
cleaned_comment = preprocess_comment(comment)
|
||||||
sentences = split_to_sentences(cleaned_comment)
|
sentences = split_to_sentences(cleaned_comment)
|
||||||
for sentence in sentences:
|
for sentence in sentences:
|
||||||
given_data = f"**GIVEN SENTENCE: \n ' Type -{comment_type} \n Text -{sentence}**'\n"
|
given_data = f"**GIVEN SENTENCE: \n ' Type -{comment_types[idx]} \n Text -{sentence}**'\n"
|
||||||
prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
|
prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}"
|
||||||
self.samples.append((idx, sentence, prompt))
|
self.samples.append((idx, sentence, prompt))
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
@ -118,7 +119,7 @@ def main():
|
|||||||
#load in data
|
#load in data
|
||||||
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv")
|
||||||
# TODO comment out below
|
# TODO comment out below
|
||||||
df = df.iloc[:5].copy()
|
df = df.iloc[:50].copy()
|
||||||
comment_texts = df['comment_text'].tolist()
|
comment_texts = df['comment_text'].tolist()
|
||||||
comment_types = df['comment_type'].tolist()
|
comment_types = df['comment_type'].tolist()
|
||||||
dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions)
|
||||||
@ -130,8 +131,15 @@ def main():
|
|||||||
|
|
||||||
#load model and wrap in DDP
|
#load model and wrap in DDP
|
||||||
cache_directory="/projects/p32852/cache/"
|
cache_directory="/projects/p32852/cache/"
|
||||||
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device)
|
if dist.get_rank() == 0:
|
||||||
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory)
|
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory)
|
||||||
|
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device)
|
||||||
|
dist.barrier()
|
||||||
|
if dist.get_rank() != 0:
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory, local_files_only=True)
|
||||||
|
olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory, local_files_only=True).to(device)
|
||||||
|
#olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device)
|
||||||
|
#tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory)
|
||||||
ddp_olmo = DDP(olmo, device_ids=[local_rank])
|
ddp_olmo = DDP(olmo, device_ids=[local_rank])
|
||||||
|
|
||||||
#prepare to collect results as dictionary
|
#prepare to collect results as dictionary
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#SBATCH -A p32852
|
#SBATCH -A p32852
|
||||||
#SBATCH -p gengpu
|
#SBATCH -p gengpu
|
||||||
#SBATCH --gres=gpu:a100:2
|
#SBATCH --gres=gpu:a100:2
|
||||||
|
#SBATCH --constraint=sxm
|
||||||
#SBATCH --nodes=2
|
#SBATCH --nodes=2
|
||||||
#SBATCH --ntasks-per-node=2
|
#SBATCH --ntasks-per-node=2
|
||||||
#SBATCH --time=48:00:00
|
#SBATCH --time=48:00:00
|
||||||
@ -20,14 +21,21 @@ echo "setting up the environment by loading in conda environment at $(date)"
|
|||||||
|
|
||||||
conda activate olmo
|
conda activate olmo
|
||||||
|
|
||||||
echo "running the bertopic job at $(date)"
|
echo "running the olmo labeling job at $(date)"
|
||||||
|
|
||||||
|
# Get master node address for rendezvous
|
||||||
|
MASTER_ADDR=$(scontrol show hostnames $SLURM_NODELIST | head -n 1)
|
||||||
|
MASTER_PORT=29502
|
||||||
|
|
||||||
|
export MASTER_ADDR
|
||||||
|
export MASTER_PORT
|
||||||
|
|
||||||
srun torchrun \
|
srun torchrun \
|
||||||
--nnodes 2 \
|
--nnodes 2 \
|
||||||
--nproc-per-node 2 \
|
--nproc-per-node 2 \
|
||||||
--rdzv_id $RANDOM \
|
--rdzv_id $SLURM_JOB_ID \
|
||||||
--rdzv_backend c10d \
|
--rdzv_backend c10d \
|
||||||
--rdzv_endpoint "$SLURMD_NODENAME:29502" \
|
--rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" \
|
||||||
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py 10000 100
|
/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py
|
||||||
|
|
||||||
echo "unsupervised olmo categorization pau at $(date)"
|
echo "unsupervised olmo categorization pau at $(date)"
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 517 KiB |
Loading…
Reference in New Issue
Block a user