diff --git a/p2/quest/batched-mw-olmo-info-cat.log b/p2/quest/batched-mw-olmo-info-cat.log
new file mode 100644
index 0000000..3e74ec8
--- /dev/null
+++ b/p2/quest/batched-mw-olmo-info-cat.log
@@ -0,0 +1,70 @@
+setting up the environment by loading in conda environment at Thu Sep 4 18:31:14 CDT 2025
+running the batched olmo categorization job at Thu Sep 4 18:31:14 CDT 2025
+[nltk_data] Downloading package punkt_tab to
+[nltk_data]     /home/nws8519/nltk_data...
+[nltk_data]   Package punkt_tab is already up-to-date!
+cuda
+NVIDIA A100-SXM4-80GB
+_CudaDeviceProperties(name='NVIDIA A100-SXM4-80GB', major=8, minor=0, total_memory=81153MB, multi_processor_count=108, uuid=805df503-cf0d-c6cd-33f3-cb3560ee9fea, L2_cache_size=40MB)
+ Loading checkpoint shards:   0%|          | 0/12 [00:00
+    outputs = olmo.generate(**inputs, max_new_tokens=256, do_sample=False)
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+    return func(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/generation/utils.py", line 2597, in generate
+    result = self._sample(
+             ^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/generation/utils.py", line 3557, in _sample
+    outputs = self(**model_inputs, return_dict=True)
+              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
+    output = func(self, *args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/models/olmo2/modeling_olmo2.py", line 667, in forward
+    outputs: BaseModelOutputWithPast = self.model(
+                                       ^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/utils/generic.py", line 969, in wrapper
+    output = func(self, *args, **kwargs)
+             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/models/olmo2/modeling_olmo2.py", line 432, in forward
+    layer_outputs = decoder_layer(
+                    ^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/modeling_layers.py", line 48, in __call__
+    return super().__call__(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/models/olmo2/modeling_olmo2.py", line 269, in forward
+    hidden_states = self.mlp(hidden_states)
+                    ^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+    return forward_call(*args, **kwargs)
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/transformers/models/olmo2/modeling_olmo2.py", line 224, in forward
+    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 752.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 343.50 MiB is free. Including non-PyTorch memory, this process has 78.91 GiB memory in use. Of the allocated memory 70.96 GiB is allocated by PyTorch, and 7.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+unsupervised batched olmo categorization pau at Fri Sep 5 01:25:00 CDT 2025
diff --git a/p2/quest/parallel-mw-olmo-info-cat.log b/p2/quest/parallel-mw-olmo-info-cat.log
index 5bb08f2..e007263 100644
--- a/p2/quest/parallel-mw-olmo-info-cat.log
+++ b/p2/quest/parallel-mw-olmo-info-cat.log
@@ -1,8 +1,8 @@
-setting up the environment by loading in conda environment at Thu Sep 4 11:14:26 CDT 2025
-running the olmo labeling job at Thu Sep 4 11:14:26 CDT 2025
+setting up the environment by loading in conda environment at Thu Sep 4 18:05:51 CDT 2025
+running the olmo labeling job at Thu Sep 4 18:05:52 CDT 2025
 ----------------------------------------
-srun job start: Thu Sep 4 11:14:27 CDT 2025
-Job ID: 3273582
+srun job start: Thu Sep 4 18:05:54 CDT 2025
+Job ID: 3301934
 Username: nws8519
 Queue: gengpu
 Account: p32852
@@ -14,273 +14,16 @@ prologue and the job run script
 PATH (in prologue) : /home/nws8519/.conda/envs/olmo/bin:/software/miniconda3/4.12.0/condabin:/home/nws8519/.local/bin:/home/nws8519/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/usr/lpp/mmfs/bin:/hpc/usertools
 WORKDIR is: /home/nws8519
 ----------------------------------------
-W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766]
-W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] *****************************************
-W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
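
The OOM that ends this first run is raised inside olmo.generate(**inputs, max_new_tokens=256, do_sample=False): only 343.50 MiB of the A100's 79.25 GiB is still free while 7.45 GiB sits reserved but unallocated, and the error text itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. A minimal sketch of those mitigations for a batched categorization pass follows; the checkpoint id, batch size, and helper name are illustrative assumptions, not the contents of the actual job script.

    # Sketch only: apply the allocator setting the OOM message recommends, keep
    # generation batches small, and run under inference_mode so no autograd
    # state is retained between batches.
    import os
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")  # set before CUDA is initialized

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_NAME = "allenai/OLMo-2-0325-32B-Instruct"  # example id; the log does not name the exact checkpoint

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    olmo = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")

    def generate_in_batches(prompts, batch_size=4, max_new_tokens=256):
        """Greedy generation in small batches so activation memory stays bounded."""
        decoded = []
        for i in range(0, len(prompts), batch_size):
            inputs = tokenizer(prompts[i:i + batch_size], return_tensors="pt",
                               padding=True, truncation=True).to(olmo.device)
            with torch.inference_mode():
                out = olmo.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
            decoded.extend(tokenizer.batch_decode(out, skip_special_tokens=True))
            del inputs, out
            torch.cuda.empty_cache()  # optional; mainly helps when fragmentation is the issue
        return decoded
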
-W0904 11:14:40.413000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] -W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0904 11:14:40.413000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] -W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0904 11:14:40.413000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] -W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -W0904 11:14:40.413000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py:766] ***************************************** -[nltk_data] Downloading package punkt to /home/nws8519/nltk_data... -[nltk_data] Downloading package punkt to /home/nws8519/nltk_data... -[nltk_data] Downloading package punkt to /home/nws8519/nltk_data... -[nltk_data] Downloading package punkt to /home/nws8519/nltk_data... -[nltk_data] Package punkt is already up-to-date![nltk_data] Package punkt is already up-to-date! - -[nltk_data] Package punkt is already up-to-date![nltk_data] Package punkt is already up-to-date! - -[nltk_data] Downloading package punkt_tab to -[nltk_data] /home/nws8519/nltk_data...[nltk_data] Downloading package punkt_tab to -[nltk_data] /home/nws8519/nltk_data... - -[nltk_data] Downloading package punkt_tab to -[nltk_data] /home/nws8519/nltk_data...[nltk_data] Downloading package punkt_tab to -[nltk_data] /home/nws8519/nltk_data... - -[nltk_data] Package punkt_tab is already up-to-date![nltk_data] Package punkt_tab is already up-to-date! 
- -[nltk_data] Package punkt_tab is already up-to-date![nltk_data] Package punkt_tab is already up-to-date! - -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. - df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv") -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. - df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv") -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. - df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv") -[rank3]:[W904 11:15:22.374478896 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 3] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py:120: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. - df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv") -[rank1]:[W904 11:15:22.049509730 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. -[rank2]:[W904 11:15:22.461549051 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. - Fetching 12 files: 0%| | 0/12 [00:00 -[rank2]: main() -[rank2]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 143, in main -[rank2]: ddp_olmo = DDP(olmo, device_ids=[local_rank]) -[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 850, in __init__ -[rank2]: self._ddp_init_helper( -[rank2]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1201, in _ddp_init_helper -[rank2]: self.reducer = dist.Reducer( -[rank2]: ^^^^^^^^^^^^^ -[rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 51.10 GiB. GPU 0 has a total capacity of 79.25 GiB of which 27.52 GiB is free. Including non-PyTorch memory, this process has 51.72 GiB memory in use. Of the allocated memory 51.10 GiB is allocated by PyTorch, and 875.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[rank3]: Traceback (most recent call last): -[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 188, in -[rank3]: main() -[rank3]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 143, in main -[rank3]: ddp_olmo = DDP(olmo, device_ids=[local_rank]) -[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 850, in __init__ -[rank3]: self._ddp_init_helper( -[rank3]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1201, in _ddp_init_helper -[rank3]: self.reducer = dist.Reducer( -[rank3]: ^^^^^^^^^^^^^ -[rank3]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 51.10 GiB. GPU 1 has a total capacity of 79.25 GiB of which 27.52 GiB is free. Including non-PyTorch memory, this process has 51.72 GiB memory in use. Of the allocated memory 51.10 GiB is allocated by PyTorch, and 875.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[rank0]: Traceback (most recent call last): -[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 188, in -[rank0]: main() -[rank0]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 143, in main -[rank0]: ddp_olmo = DDP(olmo, device_ids=[local_rank]) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 850, in __init__ -[rank0]: self._ddp_init_helper( -[rank0]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1201, in _ddp_init_helper -[rank0]: self.reducer = dist.Reducer( -[rank0]: ^^^^^^^^^^^^^ -[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 51.10 GiB. GPU 0 has a total capacity of 79.25 GiB of which 27.52 GiB is free. Including non-PyTorch memory, this process has 51.72 GiB memory in use. Of the allocated memory 51.10 GiB is allocated by PyTorch, and 875.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[rank1]: Traceback (most recent call last): -[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 188, in -[rank1]: main() -[rank1]: File "/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py", line 143, in main -[rank1]: ddp_olmo = DDP(olmo, device_ids=[local_rank]) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 850, in __init__ -[rank1]: self._ddp_init_helper( -[rank1]: File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1201, in _ddp_init_helper -[rank1]: self.reducer = dist.Reducer( -[rank1]: ^^^^^^^^^^^^^ -[rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 51.10 GiB. GPU 1 has a total capacity of 79.25 GiB of which 27.52 GiB is free. Including non-PyTorch memory, this process has 51.72 GiB memory in use. Of the allocated memory 51.10 GiB is allocated by PyTorch, and 875.00 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[rank2]:[W904 11:27:15.787618003 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -[rank0]:[W904 11:27:15.409824698 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -W0904 11:27:17.571000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1736801 closing signal SIGTERM -E0904 11:27:17.635000 1736746 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 1 (pid: 1736802) of binary: /home/nws8519/.conda/envs/olmo/bin/python3.11 Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in + File "/home/nws8519/.conda/envs/olmo/bin/accelerate", line 8, in sys.exit(main()) ^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main - run(args) - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run - elastic_launch( - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2025-09-04_11:27:17 - host : qgpu2013 - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 1736802) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -[W904 11:27:17.168398358 TCPStore.cpp:115] [c10d] recvVector failed on SocketImpl(fd=3, addr=[qgpu2014]:57300, remote=[qgpu2013]:29502): failed to recv, got 0 bytes -Exception raised from recvBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:678 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14873d0d85e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1487811fbafe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa0d0 (0x1487811fd0d0 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5baa81d (0x1487811fd81d in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: + 0x5bab4a9 (0x1487811fe4a9 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: c10d::TCPStore::compareSet(std::__cxx11::basic_string, std::allocator > const&, std::vector > const&, std::vector > const&) + 0x1fb (0x1487811f84cb in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #6: + 
0xc2a761 (0x148790587761 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #7: + 0x38a0cc (0x14878fce70cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #8: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #9: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #10: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #11: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #12: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5581df] -frame #17: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557a20] -frame #18: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x62a8a3] -frame #19: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fa3c4] -frame #20: + 0x81ca (0x1487a64991ca in /lib64/libpthread.so.0) -frame #21: clone + 0x43 (0x1487a596a8d3 in /lib64/libc.so.6) - -W0904 11:27:17.959000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1341] The node 'qgpu2014_2769136_0' has failed to send a keep-alive heartbeat to the rendezvous '3273582' due to an error of type RendezvousConnectionError. -W0904 11:27:17.959000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 2769219 closing signal SIGTERM -[W904 11:27:17.170100534 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2014]:57290, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14c218b8e5e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x14c25ccb1afe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x14c25ccb3358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x14c25ccb4b3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::doWait(c10::ArrayRef, std::allocator > >, std::chrono::duration >) + 0x1a6 (0x14c25ccaeac6 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string, std::allocator > const&) + 0x33 (0x14c25ccaeea3 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #6: c10d::TCPStore::get(std::__cxx11::basic_string, std::allocator > const&) + 0xab (0x14c25ccaff8b in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #7: + 0xc2a390 (0x14c26c03d390 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #8: + 0x38a0cc 
(0x14c26b79d0cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #10: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #17: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #19: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #22: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #24: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #26: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #27: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #28: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #31: Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #32: __libc_start_main + 0xe5 (0x14c2814217e5 in /lib64/libc.so.6) -frame #33: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:17.963000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2014_2769137_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. 
-[W904 11:27:17.194777840 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2014]:57290, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14c218b8e5e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x14c25ccb1afe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x14c25ccb3358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x14c25ccb4b3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::doWait(c10::ArrayRef, std::allocator > >, std::chrono::duration >) + 0x1a6 (0x14c25ccaeac6 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string, std::allocator > const&) + 0x33 (0x14c25ccaeea3 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #6: c10d::TCPStore::get(std::__cxx11::basic_string, std::allocator > const&) + 0xab (0x14c25ccaff8b in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #7: + 0xc2a390 (0x14c26c03d390 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #8: + 0x38a0cc (0x14c26b79d0cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #10: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #17: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #19: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #22: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #24: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #26: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #27: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #28: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: 
Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: __libc_start_main + 0xe5 (0x14c2814217e5 in /lib64/libc.so.6) -frame #31: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:17.986000 2769137 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2014_2769137_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 117, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.distributed.DistNetworkError: failed to recv, got 0 bytes - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in - sys.exit(main()) - ^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main - run(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main + args.func(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1222, in launch_command + multi_gpu_launcher(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher + distrib_run.run(args) File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run elastic_launch( File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ @@ -308,254 +51,20 @@ Traceback (most recent call last): File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous rdzv_info = spec.rdzv_handler.next_rendezvous() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1170, in next_rendezvous - self._op_executor.run(join_op, deadline, self._get_deadline) - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 648, in run - has_set = self._state_holder.sync() - ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 437, in sync - get_response = self._backend.get_state() - ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 75, in get_state - base64_state: bytes = self._call_store("get", self._key) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 119, in _call_store - raise RendezvousConnectionError( 
-torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -E0904 11:27:18.023000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 2769218) of binary: /home/nws8519/.conda/envs/olmo/bin/python3.11 -[W904 11:27:18.239612027 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2014]:57300, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14873d0d85e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1487811fbafe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x1487811fd358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x1487811feb3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::compareSet(std::__cxx11::basic_string, std::allocator > const&, std::vector > const&, std::vector > const&) + 0x299 (0x1487811f8569 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: + 0xc2a761 (0x148790587761 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #6: + 0x38a0cc (0x14878fce70cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #7: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #8: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #10: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #17: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #19: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #22: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #24: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() 
[0x5ecba7] -frame #26: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #27: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #28: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #31: Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #32: __libc_start_main + 0xe5 (0x1487a596b7e5 in /lib64/libc.so.6) -frame #33: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:18.030000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2014_2769136_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. -[W904 11:27:18.248039930 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2014]:57300, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14873d0d85e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1487811fbafe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x1487811fd358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x1487811feb3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::compareSet(std::__cxx11::basic_string, std::allocator > const&, std::vector > const&, std::vector > const&) + 0x299 (0x1487811f8569 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: + 0xc2a761 (0x148790587761 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #6: + 0x38a0cc (0x14878fce70cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #7: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #8: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #10: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #17: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #19: _PyFunction_Vectorcall + 0x173 (0x539153 in 
/home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #22: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #24: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #26: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #27: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #28: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: __libc_start_main + 0xe5 (0x1487a596b7e5 in /lib64/libc.so.6) -frame #31: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:18.038000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2014_2769136_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. -[W904 11:27:18.255885548 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2014]:57300, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14873d0d85e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1487811fbafe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x1487811fd358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x1487811feb3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::compareSet(std::__cxx11::basic_string, std::allocator > const&, std::vector > const&, std::vector > const&) + 0x299 (0x1487811f8569 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: + 0xc2a761 (0x148790587761 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #6: + 0x38a0cc (0x14878fce70cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #7: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #8: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #10: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #12: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #15: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: 
_PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #17: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #19: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #20: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #22: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #24: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #25: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #26: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #27: Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #28: __libc_start_main + 0xe5 (0x1487a596b7e5 in /lib64/libc.so.6) -frame #29: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:18.046000 2769136 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2014_2769136_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. 
port: 29505, useIpv6: false, code: -98, name: EADDRINUSE, message: address already in use Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in + File "/home/nws8519/.conda/envs/olmo/bin/accelerate", line 8, in sys.exit(main()) ^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main - run(args) - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run - elastic_launch( - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py FAILED ------------------------------------------------------------- -Failures: - ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2025-09-04_11:27:17 - host : qgpu2014 - rank : 2 (local_rank: 0) - exitcode : 1 (pid: 2769218) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -srun: error: qgpu2013: task 1: Exited with exit code 1 -srun: error: qgpu2014: tasks 2-3: Exited with exit code 1 -[W904 11:27:18.383886513 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2013]:36246, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14977ddbe5e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1497c1ee1afe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x1497c1ee3358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x1497c1ee4b3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::doWait(c10::ArrayRef, std::allocator > >, std::chrono::duration >) + 0x1a6 (0x1497c1edeac6 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string, std::allocator > const&) + 0x33 (0x1497c1edeea3 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #6: c10d::TCPStore::get(std::__cxx11::basic_string, std::allocator > const&) + 0xab (0x1497c1edff8b in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #7: + 0xc2a390 (0x1497d126d390 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #8: + 0x38a0cc 
(0x1497d09cd0cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #10: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #17: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #19: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #22: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #24: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #26: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #27: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #28: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #31: Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #32: __libc_start_main + 0xe5 (0x1497e66517e5 in /lib64/libc.so.6) -frame #33: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:18.554000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2013_1736745_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. 
-[W904 11:27:18.394906553 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=3, addr=[qgpu2013]:36246, remote=[qgpu2013]:29502): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x98 (0x14977ddbe5e8 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libc10.so) -frame #1: + 0x5ba8afe (0x1497c1ee1afe in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #2: + 0x5baa358 (0x1497c1ee3358 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #3: + 0x5babb3e (0x1497c1ee4b3e in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #4: c10d::TCPStore::doWait(c10::ArrayRef, std::allocator > >, std::chrono::duration >) + 0x1a6 (0x1497c1edeac6 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #5: c10d::TCPStore::doGet(std::__cxx11::basic_string, std::allocator > const&) + 0x33 (0x1497c1edeea3 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #6: c10d::TCPStore::get(std::__cxx11::basic_string, std::allocator > const&) + 0xab (0x1497c1edff8b in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so) -frame #7: + 0xc2a390 (0x1497d126d390 in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #8: + 0x38a0cc (0x1497d09cd0cc in /home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/lib/libtorch_python.so) -frame #9: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x528b17] -frame #10: _PyObject_MakeTpCall + 0x27c (0x50452c in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #11: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x557ac9] -frame #12: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #13: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #14: _PyObject_FastCallDictTstate + 0x65 (0x508e05 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #15: _PyObject_Call_Prepend + 0x66 (0x540ac6 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #16: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x611dd7] -frame #17: PyObject_Call + 0xbd (0x54303d in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #18: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #19: _PyFunction_Vectorcall + 0x173 (0x539153 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #20: _PyEval_EvalFrameDefault + 0x47c0 (0x515b90 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #21: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5cc3aa] -frame #22: PyEval_EvalCode + 0x9f (0x5cba7f in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #23: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5ecba7] -frame #24: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5e8740] -frame #25: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5fd5f2] -frame #26: _PyRun_SimpleFileObject + 0x19f (0x5fc9bf in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #27: _PyRun_AnyFileObject + 0x43 (0x5fc6e3 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #28: Py_RunMain + 0x2ee (0x5f73fe in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #29: 
Py_BytesMain + 0x39 (0x5bc149 in /home/nws8519/.conda/envs/olmo/bin/python3.11) -frame #30: __libc_start_main + 0xe5 (0x1497e66517e5 in /lib64/libc.so.6) -frame #31: /home/nws8519/.conda/envs/olmo/bin/python3.11() [0x5bbf93] - -W0904 11:27:18.565000 1736745 /gpfs/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1292] The node 'qgpu2013_1736745_0' has failed to shutdown the rendezvous '3273582' due to an error of type RendezvousConnectionError. -Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 117, in _call_store - return getattr(self._store, store_op)(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.distributed.DistNetworkError: failed to recv, got 0 bytes - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/home/nws8519/.conda/envs/olmo/bin/torchrun", line 8, in - sys.exit(main()) - ^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper - return f(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 892, in main - run(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main + args.func(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1222, in launch_command + multi_gpu_launcher(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher + distrib_run.run(args) File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run elastic_launch( File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ @@ -583,19 +92,104 @@ Traceback (most recent call last): File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 500, in _rendezvous rdzv_info = spec.rdzv_handler.next_rendezvous() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1170, in next_rendezvous - self._op_executor.run(join_op, deadline, self._get_deadline) - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 648, in run - has_set = self._state_holder.sync() - ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 437, in sync - get_response = self._backend.get_state() - ^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 75, in get_state - base64_state: bytes = self._call_store("get", self._key) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 119, in _call_store - raise RendezvousConnectionError( 
-torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details. -srun: error: qgpu2013: task 0: Exited with exit code 1 -unsupervised olmo categorization pau at Thu Sep 4 11:27:18 CDT 2025 + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 67, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.distributed.DistNetworkError: The server socket has failed to listen on any local network address. port: 29505, useIpv6: false, code: -98, name: EADDRINUSE, message: address already in use +srun: error: qgpu2005: task 0: Exited with exit code 1 +srun: error: qgpu2008: task 2: Exited with exit code 1 +[W904 18:21:24.281870443 socket.cpp:460] [c10d] waitForInput: poll for socket SocketImpl(fd=27, addr=[qgpu2005]:38060, remote=[qgpu2005]:29505) returned 0, likely a timeout +[W904 18:21:24.282308265 socket.cpp:485] [c10d] waitForInput: socket SocketImpl(fd=27, addr=[qgpu2005]:38060, remote=[qgpu2005]:29505) timed out after 900000ms +[W904 18:21:24.731952663 socket.cpp:460] [c10d] waitForInput: poll for socket SocketImpl(fd=27, addr=[qgpu2008]:35800, remote=[qgpu2005]:29505) returned 0, likely a timeout +[W904 18:21:24.733301968 socket.cpp:485] [c10d] waitForInput: socket SocketImpl(fd=27, addr=[qgpu2008]:35800, remote=[qgpu2005]:29505) timed out after 900000ms +Traceback (most recent call last): + File "/home/nws8519/.conda/envs/olmo/bin/accelerate", line 8, in + sys.exit(main()) + ^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main + args.func(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1222, in launch_command + multi_gpu_launcher(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher + distrib_run.run(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent + result = agent.run() + ^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run + result = self._invoke_run(role) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run + self._initialize_workers(self._worker_group) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers + 
self._rendezvous(worker_group) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 513, in _rendezvous + workers = self._assign_worker_ranks( + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 605, in _assign_worker_ranks + role_infos_bytes = store.multi_get( + ^^^^^^^^^^^^^^^^ +torch.distributed.DistStoreError: wait timeout after 900000ms, keys: /none/torchelastic/role_info/0, /none/torchelastic/role_info/1 +Traceback (most recent call last): + File "/home/nws8519/.conda/envs/olmo/bin/accelerate", line 8, in + sys.exit(main()) + ^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main + args.func(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 1222, in launch_command + multi_gpu_launcher(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher + distrib_run.run(args) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/run.py", line 883, in run + elastic_launch( + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 139, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent + result = agent.run() + ^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 711, in run + result = self._invoke_run(role) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 864, in _invoke_run + self._initialize_workers(self._worker_group) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 683, in _initialize_workers + self._rendezvous(worker_group) + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 513, in _rendezvous + workers = self._assign_worker_ranks( + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/metrics/api.py", line 138, in wrapper + result = f(*args, 
**kwargs) + ^^^^^^^^^^^^^^^^^^ + File "/home/nws8519/.conda/envs/olmo/lib/python3.11/site-packages/torch/distributed/elastic/agent/server/api.py", line 605, in _assign_worker_ranks + role_infos_bytes = store.multi_get( + ^^^^^^^^^^^^^^^^ +torch.distributed.DistStoreError: wait timeout after 900000ms, keys: /none/torchelastic/role_info/0, /none/torchelastic/role_info/1 +srun: error: qgpu2005: task 1: Exited with exit code 1 +srun: error: qgpu2008: task 3: Exited with exit code 1 +unsupervised olmo categorization pau at Thu Sep 4 18:21:24 CDT 2025 diff --git a/p2/quest/python_scripts/090425_batched_olmo_cat.py b/p2/quest/python_scripts/090425_batched_olmo_cat.py new file mode 100644 index 0000000..2e05400 --- /dev/null +++ b/p2/quest/python_scripts/090425_batched_olmo_cat.py @@ -0,0 +1,126 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer, OlmoForCausalLM +import torch +import csv +import pandas as pd +import re + +import nltk +nltk.download('punkt_tab') + +cache_directory = "/projects/p32852/cache/" +#load in the different models +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print(device) +print(torch.cuda.get_device_name(0)) +print(torch.cuda.get_device_properties(0)) +#olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-0325-32B", cache_dir=cache_directory).to(device) +#tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-0325-32B", cache_dir=cache_directory) +olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device) +tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", padding_side='left') + +priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference.Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the sentence category as your output. Do not provide any text beyond the category name." + +typology = """ +TYPOLOGY: + +[[EXPECTED BEHAVIOR]], in which stakeholders discuss, from the user’s perspective, the expected or ideal situation affected by the issue. For example, a participant commented: “My suggestion/request in the near term would be to have an option to make the vocabulary read only so that users who want to be able to leave spacy alone to do streaming data processing don’t need to worry about changing memory requirements.” + +[[MOTIVATION]], in which stakeholders elaborate on why the issue needs to be fixed or a feature needs to be added. For example, in support of redesigning TensorFlow's input pipeline one participant wrote: “Right now, this method starves my GPU all the time, which is a shame because most other [deep learning] frameworks manage to make this much more performantly.” + +[[OBSERVED BUG BEHAVIOR]], which only appears in bug reports and focuses on describing the observed behaviour of the bug. For example, one participant commented: “I found strange behavior using the ‘pipe()’ method”, then started to describe this behavior. + +[[BUG REPRODUCTION]], which also only appears in bug reports and focuses on any report, request, and/or question regarding the reproduction of the bug. 
For example, one participant commented that a bug was reproducible: “Same problem here, working on Windows 10 with German text.” + +[[INVESTIGATION AND EXPLORATION]], in which OSS stakeholders discuss their exploration of ideas about the problem that was thought to have caused the issue. For example, “This result confirms my hypothesis but also shows that the memory increase really isn’t all that significant... But it still points to a potential flaw in the design of the library.” + +[[SOLUTION DISCUSSION]] is framed around the solution space from the developers’ point of view, in which participants discuss design ideas and implementation details, as well as suggestions, constraints, challenges, and useful references around such topics. For example, “I know there are multiple ways of approaching this however I strongly recommend node-gyp for performance.” + +[[CONTRIBUTION AND COMMITMENT]], in which participants call for contributors and/or voice willingness or unwillingness to contribute to resolving the issue. For example, one potential collaborator said: “I will gladly contribute in any way I can, however, this is something I will not be able to do alone. Would be best if a few other people is interested as well...” + +[[TASK PROGRESS]], in which stakeholders request or report progress of tasks and sub-tasks towards the solution of the issue. For example, “I made an initial stab at it... - this is just a proof of concept that gets the version string into nodejs. I’ll start working on adding the swig interfaces...” + +[[TESTING]], in which participants discuss the testing procedure and results, as well as the system environment, code, data, and feedback involved in testing. For example, “Tested on ‘0.101’ and ‘master’ - the issue seems to be fixed on ‘master’ not just for the example document, but for the entire corpus...” + +[[FUTURE PLAN]], in which participants discuss the long-term plan related to the issue; such plans usually involve work/ideas that are not required to close the current issue. For example, “For the futures, stay tuned, as we’re prototyping something in this direction.” + +[[POTENTIAL NEW ISSUES AND REQUESTS]], in which participants identify and discuss new bugs or needed features while investigating and addressing the current issue. For example, when discussing a bug in scikit-learn about parallel execution that causes process hanging, one participant said: “As a side point, I note there seems to be a lot more joblib parallelisation overhead in master... that wasn’t there in 0.14.” + +[[SOLUTION USAGE]] was usually discussed once a full or partial solution of the issue was released and stakeholders asked questions or provided suggestions about how to use the library with the new solution update. For example, “Please help me how to continue training the model [with the new release].” + +[[WORKAROUNDS]] focus on discussions about temporary or alternative solutions that can help overcome the issue until the official fix or enhancement is released. For example, in a discussion regarding memory growth for streamed data, one participant expressed his temporary solution: “For now workaround with reloading / collecting nlp object works quite ok in production.” + +[[ISSUE CONTENT MANAGEMENT]] focuses on redirecting the discussions and controlling the quality of the comments with respect to the issue. 
For example, “We might want to move this discussion to here: [link to another issue]” + +[[ACTION ON ISSUE]], in which participants comment on the proper actions to perform on the issue itself. For example, “I’m going to close this issue because it’s old and most of the information here is now out of date.” + +[[SOCIAL CONVERSATION]], in which participants express emotions such as appreciation, disappointment, annoyance, regret, etc. or engage in small talk. For example, “I’m so glad that this has received so much thought and attention!” +""" +instructions="The sentence's category is: " + +with open("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv", mode='r', newline='') as file: + reader = csv.reader(file) + array_of_categorizations = [] + index = -1 + for row in reader: + index += 1 + if index <= 0: + continue + text_dict = {} + #organizing the data from each citation + text_dict['id'] = row[0] + text_dict['task_title'] = row[1] + text_dict['comment_text'] = row[2] + text_dict['comment_type'] = row[12] + raw_text = text_dict['comment_text'] + + # comment_text preprocessing per https://arxiv.org/pdf/1902.07093 + # 1. replace code with CODE + comment_text = re.sub(r'`[^`]+`', 'CODE', raw_text) # Inline code + comment_text = re.sub(r'```[\s\S]+?```', 'CODE', comment_text) # Block code + # 2. replace quotes with QUOTE + lines = comment_text.split('\n') + lines = ['QUOTE' if line.strip().startswith('>') else line for line in lines] + comment_text = '\n'.join(lines) + # 3. replace Gerrit URLs with GERRIT URL + gerrit_url_pattern = r'https://gerrit\.wikimedia\.org/r/\d+' + comment_text = re.sub(gerrit_url_pattern, 'GERRIT_URL', comment_text) + # replace URL with URL + url_pattern = r'https?://[^\s]+' + comment_text = re.sub(url_pattern, 'URL', comment_text) + # 4. if possible, replace @ with SCREEN_NAME + comment_text = re.sub(r'(^|\s)@\w+', 'SCREEN_NAME', comment_text) + # 5. 
split into an array of sentences + comment_sentences = nltk.sent_tokenize(comment_text) + + results = [] + batch_size = 2 + for i in range(0, len(comment_sentences), batch_size): + batch = comment_sentences[i:i+batch_size] + prompts = [] + for sent in batch: + given_data = f"**GIVEN SENTENCE:** \n Type - {text_dict['comment_type']} \n Text - {sent}\n" + prompt = f"{priming}\n{typology}\n\n{given_data}\n{instructions}" + prompts.append(prompt) + inputs = tokenizer(prompts, return_tensors='pt', return_token_type_ids=False, padding=True, truncation=True).to(device) + with torch.no_grad(): + outputs = olmo.generate(**inputs, max_new_tokens=256, do_sample=False) + decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True) + for response_txt in decoded: + match = re.search(r"The sentence's category is: \s*(.*)", response_txt) + if match: + category = match.group(1).strip("[]*") + else: + category = "NO CATEGORY" + results.append(category) + torch.cuda.empty_cache() + + text_dict['sentence_categories']=results + array_of_categorizations.append(text_dict) + df = pd.DataFrame(array_of_categorizations) + #print(df.head()) + df.to_csv('090425_olmo_batched_categorized.csv', index=False) + + + + + diff --git a/p2/quest/python_scripts/__pycache__/090425_batched_olmo_cat.cpython-311.pyc b/p2/quest/python_scripts/__pycache__/090425_batched_olmo_cat.cpython-311.pyc new file mode 100644 index 0000000..eefc7b1 Binary files /dev/null and b/p2/quest/python_scripts/__pycache__/090425_batched_olmo_cat.cpython-311.pyc differ diff --git a/p2/quest/python_scripts/olmo_parallel_cat.py b/p2/quest/python_scripts/olmo_parallel_cat.py index 0de5167..1ffde28 100644 --- a/p2/quest/python_scripts/olmo_parallel_cat.py +++ b/p2/quest/python_scripts/olmo_parallel_cat.py @@ -2,6 +2,7 @@ import torch import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader #from utils import MyTrainDataset +from accelerate import Accelerator import torch.multiprocessing as mp import torch.distributed as dist @@ -16,8 +17,8 @@ import pandas as pd import re import nltk -nltk.download('punkt') -nltk.download('punkt_tab') +#nltk.download('punkt') +#nltk.download('punkt_tab') # ----------------- prompts for LLM priming = "For the **GIVEN SENTENCE**, please categorize it into one of the defined [[CATEGORIES]]. Each [[CATEGORY]] is described in the TYPOLOGY for reference. Your task is to match the **GIVEN SENTENCE** to the **[[CATEGORY]]** that most accurately describes the content of the comment. Only provide the category as your output. Do not provide any text beyond the category name."
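
Note on the single-GPU batched script above: 090425_batched_olmo_cat.py loads OLMo-2-1124-13B with the float32 default, while the olmo_parallel_cat.py changes that follow load the same checkpoint in torch.float16. A memory-lighter variant of the batched loading, sketched below rather than taken from the repository, assumes half precision is acceptable for this labeling task, that the pad-token fallback is only needed if the tokenizer ships without one, and uses a placeholder prompt in place of the real CSV-driven prompts; since the expected output is just a category name, a smaller max_new_tokens also limits KV-cache growth.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

cache_directory = "/projects/p32852/cache/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(
    "allenai/OLMo-2-1124-13B", cache_dir=cache_directory, padding_side="left"
)
# Assumption: only needed if the tokenizer has no pad token; padding=True requires one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

olmo = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-2-1124-13B",
    cache_dir=cache_directory,
    torch_dtype=torch.bfloat16,  # roughly halves weight memory vs. the float32 default
).to(device)
olmo.eval()

# Placeholder prompt standing in for the priming + typology + sentence prompts.
prompts = ["The sentence's category is: "]
inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
with torch.no_grad():
    outputs = olmo.generate(**inputs, max_new_tokens=16, do_sample=False)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
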
@@ -113,9 +114,11 @@ def main(): # https://github.com/nuitrcs/examplejobs/blob/master/python/pytorch_ddp/multinode_torchrun.py #prep ddp setting - rank, world_size, local_rank = setup_ddp() - device = torch.device(f"cuda:{local_rank}") - + #rank, world_size, local_rank = setup_ddp() + #device = torch.device(f"cuda:{local_rank}") + accelerator = Accelerator() + device = accelerator.device + #load in data df = pd.read_csv("/home/nws8519/git/mw-lifecycle-analysis/p2/quest/072525_pp_biberplus_labels.csv") # TODO comment out below @@ -125,10 +128,11 @@ def main(): dataset = SentenceDataset(comment_texts, comment_types, priming, typology, instructions) #split data up across processes + #TODO fix batch_size = 4 sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=False) dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler) - + ''' #load model and wrap in DDP cache_directory="/projects/p32852/cache/" if dist.get_rank() == 0: @@ -141,7 +145,16 @@ def main(): #olmo = AutoModelForCausalLM.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory).to(device) #tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory) ddp_olmo = DDP(olmo, device_ids=[local_rank]) - + ''' + cache_directory="/projects/p32852/cache/" + tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory) + olmo = AutoModelForCausalLM.from_pretrained( + "allenai/OLMo-2-1124-13B", + cache_dir="/projects/p32852/cache/", + torch_dtype=torch.float16 + ) + + ddp_olmo = accelerator.prepare(model) #prepare to collect results as dictionary results = dict() @@ -163,6 +176,7 @@ def main(): results.setdefault(comment_idx, []).append((sentence, category)) #bring all together + ''' gathered = [None for _ in range(world_size)] dist.all_gather_object(gathered, results) if rank == 0: @@ -183,7 +197,27 @@ def main(): print(out_df.head()) #TODO out_df.to_csv("090325_olmo_sentence_categorized.csv") dist.destroy_process_group() + ''' + all_results = accelerator.gather_object(results) + if accelerator.is_main_process: + merged = dict() + for partial in all_results: + for k,v in partial.items(): + merged.setdefault(k, []).extend(v) + out_rows = [] + for comment_idx, sentence_labels in merged.items(): + out_rows.append({ + 'id': df['id'].iloc[comment_idx], + 'task_title': df['task_title'].iloc[comment_idx], + 'comment_text': df['comment_text'].iloc[comment_idx], + 'AuthorPHID': df['AuthorPHID'].iloc[comment_idx], + 'sentence_labels': sentence_labels + }) + out_df = pd.DataFrame(out_rows) + print(out_df.head()) + #TODO out_df.to_csv("090325_olmo_sentence_categorized.csv") + if __name__ == "__main__": main() print('all pau; internal to the script') diff --git a/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh new file mode 100644 index 0000000..bc1d0fa --- /dev/null +++ b/p2/quest/slurm_jobs/090425_olmo_batched_cat.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH -A p32852 +#SBATCH -p gengpu +#SBATCH --gres=gpu:a100:1 +#SBATCH --constraint=sxm +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --time=48:00:00 +#SBATCH --mem=64G +#SBATCH --cpus-per-task=4 +#SBATCH --job-name=batched-MW-info-typology +#SBATCH --output=batched-mw-olmo-info-cat.log +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --mail-user=gaughan@u.northwestern.edu + +module purge + +eval "$(conda shell.bash hook)" + +echo "setting up the environment by loading in conda environment at $(date)" + +conda activate olmo + 
+echo "running the batched olmo categorization job at $(date)" + +python /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/090425_batched_olmo_cat.py + +echo "unsupervised batched olmo categorization pau at $(date)" diff --git a/p2/quest/slurm_jobs/parallel_olmo_categorization.sh b/p2/quest/slurm_jobs/parallel_olmo_categorization.sh index f33154e..40098e2 100644 --- a/p2/quest/slurm_jobs/parallel_olmo_categorization.sh +++ b/p2/quest/slurm_jobs/parallel_olmo_categorization.sh @@ -25,17 +25,32 @@ echo "running the olmo labeling job at $(date)" # Get master node address for rendezvous MASTER_ADDR=$(scontrol show hostnames $SLURM_NODELIST | head -n 1) -MASTER_PORT=29502 +MASTER_PORT=29505 -export MASTER_ADDR -export MASTER_PORT +# Write accelerate config with correct master IP +cat << EOF > ./slurm_accelerate.yaml +compute_environment: LOCAL_MACHINE +deepspeed_config: {} +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_min_num_params: 0 + fsdp_sharding_strategy: 1 + fsdp_offload_params: false +machine_rank: $SLURM_NODEID +main_process_ip: $MASTER_ADDR +main_process_port: $MASTER_PORT +main_training_function: main +num_machines: 2 +num_processes: 4 +mixed_precision: 'no' +same_network: true +use_cpu: false +EOF -srun torchrun \ - --nnodes 2 \ - --nproc-per-node 2 \ - --rdzv_id $SLURM_JOB_ID \ - --rdzv_backend c10d \ - --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" \ +srun accelerate launch --config_file ./slurm_accelerate.yaml \ /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py +rm ./slurm_accelerate.yaml + echo "unsupervised olmo categorization pau at $(date)"
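
The rewritten parallel path (olmo_parallel_cat.py together with parallel_olmo_categorization.sh) moves from torchrun/DDP to accelerate launch: an Accelerator supplies the device, the loaded model is passed through accelerator.prepare, each rank labels its own share of the data, and per-process result dictionaries are gathered and merged on the main process. The following is a minimal, self-contained sketch of that flow, not the repository script; it assumes a recent accelerate release that provides split_between_processes and accelerate.utils.gather_object, uses placeholder prompts instead of the CSV-driven ones, and prepares the loaded olmo object directly.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator
from accelerate.utils import gather_object

accelerator = Accelerator()
device = accelerator.device

cache_directory = "/projects/p32852/cache/"
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-2-1124-13B", cache_dir=cache_directory)
olmo = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-2-1124-13B", cache_dir=cache_directory, torch_dtype=torch.float16
)
olmo = accelerator.prepare(olmo)  # prepare the object that was actually loaded
olmo.eval()

# Placeholder prompts; in the real job these come from the preprocessed CSV.
prompts = [f"Example sentence {i}. The sentence's category is: " for i in range(8)]

results = {}
# Each rank receives a contiguous slice of the (index, prompt) pairs.
with accelerator.split_between_processes(list(enumerate(prompts))) as shard:
    for idx, prompt in shard:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            out = accelerator.unwrap_model(olmo).generate(**inputs, max_new_tokens=16, do_sample=False)
        results[idx] = tokenizer.decode(out[0], skip_special_tokens=True)

accelerator.wait_for_everyone()
gathered = gather_object([results])  # one dict per process, concatenated into a list on every rank
if accelerator.is_main_process:
    merged = {}
    for partial in gathered:
        merged.update(partial)
    print(len(merged), "prompts labeled")

Run under the same srun + accelerate launch wiring as in the job script; each rank only ever sees its own slice, so the merge on the main process is the only cross-process step besides the gather.
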