#!/bin/bash
#
# SLURM batch job: runs olmo_parallel_cat.py across 2 nodes with 2 A100 GPUs
# each, using torchrun with the c10d rendezvous backend.
#
# Submit with: sbatch <this file>
#
# NOTE: every #SBATCH directive must sit on its own line and precede the first
# executable command, otherwise sbatch ignores it.
#
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:2
#SBATCH --constraint=sxm
#SBATCH --nodes=2
# One SLURM task per node: srun starts a single torchrun agent on each node and
# that agent spawns the per-GPU workers itself (--nproc-per-node below).
# Launching two tasks per node would start two competing agents per node.
#SBATCH --ntasks-per-node=1
#SBATCH --time=48:00:00
#SBATCH --mem=64G
# 8 CPUs for the single per-node task (shared by the two workers it spawns).
#SBATCH --cpus-per-task=8
#SBATCH --job-name=MW-info-typology
#SBATCH --output=parallel-mw-olmo-info-cat.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

# Critical steps are checked explicitly instead of relying on `set -eu`, so
# that sourcing the conda hook / activation scripts is not affected by strict
# mode.

module purge || { echo "module purge failed" >&2; exit 1; }

eval "$(conda shell.bash hook)" || {
  echo "failed to initialise the conda shell hook" >&2
  exit 1
}

echo "setting up the environment by loading in conda environment at $(date)"
conda activate olmo || {
  echo "failed to activate conda environment 'olmo'" >&2
  exit 1
}

echo "running the olmo labeling job at $(date)"

# Get master node address for rendezvous: the first hostname in the job's
# node list acts as the rendezvous endpoint for all torchrun agents.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_NODELIST" | head -n 1)
MASTER_PORT=29502
if [[ -z "$MASTER_ADDR" ]]; then
  echo "could not determine master node from SLURM_NODELIST" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT

# --nnodes must match #SBATCH --nodes and --nproc-per-node must match the
# number of GPUs requested per node via --gres.
srun torchrun \
  --nnodes 2 \
  --nproc-per-node 2 \
  --rdzv_id "$SLURM_JOB_ID" \
  --rdzv_backend c10d \
  --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" \
  /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py
job_status=$?

# Propagate a training failure so SLURM records the job as FAILED (and the
# FAIL mail fires) rather than reporting the exit status of the final echo.
if (( job_status != 0 )); then
  echo "olmo categorization failed with exit status $job_status at $(date)" >&2
  exit "$job_status"
fi

echo "unsupervised olmo categorization pau at $(date)"