#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:2
#SBATCH --constraint=sxm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --time=48:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=MW-info-typology
#SBATCH --output=parallel-mw-olmo-info-cat.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu

# Purpose: multi-node (2 nodes x 2 A100s) FSDP run of olmo_parallel_cat.py
# via HuggingFace Accelerate, launched under SLURM with srun.
# NOTE: #SBATCH directives must each be on their own line at the top of the
# file — the previous single-line form made the whole script one comment.

module purge
eval "$(conda shell.bash hook)"

echo "setting up the environment by loading in conda environment at $(date)"
conda activate olmo

echo "running the olmo labeling job at $(date)"

# Get master node address for rendezvous (first host in the allocation).
MASTER_ADDR=$(scontrol show hostnames "$SLURM_NODELIST" | head -n 1)
MASTER_PORT=29505

# Write accelerate config with correct master IP.
# NOTE(review): $SLURM_NODEID is expanded once here, on the batch host, so
# every node reads the same machine_rank from this shared file — confirm that
# `accelerate launch` derives per-node ranks from the srun-provided
# environment rather than from this value. TODO confirm on a 2-node run.
cat << EOF > ./slurm_accelerate.yaml
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_min_num_params: 0
  fsdp_sharding_strategy: 1
  fsdp_offload_params: false
machine_rank: $SLURM_NODEID
main_process_ip: $MASTER_ADDR
main_process_port: $MASTER_PORT
main_training_function: main
num_machines: 2
num_processes: 4
mixed_precision: 'no'
same_network: true
use_cpu: false
EOF

srun accelerate launch --config_file ./slurm_accelerate.yaml \
  /home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py

# Clean up the generated per-job config.
rm ./slurm_accelerate.yaml

echo "unsupervised olmo categorization pau at $(date)"