#!/bin/bash
#SBATCH -A p32852
#SBATCH -p gengpu
#SBATCH --gres=gpu:a100:2
#SBATCH --constraint=sxm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --time=48:00:00
#SBATCH --mem=64G
#SBATCH --cpus-per-task=4
#SBATCH --job-name=MW-info-typology
#SBATCH --output=parallel-mw-olmo-info-cat.log
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=gaughan@u.northwestern.edu
#
# Multi-node OLMo categorization job.
#
# Activates the "olmo" conda environment, writes a per-job accelerate (FSDP)
# config into the current directory, and starts one `accelerate launch` per
# allocated node through srun. The config file is removed when the script
# exits, whether the run succeeded or not.
#
# Submit with: sbatch <this file>
# NOTE: all #SBATCH directives must stay above the first executable line,
# otherwise sbatch stops reading them.

module purge

eval "$(conda shell.bash hook)"

echo "setting up the environment by loading in conda environment at $(date)"

conda activate olmo || {
  echo "error: could not activate conda environment 'olmo'" >&2
  exit 1
}

# Strict mode is enabled only after the environment is set up: module/conda
# activation hooks are third-party code that may touch unset variables.
set -euo pipefail

echo "running the olmo labeling job at $(date)"

# Must match the GPU count requested by --gres above.
readonly GPUS_PER_NODE=2
readonly TRAIN_SCRIPT=/home/nws8519/git/mw-lifecycle-analysis/p2/quest/python_scripts/olmo_parallel_cat.py

num_nodes=${SLURM_NNODES:-2}
num_processes=$(( num_nodes * GPUS_PER_NODE ))
# One launcher runs per node, so hand it every CPU allocated on that node.
cpus_per_node=$(( ${SLURM_CPUS_PER_TASK:-4} * ${SLURM_NTASKS_PER_NODE:-2} ))

# Get master node address for rendezvous: the first host in the allocation.
# SLURM_JOB_NODELIST is the current name; SLURM_NODELIST is the legacy alias.
node_list=${SLURM_JOB_NODELIST:-${SLURM_NODELIST:?not running inside a SLURM allocation}}
MASTER_ADDR=$(scontrol show hostnames "$node_list" | head -n 1)
MASTER_PORT=29505
if [[ -z "$MASTER_ADDR" ]]; then
  echo "error: could not determine the master node from '${node_list}'" >&2
  exit 1
fi

# Per-job file name so concurrent jobs submitted from the same directory do not
# overwrite each other's config. It stays in the working directory (not /tmp)
# because every node has to be able to read it.
config_file="${PWD}/slurm_accelerate.${SLURM_JOB_ID:-$$}.yaml"

cleanup() {
  rm -f -- "$config_file"
}
trap cleanup EXIT

# Write accelerate config with correct master IP.
# machine_rank is only a placeholder here: this heredoc is expanded once, in the
# batch shell on the first node, so the real per-node rank is supplied on the
# accelerate command line inside the srun step below.
cat << EOF > "$config_file"
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_min_num_params: 0
  fsdp_sharding_strategy: 1
  fsdp_offload_params: false
machine_rank: 0
main_process_ip: $MASTER_ADDR
main_process_port: $MASTER_PORT
main_training_function: main
num_machines: $num_nodes
num_processes: $num_processes
mixed_precision: 'no'
same_network: true
use_cpu: false
EOF

# Start exactly one launcher per node; accelerate itself spawns the per-GPU
# worker processes. The single-quoted command is evaluated inside each srun
# task, so $SLURM_NODEID resolves to that node's own rank.
launch_status=0
srun --nodes="$num_nodes" --ntasks="$num_nodes" --ntasks-per-node=1 \
  --cpus-per-task="$cpus_per_node" \
  bash -c 'exec accelerate launch --config_file "$1" --machine_rank "$SLURM_NODEID" "$2"' \
  accelerate-launcher "$config_file" "$TRAIN_SCRIPT" || launch_status=$?

if (( launch_status != 0 )); then
  echo "olmo categorization failed with exit code ${launch_status} at $(date)" >&2
  exit "$launch_status"
fi

echo "unsupervised olmo categorization pau at $(date)"