#!/bin/bash
#SBATCH --nodes=8
#SBATCH --time=00:20:00
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=booster
#SBATCH --account=transfernetx
#SBATCH --exclude=jwb[0059,0067,0069,0193,0284,0287,0294,0359,0418,0637,0829,0832,0838,0898,0907,0921,0971,1004,1023,1029,1213,1126]
#SBATCH --threads-per-core=1
#SBATCH --mem=0
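# Resource summary: 8 nodes x 4 GPUs = 32 workers in total. One srun task per
# node launches torchrun, which spawns the 4 per-node worker processes.
# --mem=0 requests all memory on each node; --threads-per-core=1 disables SMT.
# The --exclude list skips nodes presumably flagged as unhealthy at the time.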
REPO_PATH=$(pwd)
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
# Rendezvous host: the first node in the allocation.
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
#export MASTER_ADDR="${MASTER_ADDR}.juwels"
#export MASTER_ADDR="${MASTER_ADDR}.jureca"
#export MASTER_ADDR="${MASTER_ADDR}"
# The trailing "i" selects the node's InfiniBand (IPoIB) hostname on JSC
# systems, so rendezvous traffic goes over the high-speed fabric.
export MASTER_ADDR="${MASTER_ADDR}i"
export MASTER_PORT=12345
export NNODES=$SLURM_JOB_NUM_NODES
export GPUS_PER_NODE=4
echo $MASTER_ADDR $MASTER_PORT
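# Sanity check (added suggestion): the "...i" hostname should resolve from the
# compute nodes; if rendezvous hangs, try `getent hosts "$MASTER_ADDR"` from an
# interactive srun shell in the same allocation.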
ml GCC
# Site-specific mamba environment; adjust the path for other installations.
source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly
export PYTHONPATH=$(pwd)/src:$PYTHONPATH
export OMP_NUM_THREADS=1
export TRITON_CACHE_DIR=cache
# Debug settings: CUDA_LAUNCH_BLOCKING=1 serializes kernel launches and
# NCCL_DEBUG=INFO is verbose; both slow things down and can be dropped once
# the run is stable.
export CUDA_LAUNCH_BLOCKING=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_IB_TIMEOUT=20
# Pin NCCL's bootstrap sockets to the InfiniBand interface.
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO
#LAUNCHER="python -u -m torch.distributed.run \ | |
#--nproc_per_node $GPUS_PER_NODE \ | |
#--nnodes $NNODES \ | |
#--node_rank \$SLURM_PROCID \ | |
#--master_addr $MASTER_ADDR \ | |
#--master_port $MASTER_PORT \ | |
#--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ | |
#--rdzv_backend c10d \ | |
#" | |
LAUNCHER="python -u -m torch.distributed.run \ | |
--nproc_per_node $GPUS_PER_NODE \ | |
--nnodes $NNODES \ | |
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ | |
--rdzv_backend static \ | |
--max_restarts 0 \ | |
--tee 3 \ | |
" | |
PROGRAM="$REPO_PATH/run_train.py --config-file examples/config_poro_34b.yaml" | |
#export CMD="${LAUNCHER} ${PROGRAM}" | |
export CMD="$LAUNCHER --node_rank \$SLURM_PROCID $PROGRAM" | |
echo $CMD | |
export WANDB_MODE="offline" | |
SRUN_ARGS=" --threads-per-core=1\ | |
--wait=60 \ | |
--kill-on-bad-exit=1 \ | |
--jobid $SLURM_JOB_ID \ | |
" | |
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
LOG_PATH="logs/${SLURM_JOB_NAME}_${DATETIME}.log"
mkdir -p logs  # added: tee -a fails if the log directory does not exist
srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
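# Usage (a sketch; the script file name is an assumption):
#   sbatch launch_poro_34b.sbatch
# Without an explicit --job-name, $SLURM_JOB_NAME defaults to the script's
# file name, which becomes the log file prefix under logs/.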