run_34b.sbatch (gist by @mehdidc, created April 17, 2024)
A Slurm batch script that launches a 34B-parameter training run
(run_train.py with examples/config_poro_34b.yaml) across 8 JUWELS Booster
nodes via torch.distributed.run.

#!/bin/bash
# Resources: 8 nodes on the JUWELS Booster partition, 4 GPUs each, one Slurm
# task per node (torch.distributed.run spawns the per-GPU workers itself).
#SBATCH --nodes=8
#SBATCH --time=00:20:00
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=booster
#SBATCH --account=transfernetx
# Exclude nodes presumably flagged as unhealthy at submission time.
#SBATCH --exclude=jwb[0059,0067,0069,0193,0284,0287,0294,0359,0418,0637,0829,0832,0838,0898,0907,0921,0971,1004,1023,1029,1213,1126]
#SBATCH --threads-per-core=1
#SBATCH --mem=0
REPO_PATH=$(pwd)
# Limit CUDA to one hardware work queue so kernels run in launch order;
# Megatron-style frameworks expect this when overlapping compute and
# communication.
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
# Use the first node of the allocation as the rendezvous host.
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# JSC convention: appending "i" to a hostname resolves to the node's
# InfiniBand interface, keeping rendezvous traffic on the fast fabric.
# Alternative suffixes kept for other clusters:
#export MASTER_ADDR="${MASTER_ADDR}.juwels"
#export MASTER_ADDR="${MASTER_ADDR}.jureca"
export MASTER_ADDR="${MASTER_ADDR}i"
export MASTER_PORT=12345
export NNODES=$SLURM_JOB_NUM_NODES
export GPUS_PER_NODE=4
echo $MASTER_ADDR $MASTER_PORT
ml GCC
# Activate the Python environment (a mamba installation on project storage).
source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly
export PYTHONPATH=$(pwd)/src:$PYTHONPATH
export OMP_NUM_THREADS=1
export TRITON_CACHE_DIR=cache
# Debugging aids: synchronous kernel launches and verbose NCCL output slow the
# run down but make hangs and failed collectives far easier to localize.
export CUDA_LAUNCH_BLOCKING=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_IB_TIMEOUT=20
# Pin NCCL's bootstrap socket traffic to the InfiniBand interface.
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO
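# Optional sanity check (not in the original script): confirm the "i"-suffixed
# rendezvous host resolves before the allocation is spent on a failed launch.
getent hosts "$MASTER_ADDR" || echo "WARNING: $MASTER_ADDR does not resolve"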
# Earlier launcher variant using the elastic c10d rendezvous backend, kept
# for reference:
#LAUNCHER="python -u -m torch.distributed.run \
#--nproc_per_node $GPUS_PER_NODE \
#--nnodes $NNODES \
#--node_rank \$SLURM_PROCID \
#--master_addr $MASTER_ADDR \
#--master_port $MASTER_PORT \
#--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
#--rdzv_backend c10d \
#"
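# Active launcher: the static rendezvous backend fixes the worker group at
# launch, so each node's rank must be supplied explicitly (added below from
# $SLURM_PROCID); --max_restarts 0 fails fast instead of re-spawning workers,
# and --tee 3 mirrors each worker's stdout and stderr through the launcher.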
LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend static \
--max_restarts 0 \
--tee 3 \
"
PROGRAM="$REPO_PATH/run_train.py --config-file examples/config_poro_34b.yaml"
# \$SLURM_PROCID is escaped so it expands per node inside srun, where it holds
# that node's rank within the job.
export CMD="$LAUNCHER --node_rank \$SLURM_PROCID $PROGRAM"
echo $CMD
# Log Weights & Biases runs offline (compute nodes typically have no outbound
# internet access).
export WANDB_MODE="offline"
# Note the space before each backslash: inside double quotes, backslash-newline
# is a line continuation, so without it the flags would be glued together.
SRUN_ARGS="--threads-per-core=1 \
--wait=60 \
--kill-on-bad-exit=1 \
--jobid $SLURM_JOB_ID \
"
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
LOG_PATH="logs/${SLURM_JOB_NAME}_${DATETIME}.log"
mkdir -p logs  # make sure the log directory exists before tee appends to it
srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"
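# Usage sketch (assumes the script sits at the repository root next to src/
# and examples/; with no --job-name, SLURM_JOB_NAME defaults to the file name):
#   sbatch run_34b.sbatch
#   tail -f logs/run_34b.sbatch_date_*.log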