Created
April 17, 2024 07:51
Revisions
-
mehdidc created this gist
Apr 17, 2024. There are no files selected for viewing
#!/bin/bash
# Slurm batch script: starts run_train.py with examples/config_poro_34b.yaml
# through torch.distributed.run, one launcher task per node (8 nodes, 4 GPUs
# each), and mirrors the combined stdout/stderr to a timestamped file in logs/.
#
# Run it with the repository root as working directory: REPO_PATH, PYTHONPATH,
# the config path, the Triton cache and the log directory are all resolved
# relative to the directory the script starts in.
#SBATCH --nodes=8
#SBATCH --time=00:20:00
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=booster
#SBATCH --account=transfernetx
#SBATCH --exclude=jwb[0059,0067,0069,0193,0284,0287,0294,0359,0418,0637,0829,0832,0838,0898,0907,0921,0971,1004,1023,1029,1213,1126]
#SBATCH --threads-per-core=1
#SBATCH --mem=0

REPO_PATH=$(pwd)

export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}

# Rendezvous host: the first hostname of the allocation. Abort early when the
# lookup yields nothing; otherwise the endpoint below would be just "i" and the
# job would only fail later, at rendezvous time.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "error: could not determine the first node of the allocation" >&2
  exit 1
fi
#export MASTER_ADDR="${MASTER_ADDR}.juwels"
#export MASTER_ADDR="${MASTER_ADDR}.jureca"
#export MASTER_ADDR="${MASTER_ADDR}"
# NOTE(review): the "i" suffix presumably selects the node's InfiniBand
# hostname on this cluster -- confirm.
export MASTER_ADDR="${MASTER_ADDR}i"
export MASTER_PORT=12345
export NNODES=$SLURM_JOB_NUM_NODES
export GPUS_PER_NODE=4
echo "$MASTER_ADDR $MASTER_PORT"

ml GCC
source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly

# Prepend the repository sources without leaving a dangling ':' when
# PYTHONPATH starts out empty.
export PYTHONPATH="${REPO_PATH}/src${PYTHONPATH:+:${PYTHONPATH}}"
export OMP_NUM_THREADS=1
export TRITON_CACHE_DIR=cache
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid -- confirm
# it is intended for this run.
export CUDA_LAUNCH_BLOCKING=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_IB_TIMEOUT=20
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO

# Previous launcher variant (c10d rendezvous), kept for reference:
#LAUNCHER="python -u -m torch.distributed.run \
#--nproc_per_node $GPUS_PER_NODE \
#--nnodes $NNODES \
#--node_rank \$SLURM_PROCID \
#--master_addr $MASTER_ADDR \
#--master_port $MASTER_PORT \
#--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
#--rdzv_backend c10d \
#"

# The launcher stays a plain string because it is handed to `bash -c` on every
# node; it is assembled piecewise so each option sits on its own line.
LAUNCHER="python -u -m torch.distributed.run"
LAUNCHER+=" --nproc_per_node $GPUS_PER_NODE"
LAUNCHER+=" --nnodes $NNODES"
LAUNCHER+=" --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT"
LAUNCHER+=" --rdzv_backend static"
LAUNCHER+=" --max_restarts 0"
LAUNCHER+=" --tee 3"

PROGRAM="$REPO_PATH/run_train.py --config-file examples/config_poro_34b.yaml"

#export CMD="${LAUNCHER} ${PROGRAM}"
# \$SLURM_PROCID is escaped on purpose: it must be expanded by the per-task
# `bash -c` that srun starts, not by this batch shell.
export CMD="$LAUNCHER --node_rank \$SLURM_PROCID $PROGRAM"
echo "$CMD"

export WANDB_MODE="offline"

# An array keeps every srun option a separate word without relying on
# backslash continuations inside a quoted string.
SRUN_ARGS=(
  --threads-per-core=1
  --wait=60
  --kill-on-bad-exit=1
  --jobid "$SLURM_JOB_ID"
)

LOG_DIR="logs"
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
LOG_PATH="${LOG_DIR}/${SLURM_JOB_NAME}_${DATETIME}.log"
# tee cannot create missing parent directories; make sure the target exists.
# A failure here only costs the log copy, so warn instead of aborting the job.
mkdir -p -- "$LOG_DIR" \
  || echo "warning: cannot create ${LOG_DIR}/; output will not be saved" >&2

# Without pipefail the pipeline (and therefore the batch script) would report
# tee's exit status, hiding a failed srun from the scheduler.
set -o pipefail
srun "${SRUN_ARGS[@]}" bash -c "$CMD" 2>&1 | tee -a "$LOG_PATH"