#!/bin/sh
set -e
export TERM=ansi
if [ $# -ne 1 ]; then
    echo "usage: $0 <ngpus:int>"
    echo "This script has to be executed directly on the t1-(45|90|180) VM"
    exit 1
fi
NGPUS=$1
# Re-exec the script through sudo when not running as root
if [ `id -u` -ne 0 ]; then
    sudo "$0" "$@"
    exit 0
fi
echo "Testing VM `hostname` with $NGPUS GPU(s)"
# tput setaf colors: 0 black, 1 red, 2 green, 3 yellow, 4 blue, 5 magenta, 6 cyan, 7 white
# GPU             PCI ID       fp16 settings                       fp32 settings
# "GTX 1080 Ti"   "10de:1b06"  --precision=fp16 --batch_size=128 / --precision=fp32 --batch_size=64
# "V100"          "10de:1db4"  --precision=fp16 --batch_size=256 / --precision=fp32 --batch_size=128
# assert_eq <actual> <expected> <message>: print OK/FAIL and abort the run on mismatch
assert_eq() {
    val1=$1
    val2=$2
    msg=$3
    tput bold || true
    echo -n "$msg: "
    tput sgr0 || true
    if [ "$val1" = "$val2" ]; then
        tput setaf 2 || true
        echo "OK ($val1 = $val2)"
        tput sgr0 || true
    else
        tput setaf 1 || true
        echo "FAIL ($val1 != $val2)"
        tput sgr0 || true
        exit 1
    fi
}
# check <command>: echo the command in bold cyan, then run it
check() {
    tput bold || true
    tput setaf 6 || true
    echo "$1"
    tput sgr0 || true
    eval "$1"
}
test_lspci() {
    # 10de:1db4 is the PCI vendor:device ID of the Tesla V100 (see table above)
    count=`lspci -n | grep '10de:1db4' | wc -l`
    assert_eq "$count" "$NGPUS" "Number of devices returned by lspci"
}
test_nvidia_smi_binary() {
    count=`(which nvidia-smi || true) | grep nvidia-smi | wc -l`
    assert_eq "$count" 1 "Is nvidia-smi installed"
}
test_nvidia_smi() {
    count=`nvidia-smi --list-gpus | grep 'V100' | wc -l`
    assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi"
}
test_docker_hello_world() {
    docker run hello-world
}
test_docker_nvidia_smi() {
    count=`docker run --runtime=nvidia --rm nvidia/cuda:10.2-base nvidia-smi --list-gpus | grep 'V100' | wc -l`
    assert_eq "$count" "$NGPUS" "Number of devices returned by nvidia-smi inside docker"
}
latest_tag() {
    # NGC TensorFlow/PyTorch tags follow YY.MM-py3; go back ~60 days so the tag is likely already published
    latest=`date --date='-60 day' "+%y.%m-py3"`
    echo -n "$latest"
}
test_tf_basic_fp16() {
    nvidia-docker run --rm nvcr.io/nvidia/tensorflow:`latest_tag` mpiexec \
        --allow-run-as-root -np $NGPUS python /workspace/nvidia-examples/cnn/resnet.py \
        --layers=50 --precision=fp16 --batch_size=256 --num_iter=100
}
test_tf_basic_fp32() {
    nvidia-docker run --rm nvcr.io/nvidia/tensorflow:`latest_tag` \
        mpiexec --allow-run-as-root -np $NGPUS python /workspace/nvidia-examples/cnn/resnet.py \
        --layers=50 --precision=fp32 --batch_size=128 --num_iter=100
}
check test_lspci
check test_nvidia_smi_binary
check test_nvidia_smi
check test_docker_hello_world
check test_docker_nvidia_smi
check test_tf_basic_fp16
check test_tf_basic_fp32
exit 0
# # Run LSTM (~9400 wps on t1-45)
# cd /workspace/nvidia-examples/big_lstm
# ./download_1b_words_data.sh
# python single_lm_train.py --mode=train --logdir=./logs --num_gpus=1 --datadir=./data/1-billion-word-language-modeling-benchmark-r13output/ --hpconfig run_profiler=False,max_time=90,num_steps=20,num_shards=8,num_layers=2,learning_rate=0.2,max_grad_norm=1,keep_prob=0.9,emb_size=1024,projected_size=1024,state_size=8192,num_sampled=8192,batch_size=512
# # Run basic PyTorch code
# nvidia-docker run --rm --ipc=host nvcr.io/nvidia/pytorch:`latest_tag` \
#     python /opt/pytorch/examples/word_language_model/main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 3
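# Example invocation (sketch only; "check-gpus.sh" is a hypothetical filename, assuming the
# script is saved under that name on a t1-45 VM with a single V100, per the usage message above):
#   chmod +x check-gpus.sh
#   ./check-gpus.sh 1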