参考: https://www.substratus.ai/blog/kind-with-gpus 。第一步,先把 docker 配好:
# Point Docker at the NVIDIA container runtime and make it the default
# runtime, then restart the Docker service.
sudo nvidia-ctk runtime configure \
  --runtime=docker \
  --set-as-default
sudo systemctl restart docker
Set accept-nvidia-visible-devices-as-volume-mounts = true in /etc/nvidia-container-runtime/config.toml:
# Rewrite every line that mentions the option (a commented-out one included)
# so that the whole line becomes the enabled setting.
sudo sed -i \
  's/^.*accept-nvidia-visible-devices-as-volume-mounts.*$/accept-nvidia-visible-devices-as-volume-mounts = true/' \
  /etc/nvidia-container-runtime/config.toml
安装 Kind
# Kubernetes version of the node image used when creating the cluster.
export KUBE_VERSION="1.32.0"
# Look up the tag of the latest kind release via the GitHub API.
# -f makes curl return an error on HTTP failures (for example API rate
# limiting) instead of passing an error document on to grep.
KIND_VERSION=$(
  curl -fsSL https://api.github.com/repos/kubernetes-sigs/kind/releases/latest \
    | grep '"tag_name":' \
    | sed -E 's/.*"([^"]+)".*/\1/'
)
# Stop here if no tag was found; otherwise the download URL built from
# KIND_VERSION would be malformed.
: "${KIND_VERSION:?could not determine the latest kind release tag}"
export KIND_VERSION
# Configure the kernel: load br_netfilter now and on every boot, and let
# iptables/ip6tables see bridged traffic.
sudo tee /etc/modules-load.d/k8s.conf <<'EOF'
br_netfilter
EOF
# modules-load.d is only read at boot; load the module right away so the
# net.bridge.* sysctl keys exist when `sysctl --system` runs below.
sudo modprobe br_netfilter
sudo tee /etc/sysctl.d/k8s.conf <<'EOF'
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
# Re-apply all sysctl configuration files, including the one just written.
sudo sysctl --system
# Download the kind linux/amd64 binary from the DaoCloud mirror and install it.
# -f aborts on HTTP errors so an error page is never saved as ./kind, and the
# :? expansion refuses to build the URL when KIND_VERSION is empty. The steps
# are chained so nothing is installed after a failed download.
curl -fLo ./kind "https://files.m.daocloud.io/github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION:?KIND_VERSION is not set}/kind-linux-amd64" \
  && chmod +x ./kind \
  && sudo mv ./kind /usr/bin/kind
# Make sure the Docker daemon is running (needs root, like the other
# privileged steps in this guide).
sudo service docker start
# Write the kind cluster definition. The heredoc delimiter is quoted so the
# YAML is written verbatim, and the nesting under the control-plane node is
# significant: extraMounts belongs to the node entry, and each mount is a
# hostPath/containerPath pair.
sudo tee /root/kind.yml <<'EOF'
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
  # required for GPU workaround
  extraMounts:
  - hostPath: /dev/null
    containerPath: /var/run/nvidia-container-devices/all
  - hostPath: /root
    containerPath: /root
EOF
# Create the cluster from that config, using the mirrored node image that
# matches KUBE_VERSION (the :? expansion stops if it is unset).
kind create cluster --config /root/kind.yml --image "docker.m.daocloud.io/kindest/node:v${KUBE_VERSION:?KUBE_VERSION is not set}"
Workaround for issue with missing required file /sbin/ldconfig.real:
# Inside the kind node container, create /sbin/ldconfig.real as a symlink
# pointing at /sbin/ldconfig. Background:
# https://github.com/NVIDIA/nvidia-docker/issues/614#issuecomment-423991632
docker exec --interactive --tty kind-control-plane \
  ln -s /sbin/ldconfig /sbin/ldconfig.real
安装 gpu-operator,参考: https://gist.github.com/yankay/5144e4a00845000f3e1d6c7fabb0b973 ,然后配置 GPU time-slicing:
# Write the time-slicing ConfigMap manifest to time-slicing-config.yaml, then
# create it in the gpu-operator namespace. The nesting matters: "any" is a
# block scalar holding the device-plugin config, which lists nvidia.com/gpu
# with 10 replicas under sharing.timeSlicing.
cat <<'EOF' > time-slicing-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
data:
  any: |-
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        resources:
        - name: nvidia.com/gpu
          replicas: 10
EOF
kubectl create -n gpu-operator -f time-slicing-config.yaml
# Merge-patch the cluster-policy ClusterPolicy so that its devicePlugin config
# names the "time-slicing-config" ConfigMap with "any" as the default entry.
kubectl patch clusterpolicies.nvidia.com/cluster-policy \
  --namespace gpu-operator \
  --type merge \
  --patch '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "any"}}}}'
看效果
# Print only the lines of the node description that mention "gpu".
kubectl describe node | grep -F 'gpu'