Created
August 22, 2023 17:02
-
-
Save alexeldeib/1755bc9f1b628886605186407064087c to your computer and use it in GitHub Desktop.
nvidia cgroupv2 repro test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Repro steps for a GPU pod losing NVML access after `systemctl daemon-reload`.
# These commands are run across two shells: one on the node (via node-shell or
# ssh) and one with normal kubectl access to the cluster.

# install nvidia device plugin (without env var)
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml

# ssh OR nsenter node using node-shell + privileged pod
# tried both to eliminate any container mount issues.
# same behavior
# https://github.com/kvaps/kubectl-node-shell
kubectl node-shell aks-nca100-36400834-vmss000000

# in a separate shell:
# write out the test manifest
# (the quoted 'EOF' delimiter keeps the shell from expanding anything inside
# the manifest; the terminator must sit alone on its line)
tee nvidia-smi-loop.yaml > /dev/null <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: cuda-nvidia-smi-loop
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda
      image: "nvcr.io/nvidia/cuda:12.0.0-base-ubuntu20.04"
      command: ["/bin/sh", "-c"]
      # list the visible GPUs every 5 seconds so the failure shows up in logs
      args: ["while true; do nvidia-smi -L; sleep 5; done"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF

kubectl apply -f nvidia-smi-loop.yaml

# back in node-shell, trigger the issue
systemctl daemon-reload

# in the separate shell, follow the pod output
kubectl logs -f cuda-nvidia-smi-loop
# observed output: nvidia-smi lists the GPU until the daemon-reload, then fails
# GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-0451fe54-e0a1-36c5-eeb5-19025f49663e)
# GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-0451fe54-e0a1-36c5-eeb5-19025f49663e)
# GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-0451fe54-e0a1-36c5-eeb5-19025f49663e)
# GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-0451fe54-e0a1-36c5-eeb5-19025f49663e)
# Failed to initialize NVML: Unknown Error
# Failed to initialize NVML: Unknown Error
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment