# Grab the first node's name and open an interactive root shell on it
# (requires the kubectl node-shell plugin: https://github.com/kvaps/kubectl-node-shell).
NODE_NAME="$(kubectl get node -o jsonpath="{.items[0].metadata.name}")"
kubectl node-shell "$NODE_NAME"
# Next steps (manual): copy rebuild_kernel.sh onto the node (e.g. /opt/rebuild_kernel.sh)
# and run `bash rebuild_kernel.sh`; it reboots into the new kernel if successful.
#!/usr/bin/env bash
#
# Install the NVIDIA k8s device plugin, then hop onto a node to debug.
#
# NVIDIA_DRIVER_VERSION constraints:
#   * must be available from
#     https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/
#   * must be paired with a CUDA release from https://developer.nvidia.com/cuda-downloads
#   * not every driver version releases fabricmanager artifacts or pairs with a cuda version -_-
#   * we recommend using production or LTS branches when possible.
# e.g. 12.6.2 + 560.35.03 are here:
#   https://developer.nvidia.com/cuda-12-6-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
#   https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-560.35.03-archive.tar.xz
readonly NVIDIA_DRIVER_VERSION="560.35.03"

# Install nvidia device plugin (without env var).
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml

# ssh OR nsenter node using node-shell + privileged pod.
# Tried both to eliminate any container mount issues — same behavior.
# https://github.com/kvaps/kubectl-node-shell
kubectl node-shell aks-nca100-36400834-vmss000000
# Kyverno cluster-wide policy; rule body below is truncated in this source.
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
  name: exclude-all-pods-http-proxy
spec:
  # Do not re-mutate already-existing resources when the policy itself changes.
  mutateExistingOnPolicyUpdate: false
  rules:
    - name: pod-ns
      match:
        any:
        # NOTE(review): fragment truncated here in the original paste —
        # the match targets and mutate/exclude stanza are not visible.
# NVIDIA device plugin DaemonSet (header only; spec is truncated in this source).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
  # NOTE(review): fragment truncated here in the original paste —
  # updateStrategy value and pod template are not visible.
import http.client | |
import xml.etree.ElementTree as ET | |
from urllib.parse import urlparse | |
import json | |
from subprocess import Popen, PIPE, STDOUT | |
import base64 | |
try: | |
# request goalstate from wireserver | |
wireserver = "168.63.129.16" |
# Azure resource-group / cluster parameters.
export GROUP=ace-mig
export NAME=ace-mig
export LOCATION=eastus
az group create -g "${GROUP}" -l "${LOCATION}"
# Create a cluster with a default pool with some typical parameters — not really
# relevant. Only key piece: use k8s version >= 1.25.0 for Ubuntu 22.04 with cgroupv2.
# n.b.: nodes are in 172.18.0.0/16
# Hash the eth0 subnet range with sha256 -> first 10 hex digits for the prefix.
hash_prefix=$(ip r | grep -E "\/[0-9]+ dev eth0" | cut -d' ' -f1 | sha256sum | head -c 10)
# Prepend "fd" for a unique-local-address prefix for 6rd routing
# (fd + 10 hex digits = 12 hex digits = three 16-bit groups).
rd_prefix="fd${hash_prefix}"
# Add colons between each group of 4 hex chars, e.g. fdab12cd34ef -> fdab:12cd:34ef.
rd_prefix_formatted=$(printf '%s' "${rd_prefix}" | fold -w4 | paste -sd:)
# Get the local IPv4 address of eth0 (strip the /XX suffix).
# NOTE(review): `cut -d' ' -f6` assumes the exact `ip a` field layout — confirm on target distro.
local_addr=$(ip a show dev eth0 | grep -E 'inet ' | cut -d' ' -f6 | cut -d'/' -f1)
# get local IPv4 subnet as XX.XX.XX.XX/XX
# DaemonSet header (pod template truncated in this source).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  # YAML anchor so the label values reuse the DaemonSet name verbatim.
  name: &name kubelet-killer-30sec
  labels:
    app: *name
spec:
  selector:
    matchLabels:
      app: *name
  # NOTE(review): fragment truncated here in the original paste —
  # the pod template (spec.template) is not visible.
# for context: this is a 22.04 Hetzner machine upgraded from 20.04, possibly running a kernel I rebuilt (I forget).
root@Ubuntu-2004-focal-64-minimal ~ # cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.2 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.2 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"