# Grab the first node's name and open an interactive root shell on it
# (requires the kubectl node-shell plugin: https://github.com/kvaps/kubectl-node-shell).
NODE_NAME="$(kubectl get node -o jsonpath="{.items[0].metadata.name}")"
kubectl node-shell "$NODE_NAME"
# Next steps (manual): copy rebuild_kernel.sh onto the node (e.g. /opt/rebuild_kernel.sh)
# and run `bash rebuild_kernel.sh`; it reboots into the new kernel if successful.
#!/usr/bin/env bash
#
# Install the NVIDIA k8s device plugin, then hop onto a node to debug.
#
# NVIDIA_DRIVER_VERSION constraints:
#   * must be available from
#     https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/
#   * must be paired with a CUDA release from https://developer.nvidia.com/cuda-downloads
#   * not every driver version releases fabricmanager artifacts or pairs with a cuda version -_-
#   * we recommend using production or LTS branches when possible.
# e.g. 12.6.2 + 560.35.03 are here:
#   https://developer.nvidia.com/cuda-12-6-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
#   https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-560.35.03-archive.tar.xz
readonly NVIDIA_DRIVER_VERSION="560.35.03"

# Install nvidia device plugin (without env var).
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml

# ssh OR nsenter node using node-shell + privileged pod.
# Tried both to eliminate any container mount issues — same behavior.
# https://github.com/kvaps/kubectl-node-shell
kubectl node-shell aks-nca100-36400834-vmss000000
# Kyverno cluster-wide policy; rule body below is truncated in this source.
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
  name: exclude-all-pods-http-proxy
spec:
  # Do not re-mutate already-existing resources when the policy itself changes.
  mutateExistingOnPolicyUpdate: false
  rules:
    - name: pod-ns
      match:
        any:
        # NOTE(review): fragment truncated here in the original paste —
        # the match targets and mutate/exclude stanza are not visible.
# NVIDIA device plugin DaemonSet (header only; spec is truncated in this source).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
  # NOTE(review): fragment truncated here in the original paste —
  # updateStrategy value and pod template are not visible.
import http.client | |
import xml.etree.ElementTree as ET | |
from urllib.parse import urlparse | |
import json | |
from subprocess import Popen, PIPE, STDOUT | |
import base64 | |
try: | |
# request goalstate from wireserver | |
wireserver = "168.63.129.16" |
# Azure resource-group / cluster parameters.
export GROUP=ace-mig
export NAME=ace-mig
export LOCATION=eastus
az group create -g "${GROUP}" -l "${LOCATION}"
# Create a cluster with a default pool with some typical parameters — not really
# relevant. Only key piece: use k8s version >= 1.25.0 for Ubuntu 22.04 with cgroupv2.
# n.b.: nodes are in 172.18.0.0/16
# Hash the eth0 subnet range with sha256 -> first 10 hex digits for the prefix.
hash_prefix=$(ip r | grep -E "\/[0-9]+ dev eth0" | cut -d' ' -f1 | sha256sum | head -c 10)
# Prepend "fd" for a unique-local-address prefix for 6rd routing
# (fd + 10 hex digits = 12 hex digits = three 16-bit groups).
rd_prefix="fd${hash_prefix}"
# Add colons between each group of 4 hex chars, e.g. fdab12cd34ef -> fdab:12cd:34ef.
rd_prefix_formatted=$(printf '%s' "${rd_prefix}" | fold -w4 | paste -sd:)
# Get the local IPv4 address of eth0 (strip the /XX suffix).
# NOTE(review): `cut -d' ' -f6` assumes the exact `ip a` field layout — confirm on target distro.
local_addr=$(ip a show dev eth0 | grep -E 'inet ' | cut -d' ' -f6 | cut -d'/' -f1)
# get local IPv4 subnet as XX.XX.XX.XX/XX
# DaemonSet header (pod template truncated in this source).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  # YAML anchor so the label values reuse the DaemonSet name verbatim.
  name: &name kubelet-killer-30sec
  labels:
    app: *name
spec:
  selector:
    matchLabels:
      app: *name
  # NOTE(review): fragment truncated here in the original paste —
  # the pod template (spec.template) is not visible.
# for context: this is a 22.04 Hetzner machine upgraded from 20.04, possibly running a kernel I rebuilt (I forget).
root@Ubuntu-2004-focal-64-minimal ~ # cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.2 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.2 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"