Skip to content

Instantly share code, notes, and snippets.

@alexeldeib
Created April 25, 2025 17:17
Show Gist options
  • Save alexeldeib/0600c42a5bf4392f11c7eba1a914079e to your computer and use it in GitHub Desktop.
Save alexeldeib/0600c42a5bf4392f11c7eba1a914079e to your computer and use it in GitHub Desktop.
Set up NVIDIA drivers with Fabric Manager (FM) from upstream URLs.
#!/usr/bin/env bash
# Install NVIDIA datacenter driver + CUDA toolkit + Fabric Manager from upstream URLs.
#
# version must be available from https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/
# version must be paired with a cuda release from https://developer.nvidia.com/cuda-downloads
# not every driver version releases fabricmanager artifacts or pairs with a cuda version -_-
# we recommend using production or LTS branches when possible.
# e.g. 12.6.2 + 560.35.03 are here
# https://developer.nvidia.com/cuda-12-6-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
# https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-560.35.03-archive.tar.xz

# Abort on unhandled failures, unset variables, and mid-pipeline errors.
set -euo pipefail

readonly NVIDIA_DRIVER_VERSION="560.35.03"
readonly CUDA_DRIVER_VERSION="560.35.03"
readonly CUDA_MAJOR_MINOR_PATCH="12.6.2"
# Strip the trailing ".patch" component (e.g. 12.6.2 -> 12.6) with parameter
# expansion instead of an echo|cut subprocess pipeline.
readonly CUDA_MAJOR_MINOR="${CUDA_MAJOR_MINOR_PATCH%.*}"
load_nvidia_peermem() {
  # Persist the nvidia-peermem module across reboots and load it immediately.
  # NOTE(review): per the caller's comment, loading may fail on PCIe
  # (non-NVSwitch) parts; failure here is reported but not fatal.
  local conf="/etc/modules-load.d/nvidia-peermem.conf"
  # -s suppresses grep's error when the conf file does not exist yet.
  if ! grep -qs 'nvidia-peermem' "$conf"; then
    echo "nvidia-peermem not found in modules-load.conf, adding..."
    echo "nvidia-peermem" | tee "$conf" >/dev/null
  fi
  if ! lsmod | grep -q 'nvidia_peermem'; then
    echo "nvidia-peermem module is not loaded, loading now..."
    # Test modprobe directly rather than via the $? anti-pattern; keeping the
    # call in the condition also keeps a failure from tripping `set -e`.
    if modprobe nvidia_peermem; then
      echo "nvidia-peermem module loaded successfully."
    else
      echo "Failed to load nvidia-peermem module." >&2
    fi
  else
    echo "nvidia-peermem module is already loaded."
  fi
}
setup_cuda_environment() {
  # Add the versioned CUDA toolkit to root's PATH/LD_LIBRARY_PATH (once) and
  # refresh the dynamic linker cache.
  local cuda_path="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
  local root_bashrc="/root/.bashrc"
  # -s suppresses grep's error if .bashrc does not exist yet. printf with
  # plain redirection replaces the original nested `bash -c '…'$var'…'` calls,
  # which spliced variables through quote boundaries and would break (or
  # inject) on any special character in the paths.
  if ! grep -qs "PATH=\"$cuda_path/bin:\$PATH\"" "$root_bashrc"; then
    {
      printf 'export PATH="%s/bin:$PATH"\n' "$cuda_path"
      printf 'export LD_LIBRARY_PATH="%s/lib64:$LD_LIBRARY_PATH"\n' "$cuda_path"
    } >>"$root_bashrc"
    echo "CUDA environment variables set for root user."
  fi
  # ldconfig reads /etc/ld.so.conf*, not shell environment variables, so one
  # call suffices; the original `bash -c 'source …; ldconfig'` subshell had no
  # effect beyond running ldconfig a second time.
  ldconfig
  echo "CUDA environment variables loaded and system configuration updated."
}
check_nvidia_fabricmanager() {
  # Print the nvidia-fabricmanager service status and the NvSwitch fabric
  # state reported by the driver (first GPU only).
  echo "Checking nvidia-fabricmanager service state..."
  systemctl --no-pager -l status nvidia-fabricmanager
  echo "Checking Nvidia NvSwitch fabric state..."
  nvidia-smi -q -i 0 | grep -iA2 Fabric
}
install_nvidia_fabric_manager() {
  # Download the Fabric Manager redistributable matching the driver version
  # and install its binaries, libraries, config, headers, and systemd unit.
  # NOTE(review): the archive URL is x86_64-only even though the library
  # destination below adapts for aarch64 — confirm an ARM artifact path
  # before relying on this on aarch64 hosts.
  echo "Checking if NVIDIA Fabric Manager is installed and running..."
  echo "NVIDIA Fabric Manager not found, downloading and installing..."
  TAR_FILE="fabricmanager-linux-x86_64-${NVIDIA_DRIVER_VERSION}-archive.tar.xz"
  TAR_URL="https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/${TAR_FILE}"
  EXTRACT_DIR="fabricmanager_installation"
  # Fail fast on download/extract errors instead of copying garbage below.
  wget "$TAR_URL" -O "$TAR_FILE" || { echo "Failed to download $TAR_URL" >&2; return 1; }
  mkdir -p "$EXTRACT_DIR"
  tar -xf "$TAR_FILE" -C "$EXTRACT_DIR" || { echo "Failed to extract $TAR_FILE" >&2; return 1; }
  # An unchecked cd followed by cp would install files from the wrong place.
  cd "$EXTRACT_DIR/fabricmanager-linux-x86_64-${NVIDIA_DRIVER_VERSION}-archive" || return 1
  echo "Starting NVIDIA Fabric Manager installation"
  if systemctl is-active --quiet nvidia-fabricmanager; then
    echo "Fabric Manager service is running, stopping it..."
    systemctl stop nvidia-fabricmanager
  else
    echo "Fabric Manager service is not running."
  fi
  local arch_type lib_loc
  arch_type=$(uname -m)
  lib_loc="/usr/lib/x86_64-linux-gnu"
  if [ "$arch_type" = 'aarch64' ]; then
    lib_loc="/usr/lib/aarch64-linux-gnu"
  fi
  cp lib/libnvfm.so.1 "$lib_loc"
  # -P preserves the libnvfm.so symlink rather than copying its target twice.
  cp -P lib/libnvfm.so "$lib_loc"
  cp bin/nv-fabricmanager /usr/bin
  cp bin/nvswitch-audit /usr/bin
  cp systemd/nvidia-fabricmanager.service /lib/systemd/system
  mkdir -p /usr/share/nvidia/nvswitch
  cp share/nvidia/nvswitch/* /usr/share/nvidia/nvswitch/
  cp etc/fabricmanager.cfg /usr/share/nvidia/nvswitch/
  cp include/nv_fm_agent.h /usr/include
  cp include/nv_fm_types.h /usr/include
  mkdir -p /usr/share/doc/nvidia-fabricmanager
  cp LICENSE /usr/share/doc/nvidia-fabricmanager
  cp third-party-notices.txt /usr/share/doc/nvidia-fabricmanager
  systemctl enable nvidia-fabricmanager
  echo "Fabric Manager installation completed."
  cd ..
  systemctl daemon-reload
  systemctl restart nvidia-fabricmanager
}
install_cuda() {
  # Install the NVIDIA datacenter driver (open kernel modules) and the CUDA
  # toolkit from the official runfiles; skip everything if nvcc is present.
  apt update
  # </dev/null keeps apt from consuming this script's stdin.
  apt install -yq wget kmod </dev/null
  if command -v nvcc >/dev/null 2>&1; then
    echo "CUDA is already installed."
    nvcc --version
    return 0
  fi
  echo "CUDA not found. Installing CUDA..."
  apt update
  apt install -y build-essential dkms freeglut3 freeglut3-dev libxi-dev libxmu-dev
  # Pin gcc-11 as the default compiler for the DKMS kernel-module builds.
  apt install -y gcc-11 g++-11
  update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
  local driver_run="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
  wget "https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/${driver_run}" || { echo "Failed to download driver runfile" >&2; return 1; }
  chmod +x "$driver_run"
  # -s: silent/non-interactive; kernel-open selects the open GPU kernel modules.
  "./${driver_run}" -j"$(nproc)" -a --no-drm --dkms -s --kernel-module-build-directory=kernel-open
  local cuda_file="cuda_${CUDA_MAJOR_MINOR_PATCH}_${CUDA_DRIVER_VERSION}_linux.run"
  wget "https://developer.download.nvidia.com/compute/cuda/${CUDA_MAJOR_MINOR_PATCH}/local_installers/${cuda_file}" -O "$cuda_file" || { echo "Failed to download CUDA runfile" >&2; return 1; }
  chmod +x "$cuda_file"
  # --toolkit only: the driver was already installed by the runfile above.
  "./${cuda_file}" --toolkit --kernelobjects --silent --kernel-module-build-directory=kernel-open -- -j"$(nproc)"
  echo "CUDA installation completed."
}
setup_persistenced() {
# Install and start a systemd unit for the NVIDIA persistence daemon.
# Keeping the driver initialized speeds up tools (e.g. nvidia-smi) that would
# otherwise re-attach to the GPU on every invocation.
# The quoted 'eof' delimiter makes the here-doc literal: nothing inside the
# unit file text is shell-expanded.
tee /etc/systemd/system/nvidia-persistenced.service <<'eof'
[Unit]
Description=NVIDIA Persistence Daemon
Wants=syslog.target
[Service]
Type=forking
ExecStart=/usr/bin/nvidia-persistenced --user root --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
[Install]
WantedBy=multi-user.target
eof
systemctl enable nvidia-persistenced
systemctl start nvidia-persistenced
}
install_nv_container_cli() {
  # Install the NVIDIA Container Toolkit from the CUDA apt repo and wire it
  # into the docker and containerd runtimes.
  # NOTE(review): the keyring URL is ubuntu2204/x86_64-specific — confirm it
  # matches the target distro/arch before use.
  local keyring="cuda-keyring_1.1-1_all.deb"
  wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${keyring}" -O "$keyring" || { echo "Failed to download cuda keyring" >&2; return 1; }
  dpkg -i "$keyring" || return 1
  rm -f "$keyring"
  apt update
  # </dev/null keeps apt from consuming this script's stdin.
  apt install -yq nvidia-container-toolkit </dev/null
  # NOT NECESSARY FOR SLURM, only k8s/docker
  nvidia-ctk runtime configure --runtime=docker
  nvidia-ctk runtime configure --runtime=containerd
}
# Adding these functions to the main function call
# Orchestrates the full node setup: driver + CUDA, environment, peermem,
# Fabric Manager, and (optionally) persistence daemon / container toolkit.
main() {
install_cuda
setup_cuda_environment
load_nvidia_peermem # possibly not necessary/will fail if using PCIE vs SXM interconnect. should work on DGX H100
install_nvidia_fabric_manager # will only work on NVSwitch-based systems (e.g. DGX/HGX) — TODO confirm, original comment was truncated here
setup_persistenced # optional, accelerates commands which require driver invocations like basic nvidia-smi commands
# install_nv_container_cli
}
# Execute the main function
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment