Created
April 25, 2025 17:17
-
-
Save alexeldeib/0600c42a5bf4392f11c7eba1a914079e to your computer and use it in GitHub Desktop.
Set up NVIDIA drivers with Fabric Manager (FM) from upstream URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# version must be available from https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/
# version must be paired with a cuda release from https://developer.nvidia.com/cuda-downloads
# not every driver version releases fabricmanager artifacts or pairs with a cuda version -_-
# we recommend using production or LTS branches when possible.
# e.g. 12.6.2 + 560.35.03 are here
# https://developer.nvidia.com/cuda-12-6-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
# https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-560.35.03-archive.tar.xz

# Driver version used for the driver runfile and the Fabric Manager archive.
NVIDIA_DRIVER_VERSION="560.35.03"
# Driver version that appears in the CUDA runfile name (cuda_<cuda>_<driver>_linux.run).
CUDA_DRIVER_VERSION="560.35.03"
# Full CUDA release used in the runfile download URL.
CUDA_MAJOR_MINOR_PATCH="12.6.2"
# First two dot-separated components of the release (12.6.2 -> 12.6); used for /usr/local/cuda-<major.minor>.
CUDA_MAJOR_MINOR="$(printf '%s\n' "$CUDA_MAJOR_MINOR_PATCH" | cut -d'.' -f1-2)"
#######################################
# Make sure the nvidia-peermem kernel module is listed for loading at boot
# and is loaded in the running kernel.
# Arguments: none
# Outputs:   progress messages on stdout; the load-failure message on stderr
# Returns:   0 if the module is (or already was) loaded, otherwise the
#            non-zero exit status of modprobe
#######################################
load_nvidia_peermem() {
  local conf_file="/etc/modules-load.d/nvidia-peermem.conf"
  local rc

  # -s: a missing conf file simply means "not configured yet"; no need to
  # print grep's "No such file" error before creating it.
  if ! grep -qs 'nvidia-peermem' "$conf_file"; then
    echo "nvidia-peermem not found in modules-load.conf, adding..."
    echo "nvidia-peermem" | tee "$conf_file" >/dev/null
  fi

  if lsmod | grep -q 'nvidia_peermem'; then
    echo "nvidia-peermem module is already loaded."
    return 0
  fi

  echo "nvidia-peermem module is not loaded, loading now..."
  if modprobe nvidia_peermem; then
    echo "nvidia-peermem module loaded successfully."
  else
    # In the else branch $? still holds modprobe's exit status.
    rc=$?
    echo "Failed to load nvidia-peermem module." >&2
    return "$rc"
  fi
}
#######################################
# Persist the CUDA bin/lib64 directories in a bashrc file (once) and refresh
# the dynamic linker cache.
# Globals:   CUDA_MAJOR_MINOR (read) - selects /usr/local/cuda-<major.minor>
# Arguments: $1 - optional path of the bashrc to update
#                 (default: /root/.bashrc)
# Outputs:   progress messages on stdout
# Returns:   non-zero if the bashrc cannot be written
#######################################
setup_cuda_environment() {
  local root_bashrc="${1:-/root/.bashrc}"
  local cuda_path="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
  # Literal text whose presence means the exports were already appended.
  local path_marker="PATH=\"${cuda_path}/bin:\$PATH\""

  # -F: match the marker literally ('.' and '$' are not regex operators here).
  # -s: a bashrc that does not exist yet is treated as "marker absent".
  if ! grep -qsF -- "$path_marker" "$root_bashrc"; then
    {
      printf 'export PATH="%s/bin:$PATH"\n' "$cuda_path"
      # ${VAR:+:$VAR} avoids a trailing ':' when LD_LIBRARY_PATH is unset; an
      # empty entry would add the current directory to the library search path.
      printf 'export LD_LIBRARY_PATH="%s/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"\n' "$cuda_path"
    } >>"$root_bashrc" || return 1
    echo "CUDA environment variables set for root user."
  fi

  # Sourcing the bashrc in a child shell (as done previously) cannot change
  # this shell's environment, so only the linker cache refresh is kept.
  ldconfig
  echo "CUDA environment variables configured and system configuration updated."
}
#######################################
# Print the nvidia-fabricmanager service status and the "Fabric" section of
# the nvidia-smi query output for one GPU.
# Arguments: $1 - optional GPU index passed to `nvidia-smi -i` (default: 0)
# Outputs:   systemctl status output and the matching nvidia-smi lines
# Returns:   exit status of the final grep (non-zero when no line matches)
#######################################
check_nvidia_fabricmanager() {
  local gpu_index="${1:-0}"

  echo "Checking nvidia-fabricmanager service state..."
  systemctl status nvidia-fabricmanager --no-pager -l

  echo "Checking Nvidia NvSwitch fabric state..."
  # -A 2 keeps the two lines following each case-insensitive "Fabric" match.
  nvidia-smi -q -i "$gpu_index" | grep -i -A 2 Fabric
}
#######################################
# Download the Fabric Manager archive matching NVIDIA_DRIVER_VERSION, copy its
# files into system locations, and enable + restart the systemd service.
# Globals:   NVIDIA_DRIVER_VERSION (read)
# Arguments: none
# Outputs:   progress on stdout, failures on stderr
# Returns:   non-zero if the download, extraction, or copy of the required
#            files fails; otherwise the status of the final service restart
# Notes:     the caller's working directory is left unchanged; the archive and
#            extraction directory are created relative to it.
#######################################
install_nvidia_fabric_manager() {
  local archive_name="fabricmanager-linux-x86_64-${NVIDIA_DRIVER_VERSION}-archive"
  local tar_file="${archive_name}.tar.xz"
  local tar_url="https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/${tar_file}"
  local extract_dir="fabricmanager_installation"
  local lib_loc="/usr/lib/x86_64-linux-gnu"
  local doc_dir="/usr/share/doc/nvidia-fabricmanager"

  echo "Downloading NVIDIA Fabric Manager ${NVIDIA_DRIVER_VERSION}..."
  if ! wget "$tar_url" -O "$tar_file"; then
    echo "Failed to download ${tar_url}." >&2
    return 1
  fi
  mkdir -p "$extract_dir" || return 1
  if ! tar -xf "$tar_file" -C "$extract_dir"; then
    echo "Failed to extract ${tar_file}." >&2
    return 1
  fi

  # NOTE(review): only the library directory switches on aarch64; the archive
  # fetched above is always the linux-x86_64 build - confirm whether aarch64
  # hosts need a different archive.
  if [ "$(uname -m)" = 'aarch64' ]; then
    lib_loc="/usr/lib/aarch64-linux-gnu"
  fi

  echo "Starting NVIDIA Fabric Manager installation"
  if systemctl is-active --quiet nvidia-fabricmanager; then
    echo "Fabric Manager service is running, stopping it..."
    systemctl stop nvidia-fabricmanager
  else
    echo "Fabric Manager service is not running."
  fi

  # Copy from inside a subshell so the cd never leaks to the caller, and so a
  # failed cd can never cause files to be copied from the wrong directory.
  if ! (
    cd "$extract_dir/$archive_name" || exit 1
    cp lib/libnvfm.so.1 "$lib_loc" &&
      cp -P lib/libnvfm.so "$lib_loc" &&
      cp bin/nv-fabricmanager /usr/bin &&
      cp bin/nvswitch-audit /usr/bin &&
      cp systemd/nvidia-fabricmanager.service /lib/systemd/system &&
      mkdir -p /usr/share/nvidia/nvswitch &&
      cp share/nvidia/nvswitch/* /usr/share/nvidia/nvswitch/ &&
      cp etc/fabricmanager.cfg /usr/share/nvidia/nvswitch/ &&
      cp include/nv_fm_agent.h /usr/include &&
      cp include/nv_fm_types.h /usr/include ||
      exit 1
    # License files are not needed at runtime: warn instead of failing.
    if ! { mkdir -p "$doc_dir" && cp LICENSE third-party-notices.txt "$doc_dir"; }; then
      echo "Warning: could not install Fabric Manager license files." >&2
    fi
    exit 0
  ); then
    echo "Failed to install Fabric Manager files." >&2
    return 1
  fi

  # Reload first so systemd sees the freshly copied unit before enabling it.
  systemctl daemon-reload
  systemctl enable nvidia-fabricmanager
  echo "Fabric Manager installation completed."
  systemctl restart nvidia-fabricmanager
}
#######################################
# Install the NVIDIA driver and the CUDA toolkit from NVIDIA's runfiles unless
# an nvcc is already present.
# Globals:   NVIDIA_DRIVER_VERSION, CUDA_DRIVER_VERSION,
#            CUDA_MAJOR_MINOR_PATCH, CUDA_MAJOR_MINOR (all read)
# Arguments: none
# Outputs:   progress on stdout, failures on stderr
# Returns:   0 if CUDA was already present or was installed; 1 if a download
#            or an installer fails
# Notes:     runfiles are downloaded into the current working directory.
#######################################
install_cuda() {
  local toolkit_nvcc="/usr/local/cuda-${CUDA_MAJOR_MINOR}/bin/nvcc"
  local nvcc_bin=""
  local driver_runfile="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
  local driver_url="https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/${driver_runfile}"
  local cuda_runfile="cuda_${CUDA_MAJOR_MINOR_PATCH}_${CUDA_DRIVER_VERSION}_linux.run"
  local cuda_url="https://developer.download.nvidia.com/compute/cuda/${CUDA_MAJOR_MINOR_PATCH}/local_installers/${cuda_runfile}"

  # apt-get rather than apt: apt's command-line interface is not meant for scripts.
  apt-get update
  apt-get install -yq wget kmod </dev/null

  # nvcc may be installed without being on PATH (this script only records PATH
  # in a bashrc), so also look in the versioned toolkit directory.
  if command -v nvcc >/dev/null 2>&1; then
    nvcc_bin="nvcc"
  elif [ -x "$toolkit_nvcc" ]; then
    nvcc_bin="$toolkit_nvcc"
  fi
  if [ -n "$nvcc_bin" ]; then
    echo "CUDA is already installed."
    "$nvcc_bin" --version
    return 0
  fi

  echo "CUDA not found. Installing CUDA..."
  apt-get install -y build-essential dkms freeglut3 freeglut3-dev libxi-dev libxmu-dev
  apt-get install -y gcc-11 g++-11
  update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11

  # -O overwrites any stale copy; without it wget would save "<name>.1" and the
  # old file would be the one executed.
  if ! wget "$driver_url" -O "$driver_runfile"; then
    echo "Failed to download NVIDIA driver ${NVIDIA_DRIVER_VERSION}." >&2
    return 1
  fi
  chmod +x "$driver_runfile"
  if ! ./"$driver_runfile" -j"$(nproc)" -a --no-drm --dkms -s --kernel-module-build-directory=kernel-open; then
    echo "NVIDIA driver installation failed." >&2
    return 1
  fi

  if ! wget "$cuda_url" -O "$cuda_runfile"; then
    echo "Failed to download CUDA ${CUDA_MAJOR_MINOR_PATCH}." >&2
    return 1
  fi
  chmod +x "$cuda_runfile"
  if ! ./"$cuda_runfile" --toolkit --kernelobjects --silent --kernel-module-build-directory=kernel-open -- -j"$(nproc)"; then
    echo "CUDA installation failed." >&2
    return 1
  fi
  echo "CUDA installation completed."
}
#######################################
# Write a systemd unit for nvidia-persistenced, then enable and start it.
# Arguments: none
# Outputs:   the unit text (tee echoes it to stdout); write failure on stderr
# Returns:   1 if the unit file cannot be written, otherwise the status of
#            `systemctl start`
#######################################
setup_persistenced() {
  local unit_file="/etc/systemd/system/nvidia-persistenced.service"

  # Quoted delimiter: the unit text is written literally, with no expansion.
  if ! tee "$unit_file" <<'eof'
[Unit]
Description=NVIDIA Persistence Daemon
Wants=syslog.target
[Service]
Type=forking
ExecStart=/usr/bin/nvidia-persistenced --user root --verbose
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
[Install]
WantedBy=multi-user.target
eof
  then
    # Do not enable/start a unit that was never written.
    echo "Failed to write ${unit_file}." >&2
    return 1
  fi

  systemctl enable nvidia-persistenced
  systemctl start nvidia-persistenced
}
#######################################
# Install the cuda-keyring package, then nvidia-container-toolkit, and
# configure the docker and containerd runtimes with nvidia-ctk.
# Arguments: none
# Outputs:   progress from the invoked tools; failures on stderr
# Returns:   1 if the keyring download/installation or the toolkit install
#            fails, otherwise the status of the last nvidia-ctk call
# Notes:     the .deb is downloaded into the current working directory.
#######################################
install_nv_container_cli() {
  local keyring_deb="cuda-keyring_1.1-1_all.deb"
  local keyring_url="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${keyring_deb}"

  if ! wget "$keyring_url" -O "$keyring_deb"; then
    echo "Failed to download ${keyring_url}." >&2
    return 1
  fi
  if ! dpkg -i "$keyring_deb"; then
    echo "Failed to install ${keyring_deb}." >&2
    return 1
  fi

  # apt-get rather than apt: apt's command-line interface is not meant for scripts.
  apt-get update
  if ! apt-get install -yq nvidia-container-toolkit </dev/null; then
    echo "Failed to install nvidia-container-toolkit." >&2
    return 1
  fi

  # NOT NECESSARY FOR SLURM, only k8s/docker
  nvidia-ctk runtime configure --runtime=docker
  nvidia-ctk runtime configure --runtime=containerd
}
#######################################
# Run the provisioning steps in order.
# Arguments: none
# Returns:   1 if install_cuda fails (later steps are skipped), otherwise the
#            status of setup_persistenced. Failures of the peermem and Fabric
#            Manager steps are reported on stderr but do not stop the run.
#######################################
main() {
  # Every later step needs the driver/toolkit, so stop here on failure.
  if ! install_cuda; then
    echo "CUDA/driver installation failed; skipping remaining steps." >&2
    return 1
  fi
  setup_cuda_environment

  # possibly not necessary/will fail if using PCIE vs SXM interconnect. should work on DGX H100
  load_nvidia_peermem ||
    echo "Warning: nvidia-peermem setup failed; continuing." >&2

  # NOTE(review): the original comment here was cut off ("will only work");
  # presumably this step only works on NVSwitch-based systems - confirm.
  install_nvidia_fabric_manager ||
    echo "Warning: Fabric Manager setup failed; continuing." >&2

  # optional, accelerates commands which require driver invocations like basic nvidia-smi commands
  setup_persistenced
  # install_nv_container_cli
}
# Execute the main function, forwarding any command-line arguments.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment