@jacobtomlinson
Last active July 3, 2025 17:10
Example of using SSH and nvidia-smi to show GPU utilization on a compute cluster

GPU Utilization CLI demo

A fun little CLI tool to gather and display GPU utilization for nodes in a cluster.


Inspired by this LinkedIn post.

Usage

python gpu_util.py [hostname] [hostname] [etc] ...
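For example, to watch three nodes (the hostnames below are placeholders; any machines you can reach over SSH with nvidia-smi installed will do):

python gpu_util.py gpu-node-01 gpu-node-02 gpu-node-03

The script shells out to ssh once per host on every refresh, so it assumes non-interactive (e.g. key-based) SSH access to each node and nvidia-smi on the remote PATH.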
#!/usr/bin/env python3
import subprocess
import argparse
import sys
import xml.etree.ElementTree as etree
import time
import os
APP_WIDTH = 34
GPUS_PER_ROW = 5
def print_centered(text: str):
    # Center a line of blue text within the fixed app width
    print(f"\033[34m{' '*((APP_WIDTH - len(text)) // 2)}{text}\033[0m")


def remote_nvidia_smi(hostname: str) -> str:
    # Run `nvidia-smi -x -q` on the remote host over SSH and return the XML report
    return subprocess.check_output(["ssh", hostname, "nvidia-smi", "-x", "-q"]).decode('utf-8')


def extract_gpu_utilization(xml_string):
    # Yield the model name and utilization percentage for each GPU in the XML report
    for gpu in etree.fromstring(xml_string).findall('gpu'):
        yield {
            'name': gpu.find('product_name').text,
            'gpu_util': int(gpu.find('utilization').find('gpu_util').text.replace(' %', '')),
        }
def print_gpu_utilization(hosts_data):
    all_gpus = []
    total_util = 0
    valid_gpus = 0

    # Clear the screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # Calculate total utilization and valid GPUs
    for utilization_data in hosts_data:
        for gpu in utilization_data:
            try:
                total_util += gpu['gpu_util']
                valid_gpus += 1
                all_gpus.append(f"\033[32m[{gpu['gpu_util']:3d}%]\033[0m")
            except (ValueError, AttributeError):
                all_gpus.append("[ N/A]")

    # Display header
    total_gpu_count = len(all_gpus)
    print("-"*APP_WIDTH)
    print_centered("AI Compute Cluster")

    # Get the GPU model name
    gpu_models = set(gpu['name'] for utilization_data in hosts_data
                     for gpu in utilization_data
                     if gpu['name'] and gpu['name'] != 'N/A')
    gpu_model = gpu_models.pop() if len(gpu_models) == 1 else "Mixed GPUs"
    print_centered(f"{total_gpu_count} GPUs • {gpu_model} ⚡️")
    print_centered("GPU Utilization")

    # Display utilization data in grid format (5 GPUs per row)
    print("-"*APP_WIDTH)
    for i in range(0, len(all_gpus), GPUS_PER_ROW):
        row = all_gpus[i:i + GPUS_PER_ROW]
        print(" ".join(row))

    # Calculate and display average
    print("-"*APP_WIDTH)
    # Guard against division by zero if no GPU reported a numeric utilization
    avg_util = total_util / valid_gpus if valid_gpus else 0.0
    print_centered(f"Average GPU Utilization: \033[32m{avg_util:.1f}%\033[0m")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("hostnames", nargs="+")
    args = parser.parse_args()
    while True:
        # Get nvidia-smi data for all hostnames using list comprehension
        nvidia_smi_data_list = [remote_nvidia_smi(hostname) for hostname in args.hostnames]
        # Extract utilization data for all hosts
        utilization_data = [list(extract_gpu_utilization(smi_data)) for smi_data in nvidia_smi_data_list]
        # Print the utilization data for all hosts
        print_gpu_utilization(utilization_data)
        # Wait for 1 second before next update
        time.sleep(1)
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(0)
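If you want to sanity-check the XML parsing without SSH access to a real cluster, you can feed extract_gpu_utilization a small hand-written fragment shaped like nvidia-smi -x -q output. This is just a sketch: it assumes the script above is saved as gpu_util.py next to it, and the GPU model name and values in the fragment are made up for illustration (real reports contain many more fields).

# sanity_check.py -- assumes gpu_util.py (the script above) sits alongside this file
from gpu_util import extract_gpu_utilization

# Minimal, hand-written stand-in for `ssh <host> nvidia-smi -x -q` output
sample_xml = """
<nvidia_smi_log>
  <gpu id="00000000:01:00.0">
    <product_name>NVIDIA A100-SXM4-80GB</product_name>
    <utilization>
      <gpu_util>87 %</gpu_util>
    </utilization>
  </gpu>
</nvidia_smi_log>
"""

print(list(extract_gpu_utilization(sample_xml)))
# Expected: [{'name': 'NVIDIA A100-SXM4-80GB', 'gpu_util': 87}]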