A fun little CLI tool to gather and display GPU utilization for nodes in a cluster.
Inspired by this LinkedIn post.
python gpu_util.py <hostname> [<hostname> ...]
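The script runs nvidia-smi -x -q on each node over SSH, so it assumes non-interactive (e.g. key-based) SSH access to every host. For example, with three hypothetical hostnames:

python gpu_util.py gpu-node-01 gpu-node-02 gpu-node-03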
#!/usr/bin/env python3
import subprocess
import argparse
import sys
import xml.etree.ElementTree as etree
import time
import os

APP_WIDTH = 34
GPUS_PER_ROW = 5


def print_centered(text: str):
    # Center text within APP_WIDTH and print it in blue
    print(f"\033[34m{' '*((APP_WIDTH - len(text)) // 2)}{text}\033[0m")


def remote_nvidia_smi(hostname: str) -> str:
    # Run nvidia-smi on the remote host over SSH and return its XML output
    return subprocess.check_output(["ssh", hostname, "nvidia-smi", "-x", "-q"]).decode('utf-8')
def extract_gpu_utilization(xml_string):
    # nvidia-smi -x -q emits one <gpu> element per device; gpu_util is a string such as "42 %"
    for gpu in etree.fromstring(xml_string).findall('gpu'):
        yield {
            'name': gpu.find('product_name').text,
            'gpu_util': gpu.find('utilization').find('gpu_util').text,
        }
def print_gpu_utilization(hosts_data):
    all_gpus = []
    total_util = 0
    valid_gpus = 0

    # Clear the screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # Tally utilization, skipping GPUs that report a non-numeric value
    for utilization_data in hosts_data:
        for gpu in utilization_data:
            try:
                util = int(gpu['gpu_util'].replace(' %', ''))
                total_util += util
                valid_gpus += 1
                all_gpus.append(f"\033[32m[{util:3d}%]\033[0m")
            except (ValueError, AttributeError):
                # e.g. "N/A" readings or missing fields
                all_gpus.append("[ N/A]")
    # Display header
    total_gpu_count = len(all_gpus)
    print("-"*APP_WIDTH)
    print_centered("AI Compute Cluster")

    # Get the GPU model name
    gpu_models = set(gpu['name'] for utilization_data in hosts_data
                     for gpu in utilization_data
                     if gpu['name'] and gpu['name'] != 'N/A')
    gpu_model = gpu_models.pop() if len(gpu_models) == 1 else "Mixed GPUs"
    print_centered(f"{total_gpu_count} GPUs • {gpu_model} ⚡️")
    print_centered("GPU Utilization")

    # Display utilization data in grid format (GPUS_PER_ROW GPUs per row)
    print("-"*APP_WIDTH)
    for i in range(0, len(all_gpus), GPUS_PER_ROW):
        row = all_gpus[i:i + GPUS_PER_ROW]
        print(" ".join(row))

    # Calculate and display average, guarding against an empty or all-N/A cluster
    print("-"*APP_WIDTH)
    avg_util = total_util / valid_gpus if valid_gpus else 0.0
    print_centered(f"Average GPU Utilization: \033[32m{avg_util:.1f}%\033[0m")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("hostnames", nargs="+")
    args = parser.parse_args()

    while True:
        # Get nvidia-smi XML for every hostname
        nvidia_smi_data_list = [remote_nvidia_smi(hostname) for hostname in args.hostnames]

        # Extract utilization data for all hosts
        utilization_data = [list(extract_gpu_utilization(smi_data)) for smi_data in nvidia_smi_data_list]

        # Print the utilization data for all hosts
        print_gpu_utilization(utilization_data)

        # Wait 1 second before the next update
        time.sleep(1)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(0)
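A quick standalone way to see what the parser consumes is to feed extract_gpu_utilization a hand-written XML snippet. The sample below is abridged and uses made-up values; real nvidia-smi -x -q output contains many more fields per <gpu>.

# Abridged, hand-written example of the XML shape the parser expects
sample_xml = """
<nvidia_smi_log>
  <gpu id="00000000:01:00.0">
    <product_name>NVIDIA H100 80GB HBM3</product_name>
    <utilization>
      <gpu_util>87 %</gpu_util>
    </utilization>
  </gpu>
</nvidia_smi_log>
"""

print(list(extract_gpu_utilization(sample_xml)))
# [{'name': 'NVIDIA H100 80GB HBM3', 'gpu_util': '87 %'}]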