Skip to content

Instantly share code, notes, and snippets.

@schmmd
Created November 20, 2019 19:08

Revisions

  1. schmmd created this gist Nov 20, 2019.
    61 changes: 61 additions & 0 deletions slurm-gpu-stats.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    #!/usr/bin/env python

    import os
    import re
    import subprocess
    import socket
    import sys

    def pids_of_jid(jid):
        """Return the process IDs that ``sstat`` reports for a Slurm job.

        Args:
            jid: Slurm job id, passed verbatim to ``sstat -j``.

        Returns:
            A list of PID strings. The list is empty when ``sstat`` prints
            nothing (for example, the job has no running step to report).
        """
        result = subprocess.run(
            ["sstat", "-p", "--format=PID", "-j", jid, "--noheader"],
            stdout=subprocess.PIPE,
        )
        pids = []
        # Parsable (-p) output ends every row with "|" and lists the PIDs of a
        # row separated by commas. Parse row by row so that more than one row
        # (one per reported step) does not leave "|\n" embedded in a PID.
        for row in result.stdout.decode("utf-8").splitlines():
            for pid in row.strip().strip("|").split(","):
                pid = pid.strip()
                # Drop empty tokens: an empty PID would turn into the bogus
                # path "/proc//environ" further down the pipeline.
                if pid:
                    pids.append(pid)
        return pids

    def devices_of_pid(pid):
        """Return the GPU indices listed in a process's CUDA_VISIBLE_DEVICES.

        The value is read from ``/proc/<pid>/environ``, i.e. the environment
        the process was started with.

        Args:
            pid: Process id (string or int) used to build the /proc path.

        Returns:
            A list of non-negative integer device indices. The list is empty
            when the variable is absent, when the process no longer exists,
            or when no entry is a plain index.

        Raises:
            PermissionError: If the environ file exists but is not readable
                by the current user.
        """
        prefix = "CUDA_VISIBLE_DEVICES="
        try:
            # errors="replace": environment blocks are arbitrary bytes and must
            # not abort the scan with a UnicodeDecodeError.
            with open(f"/proc/{pid}/environ", errors="replace") as f:
                entries = f.read().split("\0")
        except (FileNotFoundError, ProcessLookupError):
            # The process exited between being listed and being inspected.
            return []

        for entry in entries:
            if entry.startswith(prefix):
                tokens = [token.strip() for token in entry[len(prefix):].split(",")]
                # Keep only plain indices. An empty value, placeholders such as
                # "NoDevFiles", GPU UUIDs and "-1" cannot be used to index the
                # per-GPU statistics and previously crashed int() (or, for
                # "-1", silently selected the last GPU).
                return [int(token) for token in tokens if re.fullmatch("[0-9]+", token)]

        return []

    def devices_of_jid(jid):
        """Return the set of GPU indices visible to any process of a job.

        Args:
            jid: Slurm job id whose processes are inspected.

        Returns:
            The union (a ``set`` of ints) of the CUDA_VISIBLE_DEVICES entries
            of every PID reported for the job. The per-process values are
            merged, not compared against each other.
        """
        cuda_visible_devices = set()
        # Look the PIDs up from the job id that was passed in, instead of
        # relying on a module-level ``pids`` variable set by the caller.
        for pid in pids_of_jid(jid):
            cuda_visible_devices.update(devices_of_pid(pid))

        return cuda_visible_devices

    def get_jobs():
        """List the Slurm jobs that squeue reports for this host.

        Returns:
            A list with one entry per squeue output line; each entry is the
            line split on commas (the requested format is ``%A,%u``).
        """
        command = ["squeue", "--format=%A,%u", "--noheader", "-w", socket.gethostname()]
        completed = subprocess.run(command, stdout=subprocess.PIPE)

        jobs = []
        for row in completed.stdout.decode("utf-8").splitlines():
            jobs.append(row.split(","))
        return jobs

    def gpu_utilization():
        """Query nvidia-smi for per-GPU load and memory usage.

        Returns:
            A list with one ``[gpu_utilization, memory_used_percent]`` pair per
            line of nvidia-smi output, in output order. The first element is
            an int, the second a float computed as 100 * used / total.
        """
        def digits_only(text):
            # Drop units and punctuation such as " %" or " MiB" before parsing.
            return int(re.sub('[^0-9]', "", text))

        query = "--query-gpu=utilization.gpu,memory.used,memory.total"
        completed = subprocess.run(
            ["nvidia-smi", query, "--format=csv,noheader"],
            stdout=subprocess.PIPE,
        )

        stats = []
        for row in completed.stdout.decode("utf-8").splitlines():
            fields = [field.strip() for field in row.split(",")]
            load = digits_only(fields[0])
            used = digits_only(fields[1])
            total = digits_only(fields[2])
            stats.append([load, 100 * used / total])
        return stats

    # Script body: print the average GPU load and memory use of every Slurm job
    # running on this host.

    # SUDO_UID is only present when the script was started through sudo.
    if 'SUDO_UID' not in os.environ:
        print("This program requires super user.")
        sys.exit(1)

    gpu_stats = gpu_utilization()

    for jid, user in get_jobs():
        # Assigned at module level so the name stays visible to helper
        # functions that look it up as a global.
        pids = pids_of_jid(jid)
        cuda_visible_devices = devices_of_jid(jid)
        if not cuda_visible_devices:
            # A job without any visible GPU would otherwise divide by zero in
            # the averages below.
            print("{} ({}) -> no GPU devices found".format(jid, user))
            continue

        device_count = len(cuda_visible_devices)
        # Average the per-GPU figures over the devices the job can see.
        processor = sum(gpu_stats[device][0] for device in cuda_visible_devices) / device_count
        memory = int(sum(gpu_stats[device][1] for device in cuda_visible_devices) / device_count)
        print("{} ({}) -> {} (proc={}%, memused={}%)".format(jid, user, cuda_visible_devices, processor, memory))