Skip to content

Instantly share code, notes, and snippets.

@hoonsubin
Created April 28, 2026 14:25
Show Gist options
  • Select an option

  • Save hoonsubin/c90bbf36f87faadaecea393a593ff31b to your computer and use it in GitHub Desktop.

Select an option

Save hoonsubin/c90bbf36f87faadaecea393a593ff31b to your computer and use it in GitHub Desktop.
Checking AMD Vulkan and ROCm installation on Ubuntu Linux for gfx1150
#!/usr/bin/env python3
"""
GPU/ROCm/Vulkan system readiness check for gfx1150 (Radeon 890M) LXC inference.
Checks system installations and inference capabilities only.
No external dependencies — stdlib only. Run as root for full device access checks.
"""
import grp
import os
import pwd
import re
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
# ── ANSI terminal styling ─────────────────────────────────────────────────────
# SGR escape sequences used for all console output.
R = "\033[0m"       # reset
BOLD = "\033[1m"
DIM = "\033[2m"
GRN = "\033[92m"    # bright green
YLW = "\033[93m"    # bright yellow
RED = "\033[91m"    # bright red
CYN = "\033[96m"    # bright cyan

# Status glyphs printed in front of every check line.
PASS = GRN + "✔" + R
FAIL = RED + "✘" + R
WARN = YLW + "⚠" + R
INFO = DIM + "·" + R
# ── Expected values ────────────────────────────────────────────────────────────
# Identity constants for the gfx1150 (Radeon 890M) iGPU and the character-device
# major numbers its nodes are expected to carry.
EXPECTED = {
    "gfx_version": "11.5.0",  # value HSA_OVERRIDE_GFX_VERSION must be set to
    "chip_id": "0x150e",      # chip id compared against rocminfo output below
    "isa": "gfx1150",
    "drm_major": 226,         # expected major for /dev/dri/* nodes
    "kfd_major": 234,         # expected major for /dev/kfd
    "accel_major": 261,       # expected major for /dev/accel/* nodes
}
# Device node path -> (expected major, expected minor, owning group name, group gid).
# NOTE(review): the gids (993, 44) are assumed host-specific defaults — confirm
# against the actual /etc/group on the target system.
DEVICES = {
    "/dev/dri/renderD128": (EXPECTED["drm_major"], 128, "render", 993),
    "/dev/dri/card0": (EXPECTED["drm_major"], 0, "video", 44),
    "/dev/kfd": (EXPECTED["kfd_major"], 0, "render", 993),
    "/dev/accel/accel0": (EXPECTED["accel_major"], 0, "render", 993),
}
# ── Helpers ────────────────────────────────────────────────────────────────────
def run(cmd, timeout=10):
    """Execute *cmd* through the shell and capture its output.

    Returns (returncode, stdout, stderr) with both streams stripped of
    surrounding whitespace, or (-1, "", "TIMEOUT") when the command does
    not finish within *timeout* seconds.
    """
    try:
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "TIMEOUT"
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
def p(status, label, detail=""):
    """Print one check line: status icon, label, and an optional dimmed detail."""
    if detail:
        print(f" {status} {label} {DIM}{detail}{R}")
    else:
        print(f" {status} {label}")

def section(title):
    """Print a bold cyan section banner padded with dashes to a fixed width."""
    pad = '─' * (52 - len(title))
    print(f"\n{BOLD}{CYN}── {title} {pad}{R}")
# ── Checks ─────────────────────────────────────────────────────────────────────
# Running tally of every check as (passed, warned) pairs; summarized at the end.
results = []

def check(passed, warned, label, detail=""):
    """Record and print one check outcome.

    Icon selection: passed -> PASS, else warned -> WARN, else FAIL.
    The (passed, warned) pair is appended to the global `results` list.
    """
    if passed:
        icon = PASS
    elif warned:
        icon = WARN
    else:
        icon = FAIL
    p(icon, label, detail)
    results.append((passed, warned))
# 1. Environment variables required by the ROCm/HSA runtime on this iGPU.
section("Environment Variables")

gfx = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
detail = gfx or f"NOT SET — must be {EXPECTED['gfx_version']}"
check(gfx == EXPECTED["gfx_version"], False, "HSA_OVERRIDE_GFX_VERSION", detail)

sdma = os.environ.get("HSA_ENABLE_SDMA", "")
detail = sdma or "NOT SET — must be 0 (prevents iGPU DMA hangs)"
check(sdma == "0", False, "HSA_ENABLE_SDMA", detail)
# 2. Device nodes — each expected node must exist with the right major/minor
#    numbers, and this process should be able to open it.
section("Device Nodes")
process_gids = set(os.getgroups())  # supplementary groups of this process
is_root = os.getuid() == 0          # root bypasses the group-membership heuristic
for path, (exp_major, exp_minor, exp_group, exp_gid) in DEVICES.items():
    dev = Path(path)
    if not dev.exists():
        check(False, False, path, "missing — check LXC dev[n] config")
        continue
    st = os.stat(path)
    # Split the device number into the kernel major/minor pair.
    major = os.major(st.st_rdev)
    minor = os.minor(st.st_rdev)
    if major != exp_major:
        check(False, False, path, f"wrong major {major} (expected {exp_major})")
        continue
    minor_ok = (minor == exp_minor)
    # card0/minor=0 may be card1 if simpledrm is active — warn instead of fail
    if path.endswith("card0") and not minor_ok:
        check(False, True, path,
              f"minor={minor} not 0 — simpledrm may own card0; verify vendor on host")
    else:
        check(minor_ok, False, path, f"major={major} minor={minor}")
    # Access check — heuristic only: root, or membership in the expected gid.
    # NOTE(review): assumes the node is group-readable/writable by exp_gid; the
    # actual file mode/owner is not inspected here — confirm on the host.
    can_open = is_root or (exp_gid in process_gids)
    check(can_open, False,
          f" {path} openable by process",
          "" if can_open else f"not in {exp_group}(gid={exp_gid}) — run: usermod -aG {exp_group} <user>")
# 3. Group membership — both unix groups must exist and include this process.
section("Groups")
for grp_name in ("render", "video"):
    try:
        info = grp.getgrnam(grp_name)
    except KeyError:
        # Group absent entirely — device nodes cannot be group-shared.
        check(False, False, f"Group '{grp_name}'", "missing — run: groupadd render / groupadd video")
        continue
    check(True, False, f"Group '{grp_name}'", f"gid={info.gr_gid}")
    in_grp = is_root or (info.gr_gid in process_gids)
    hint = "" if in_grp else f"run: usermod -aG {grp_name} $(whoami) && relogin"
    check(in_grp, False, f" Current process in '{grp_name}'", hint)
# 4. Vulkan — toolchain present, loader runs, and the right driver sees the GPU.
section("Vulkan")
glslc = shutil.which("glslc")
check(bool(glslc), False,
      "glslc shader compiler",
      glslc or "missing — run: apt install glslang-tools (required to build llama.cpp)")
vk_bin = shutil.which("vulkaninfo")
check(bool(vk_bin), False, "vulkaninfo", vk_bin or "missing — apt install vulkan-tools")
if vk_bin:
    # --summary keeps the output small; stderr is discarded so loader warnings
    # do not pollute the text that gets parsed below.
    rc, out, err = run("vulkaninfo --summary 2>/dev/null", timeout=15)
    check(rc == 0 and bool(out), False, "vulkaninfo execution",
          "" if (rc == 0 and out) else (err or "no output"))
    if rc == 0 and out:
        # Driver — must be RADV, not AMDVLK
        drv = re.search(r"driverName\s*=\s*(\S+)", out)
        drv_name = drv.group(1) if drv else ""
        is_radv = "radv" in drv_name.lower()
        # A non-RADV driver is reported as a warning (warned = not is_radv).
        check(is_radv, not is_radv,
              "Vulkan driver is RADV (Mesa)",
              drv_name or "unknown — RADV required; AMDVLK causes llama.cpp instability")
        # Device identity — the ISA name is expected to appear in deviceName.
        dev_m = re.search(r"deviceName\s*=\s*(.+)", out)
        dev_name = dev_m.group(1).strip() if dev_m else ""
        is_1150 = EXPECTED["isa"].upper() in dev_name.upper()
        check(is_1150, False,
              f"GPU identity ({EXPECTED['isa']})",
              dev_name or "unknown")
        # API version — informational only; not scored as pass/warn/fail.
        api_m = re.search(r"apiVersion\s*=\s*(.+)", out)
        if api_m:
            p(INFO, "Vulkan API version", api_m.group(1).strip())
# 5. ROCm / HSA — optional HIP stack; the Vulkan path works without it, so a
#    missing rocminfo is only a warning.
section("ROCm / HSA")
rocminfo = shutil.which("rocminfo")
if not rocminfo:
    check(False, True, "rocminfo",
          "not installed — ROCm HIP unavailable (Vulkan path is fine without it)")
else:
    check(True, False, "rocminfo", rocminfo)
    # Generous timeout: agent enumeration can stall when /dev/kfd is flaky.
    rc, out, err = run("rocminfo", timeout=60)
    if rc == -1 and err == "TIMEOUT":
        check(False, True, "rocminfo execution",
              "timed out after 60s — GPU agent may be slow or /dev/kfd not accessible; retry manually")
    elif rc != 0 or not out:
        check(False, False, "rocminfo execution", err or "no output")
    else:
        check(True, False, "rocminfo execution")
    if rc == 0 and out:
        # ROCk / kfd
        rock_ok = "ROCk module is loaded" in out
        check(rock_ok, False, "ROCk module loaded (/dev/kfd accessible)",
              "" if rock_ok else "/dev/kfd not reachable from this process")
        # GPU agent with correct ISA
        isa_ok = EXPECTED["isa"] in out
        check(isa_ok, False, f"GPU agent ISA ({EXPECTED['isa']})",
              "" if isa_ok else "not found — passthrough or HSA_OVERRIDE_GFX_VERSION wrong")
        # Chip ID — captures the hex form from lines like "Chip ID: 5390(0x150e)".
        chip_m = re.search(r"Chip ID:\s*\d+\((0x[0-9a-fA-F]+)\)", out)
        if chip_m:
            chip = chip_m.group(1).lower()
            check(chip == EXPECTED["chip_id"], False,
                  f"Chip ID ({EXPECTED['chip_id']} = Radeon 890M)", chip)
        # APU unified memory
        apu_ok = "Memory Properties: APU" in out
        check(apu_ok, not apu_ok, "APU unified memory flag",
              "present" if apu_ok else "absent — GTT pool may not be exposed")
        # GPU memory pool size (want ≥ 16 GB = GTT is visible)
        # NOTE(review): assumes "Agent 2" is the GPU agent and that the first
        # "Size:" value after it is in KB — confirm against rocminfo output.
        agent2 = out.find("Agent 2")
        if agent2 != -1:
            sizes = re.findall(r"Size:\s*(\d+)\(", out[agent2:])
            if sizes:
                kb = int(sizes[0])
                gb = kb / (1024 ** 2)  # KB -> GB
                large = gb >= 16
                check(large, not large,
                      "GPU memory pool",
                      f"{gb:.1f} GB {'(GTT visible ✔)' if large else '(only VRAM? GTT may be missing)'}")
        # HSA version cross-check
        gfx_env = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
        check(gfx_env == EXPECTED["gfx_version"], False,
              "HSA_OVERRIDE_GFX_VERSION matches ISA",
              gfx_env or "NOT SET")
# 6. llama.cpp Vulkan build — locate the llama-server binary and verify the
#    Vulkan backend is actually linked/compiled into it.
section("llama.cpp (Vulkan build)")
candidates = [
    "/opt/llama.cpp/build/bin/llama-server",
    shutil.which("llama-server") or "",
]
# First existing candidate wins; None if nothing is found.
binary = next((c for c in candidates if c and Path(c).exists()), None)
if not binary:
    check(False, False, "llama-server binary",
          "not found — build: cmake -DGGML_VULKAN=ON && cmake --build build -j$(nproc)")
else:
    check(True, False, "llama-server binary", binary)
    check(os.access(binary, os.X_OK), False, " executable bit set")
    # FIX: quote the path before interpolating it into a shell command line.
    # An unquoted path containing spaces or shell metacharacters would break
    # the command (and is an injection hazard in general).
    qbin = shlex.quote(binary)
    _, ldd, _ = run(f"ldd {qbin} 2>/dev/null")
    has_vk = "libvulkan" in ldd
    check(has_vk, not has_vk, " linked against libvulkan",
          "" if has_vk else "rebuild with -DGGML_VULKAN=ON")
    # 'ggml.vulkan' is a regex: '.' matches the separator ('-' or '_').
    _, cnt, _ = run(f"strings {qbin} 2>/dev/null | grep -c 'ggml.vulkan'")
    compiled_in = cnt.isdigit() and int(cnt) > 0
    if not has_vk:  # only show strings check when ldd wasn't conclusive
        check(compiled_in, not compiled_in, " ggml-vulkan compiled in",
              f"{cnt} references found" if compiled_in else "0 — backend missing")
# ── Summary ────────────────────────────────────────────────────────────────────
# Tally the recorded (passed, warned) pairs and print a colored verdict line.
total = len(results)
passed = 0
warned = 0
for ok, wn in results:
    if ok:
        passed += 1
    elif wn:
        warned += 1
failed = total - passed - warned

print(f"\n{BOLD}{'─' * 58}{R}")
print(f" {GRN}{passed} passed{R} {YLW}{warned} warned{R} {RED}{failed} failed{R} "
      f"{DIM}({total} checks){R}")
if failed:
    print(f" {RED}{BOLD}Fix failures before running inference.{R}")
elif warned:
    print(f" {YLW}{BOLD}Review warnings before running inference.{R}")
else:
    print(f" {GRN}{BOLD}System is ready for Vulkan inference.{R}")
print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment