Created
April 28, 2026 14:25
-
-
Save hoonsubin/c90bbf36f87faadaecea393a593ff31b to your computer and use it in GitHub Desktop.
Checking AMD Vulkan and ROCm installation on Ubuntu Linux for gfx1150
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| GPU/ROCm/Vulkan system readiness check for gfx1150 (Radeon 890M) LXC inference. | |
| Checks system installations and inference capabilities only. | |
| No external dependencies — stdlib only. Run as root for full device access checks. | |
| """ | |
import grp
import os
import pwd
import re
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
# ── ANSI ──────────────────────────────────────────────────────────────────────
# Terminal escape codes: R resets attributes, BOLD/DIM set intensity,
# GRN/YLW/RED/CYN set foreground color. PASS/FAIL/WARN/INFO are the
# pre-colored status glyphs printed in front of every check result line.
R = "\033[0m"; BOLD = "\033[1m"; DIM = "\033[2m"
GRN = "\033[92m"; YLW = "\033[93m"; RED = "\033[91m"; CYN = "\033[96m"
PASS = f"{GRN}✔{R}"; FAIL = f"{RED}✘{R}"; WARN = f"{YLW}⚠{R}"; INFO = f"{DIM}·{R}"
# ── Expected values ───────────────────────────────────────────────────────────
# Reference values for the gfx1150 (Radeon 890M) target that the checks below
# compare against.
EXPECTED = {
    "gfx_version": "11.5.0",  # required value of HSA_OVERRIDE_GFX_VERSION
    "chip_id": "0x150e",      # hex chip id rocminfo should report for the 890M
    "isa": "gfx1150",         # ISA name expected in vulkaninfo/rocminfo output
    "drm_major": 226,         # char-device major number for /dev/dri nodes
    "kfd_major": 234,         # char-device major number for /dev/kfd
    "accel_major": 261,       # char-device major number for /dev/accel nodes
}
# Device nodes to verify, mapped to
# (expected major, expected minor, owning group name, expected gid of group).
DEVICES = {
    "/dev/dri/renderD128": (EXPECTED["drm_major"], 128, "render", 993),
    "/dev/dri/card0": (EXPECTED["drm_major"], 0, "video", 44),
    "/dev/kfd": (EXPECTED["kfd_major"], 0, "render", 993),
    "/dev/accel/accel0": (EXPECTED["accel_major"], 0, "render", 993),
}
# ── Helpers ───────────────────────────────────────────────────────────────────
def run(cmd, timeout=10):
    """Execute *cmd* through the shell and return (returncode, stdout, stderr).

    Both output streams are whitespace-stripped. A timeout is reported as
    (-1, "", "TIMEOUT") rather than raising, so callers can branch on it.
    """
    try:
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "TIMEOUT"
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
def p(status, label, detail=""):
    """Print one result line: status glyph, label, optional dimmed detail."""
    suffix = f" {DIM}{detail}{R}" if detail else ""
    print(f" {status} {label}{suffix}")
def section(title):
    """Print a bold cyan section header, padded with ─ to a fixed width."""
    rule = "─" * (52 - len(title))
    print(f"\n{BOLD}{CYN}── {title} {rule}{R}")
# ── Checks ────────────────────────────────────────────────────────────────────
# Every executed check appends one (passed, warned) pair used by the summary.
results = []  # (passed: bool, warn: bool)
def check(passed, warned, label, detail=""):
    """Record a check outcome in `results` and print its status line.

    A passing check prints PASS; a failing one prints WARN when *warned* is
    truthy (non-fatal) and FAIL otherwise.
    """
    if passed:
        icon = PASS
    elif warned:
        icon = WARN
    else:
        icon = FAIL
    p(icon, label, detail)
    results.append((passed, warned))
# 1. Environment variables required for stable HSA/ROCm behavior on gfx1150.
section("Environment Variables")
# The override must match the target ISA version exactly.
gfx = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
check(gfx == EXPECTED["gfx_version"], False,
      "HSA_OVERRIDE_GFX_VERSION",
      gfx if gfx else f"NOT SET — must be {EXPECTED['gfx_version']}")
# Disabling SDMA works around DMA hangs on this iGPU.
sdma = os.environ.get("HSA_ENABLE_SDMA", "")
check(sdma == "0", False,
      "HSA_ENABLE_SDMA",
      sdma if sdma else "NOT SET — must be 0 (prevents iGPU DMA hangs)")
# 2. Device nodes — verify each GPU char device exists with the expected
#    major/minor numbers and that this process should be able to open it.
section("Device Nodes")
process_gids = set(os.getgroups())  # supplementary group ids of this process
is_root = os.getuid() == 0          # root bypasses group-based access checks
for path, (exp_major, exp_minor, exp_group, exp_gid) in DEVICES.items():
    dev = Path(path)
    if not dev.exists():
        check(False, False, path, "missing — check LXC dev[n] config")
        continue
    st = os.stat(path)
    major = os.major(st.st_rdev)
    minor = os.minor(st.st_rdev)
    if major != exp_major:
        # Wrong device class entirely; skip the remaining checks for this node.
        check(False, False, path, f"wrong major {major} (expected {exp_major})")
        continue
    minor_ok = (minor == exp_minor)
    # card0/minor=0 may be card1 if simpledrm is active — warn instead of fail
    if path.endswith("card0") and not minor_ok:
        check(False, True, path,
              f"minor={minor} not 0 — simpledrm may own card0; verify vendor on host")
    else:
        check(minor_ok, False, path, f"major={major} minor={minor}")
    # Access check: membership in the expected gid (or root) is treated as
    # sufficient — no actual open() is attempted.
    can_open = is_root or (exp_gid in process_gids)
    check(can_open, False,
          f" {path} openable by process",
          "" if can_open else f"not in {exp_group}(gid={exp_gid}) — run: usermod -aG {exp_group} <user>")
# 3. Group membership — 'render' and 'video' must exist, and this process
#    should be in each (root is accepted as an implicit member).
section("Groups")
for grp_name in ("render", "video"):
    try:
        info = grp.getgrnam(grp_name)
        check(True, False, f"Group '{grp_name}'", f"gid={info.gr_gid}")
        in_grp = is_root or (info.gr_gid in process_gids)
        check(in_grp, False, f" Current process in '{grp_name}'",
              "" if in_grp else f"run: usermod -aG {grp_name} $(whoami) && relogin")
    except KeyError:
        # grp.getgrnam raises KeyError when the group does not exist at all.
        check(False, False, f"Group '{grp_name}'", "missing — run: groupadd render / groupadd video")
# 4. Vulkan — toolchain present, vulkaninfo runs, and the reported driver and
#    GPU identity match the expected RADV + gfx1150 combination.
section("Vulkan")
glslc = shutil.which("glslc")
check(bool(glslc), False,
      "glslc shader compiler",
      glslc or "missing — run: apt install glslang-tools (required to build llama.cpp)")
vk_bin = shutil.which("vulkaninfo")
check(bool(vk_bin), False, "vulkaninfo", vk_bin or "missing — apt install vulkan-tools")
if vk_bin:
    rc, out, err = run("vulkaninfo --summary 2>/dev/null", timeout=15)
    check(rc == 0 and bool(out), False, "vulkaninfo execution",
          "" if (rc == 0 and out) else (err or "no output"))
    if rc == 0 and out:
        # Driver — must be RADV, not AMDVLK
        drv = re.search(r"driverName\s*=\s*(\S+)", out)
        drv_name = drv.group(1) if drv else ""
        is_radv = "radv" in drv_name.lower()
        check(is_radv, not is_radv,
              "Vulkan driver is RADV (Mesa)",
              drv_name or "unknown — RADV required; AMDVLK causes llama.cpp instability")
        # Device identity — expect the ISA name to appear in the device string.
        dev_m = re.search(r"deviceName\s*=\s*(.+)", out)
        dev_name = dev_m.group(1).strip() if dev_m else ""
        is_1150 = EXPECTED["isa"].upper() in dev_name.upper()
        check(is_1150, False,
              f"GPU identity ({EXPECTED['isa']})",
              dev_name or "unknown")
        # API version — informational only; not counted in the pass/fail tally.
        api_m = re.search(r"apiVersion\s*=\s*(.+)", out)
        if api_m:
            p(INFO, "Vulkan API version", api_m.group(1).strip())
# 5. ROCm / HSA — optional HIP path. A missing rocminfo is only a warning,
#    because the Vulkan inference path does not depend on ROCm.
section("ROCm / HSA")
rocminfo = shutil.which("rocminfo")
if not rocminfo:
    check(False, True, "rocminfo",
          "not installed — ROCm HIP unavailable (Vulkan path is fine without it)")
else:
    check(True, False, "rocminfo", rocminfo)
    rc, out, err = run("rocminfo", timeout=60)
    if rc == -1 and err == "TIMEOUT":
        check(False, True, "rocminfo execution",
              "timed out after 60s — GPU agent may be slow or /dev/kfd not accessible; retry manually")
    elif rc != 0 or not out:
        check(False, False, "rocminfo execution", err or "no output")
    else:
        check(True, False, "rocminfo execution")
    if rc == 0 and out:
        # ROCk / kfd — rocminfo prints this banner when /dev/kfd is usable.
        rock_ok = "ROCk module is loaded" in out
        check(rock_ok, False, "ROCk module loaded (/dev/kfd accessible)",
              "" if rock_ok else "/dev/kfd not reachable from this process")
        # GPU agent with correct ISA
        isa_ok = EXPECTED["isa"] in out
        check(isa_ok, False, f"GPU agent ISA ({EXPECTED['isa']})",
              "" if isa_ok else "not found — passthrough or HSA_OVERRIDE_GFX_VERSION wrong")
        # Chip ID — rocminfo prints "Chip ID: <dec>(<hex>)"; compare the hex part.
        chip_m = re.search(r"Chip ID:\s*\d+\((0x[0-9a-fA-F]+)\)", out)
        if chip_m:
            chip = chip_m.group(1).lower()
            check(chip == EXPECTED["chip_id"], False,
                  f"Chip ID ({EXPECTED['chip_id']} = Radeon 890M)", chip)
        # APU unified memory
        apu_ok = "Memory Properties: APU" in out
        check(apu_ok, not apu_ok, "APU unified memory flag",
              "present" if apu_ok else "absent — GTT pool may not be exposed")
        # GPU memory pool size (want ≥ 16 GB = GTT is visible).
        # NOTE(review): assumes "Agent 2" is the GPU agent and that the first
        # Size entry after it is the relevant pool, in KB — confirm on target.
        agent2 = out.find("Agent 2")
        if agent2 != -1:
            sizes = re.findall(r"Size:\s*(\d+)\(", out[agent2:])
            if sizes:
                kb = int(sizes[0])
                gb = kb / (1024 ** 2)  # KB -> GB
                large = gb >= 16
                check(large, not large,
                      "GPU memory pool",
                      f"{gb:.1f} GB {'(GTT visible ✔)' if large else '(only VRAM? GTT may be missing)'}")
        # HSA version cross-check — repeat the env check next to the ISA info.
        gfx_env = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
        check(gfx_env == EXPECTED["gfx_version"], False,
              "HSA_OVERRIDE_GFX_VERSION matches ISA",
              gfx_env or "NOT SET")
# 6. llama.cpp (Vulkan build) — locate llama-server and confirm the Vulkan
#    backend is actually linked/compiled into the binary.
section("llama.cpp (Vulkan build)")
candidates = [
    "/opt/llama.cpp/build/bin/llama-server",
    shutil.which("llama-server") or "",
]
binary = next((c for c in candidates if c and Path(c).exists()), None)
if not binary:
    check(False, False, "llama-server binary",
          "not found — build: cmake -DGGML_VULKAN=ON && cmake --build build -j$(nproc)")
else:
    check(True, False, "llama-server binary", binary)
    check(os.access(binary, os.X_OK), False, " executable bit set")
    # run() goes through the shell, so quote the path to survive spaces and
    # shell metacharacters in the binary location.
    qbin = shlex.quote(binary)
    _, ldd, _ = run(f"ldd {qbin} 2>/dev/null")
    has_vk = "libvulkan" in ldd
    check(has_vk, not has_vk, " linked against libvulkan",
          "" if has_vk else "rebuild with -DGGML_VULKAN=ON")
    if not has_vk:  # only run the strings fallback when ldd wasn't conclusive
        _, cnt, _ = run(f"strings {qbin} 2>/dev/null | grep -c 'ggml.vulkan'")
        compiled_in = cnt.isdigit() and int(cnt) > 0
        check(compiled_in, not compiled_in, " ggml-vulkan compiled in",
              f"{cnt} references found" if compiled_in else "0 — backend missing")
# ── Summary ───────────────────────────────────────────────────────────────────
# Tally the recorded (passed, warned) pairs and print a colored verdict line.
total = len(results)
passed = sum(1 for ok, _ in results if ok)
warned = sum(1 for ok, wn in results if wn and not ok)
failed = total - passed - warned
print(f"\n{BOLD}{'─' * 58}{R}")
print(f" {GRN}{passed} passed{R} {YLW}{warned} warned{R} {RED}{failed} failed{R} "
      f"{DIM}({total} checks){R}")
if failed:
    print(f" {RED}{BOLD}Fix failures before running inference.{R}")
elif warned:
    print(f" {YLW}{BOLD}Review warnings before running inference.{R}")
else:
    print(f" {GRN}{BOLD}System is ready for Vulkan inference.{R}")
print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment