Skip to content

Instantly share code, notes, and snippets.

@hoonsubin
Created April 28, 2026 14:25
Show Gist options
  • Select an option

  • Save hoonsubin/c90bbf36f87faadaecea393a593ff31b to your computer and use it in GitHub Desktop.

Select an option

Save hoonsubin/c90bbf36f87faadaecea393a593ff31b to your computer and use it in GitHub Desktop.
Checking AMD Vulkan and ROCm installation on Ubuntu Linux for gfx1150
#!/usr/bin/env python3
"""
GPU/ROCm/Vulkan system readiness check for gfx1150 (Radeon 890M) LXC inference.
Checks system installations and inference capabilities only.
No external dependencies — stdlib only. Run as root for full device access checks.
"""
import grp
import os
import pwd
import re
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
# ── ANSI terminal styling ─────────────────────────────────────────────────────
# SGR escape sequences used for all console output.
R = "\033[0m"       # reset
BOLD = "\033[1m"
DIM = "\033[2m"
GRN = "\033[92m"    # bright green
YLW = "\033[93m"    # bright yellow
RED = "\033[91m"    # bright red
CYN = "\033[96m"    # bright cyan

# Status glyphs printed in front of every check line.
PASS = GRN + "✔" + R
FAIL = RED + "✘" + R
WARN = YLW + "⚠" + R
INFO = DIM + "·" + R
# ── Expected values ────────────────────────────────────────────────────────────
# Identity constants for the gfx1150 (Radeon 890M) iGPU and the character-device
# major numbers its nodes are expected to carry.
EXPECTED = {
    "gfx_version": "11.5.0",  # value HSA_OVERRIDE_GFX_VERSION must be set to
    "chip_id": "0x150e",      # chip id compared against rocminfo output below
    "isa": "gfx1150",
    "drm_major": 226,         # expected major for /dev/dri/* nodes
    "kfd_major": 234,         # expected major for /dev/kfd
    "accel_major": 261,       # expected major for /dev/accel/* nodes
}
# Device node path -> (expected major, expected minor, owning group name, group gid).
# NOTE(review): the gids (993, 44) are assumed host-specific defaults — confirm
# against the actual /etc/group on the target system.
DEVICES = {
    "/dev/dri/renderD128": (EXPECTED["drm_major"], 128, "render", 993),
    "/dev/dri/card0": (EXPECTED["drm_major"], 0, "video", 44),
    "/dev/kfd": (EXPECTED["kfd_major"], 0, "render", 993),
    "/dev/accel/accel0": (EXPECTED["accel_major"], 0, "render", 993),
}
# ── Helpers ────────────────────────────────────────────────────────────────────
def run(cmd, timeout=10):
    """Execute *cmd* through the shell and capture its output.

    Returns (returncode, stdout, stderr) with both streams stripped of
    surrounding whitespace, or (-1, "", "TIMEOUT") when the command does
    not finish within *timeout* seconds.
    """
    try:
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "TIMEOUT"
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
def p(status, label, detail=""):
    """Print one check line: status icon, label, and an optional dimmed detail."""
    if detail:
        print(f" {status} {label} {DIM}{detail}{R}")
    else:
        print(f" {status} {label}")

def section(title):
    """Print a bold cyan section banner padded with dashes to a fixed width."""
    pad = '─' * (52 - len(title))
    print(f"\n{BOLD}{CYN}── {title} {pad}{R}")
# ── Checks ─────────────────────────────────────────────────────────────────────
# Running tally of every check as (passed, warned) pairs; summarized at the end.
results = []

def check(passed, warned, label, detail=""):
    """Record and print one check outcome.

    Icon selection: passed -> PASS, else warned -> WARN, else FAIL.
    The (passed, warned) pair is appended to the global `results` list.
    """
    if passed:
        icon = PASS
    elif warned:
        icon = WARN
    else:
        icon = FAIL
    p(icon, label, detail)
    results.append((passed, warned))
# 1. Environment variables required by the ROCm/HSA runtime on this iGPU.
section("Environment Variables")

gfx = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
detail = gfx or f"NOT SET — must be {EXPECTED['gfx_version']}"
check(gfx == EXPECTED["gfx_version"], False, "HSA_OVERRIDE_GFX_VERSION", detail)

sdma = os.environ.get("HSA_ENABLE_SDMA", "")
detail = sdma or "NOT SET — must be 0 (prevents iGPU DMA hangs)"
check(sdma == "0", False, "HSA_ENABLE_SDMA", detail)
# 2. Device nodes — each expected node must exist with the right major/minor
#    numbers, and this process should be able to open it.
section("Device Nodes")
process_gids = set(os.getgroups())  # supplementary groups of this process
is_root = os.getuid() == 0          # root bypasses the group-membership heuristic
for path, (exp_major, exp_minor, exp_group, exp_gid) in DEVICES.items():
    dev = Path(path)
    if not dev.exists():
        check(False, False, path, "missing — check LXC dev[n] config")
        continue
    st = os.stat(path)
    # Split the device number into the kernel major/minor pair.
    major = os.major(st.st_rdev)
    minor = os.minor(st.st_rdev)
    if major != exp_major:
        check(False, False, path, f"wrong major {major} (expected {exp_major})")
        continue
    minor_ok = (minor == exp_minor)
    # card0/minor=0 may be card1 if simpledrm is active — warn instead of fail
    if path.endswith("card0") and not minor_ok:
        check(False, True, path,
              f"minor={minor} not 0 — simpledrm may own card0; verify vendor on host")
    else:
        check(minor_ok, False, path, f"major={major} minor={minor}")
    # Access check — heuristic only: root, or membership in the expected gid.
    # NOTE(review): assumes the node is group-readable/writable by exp_gid; the
    # actual file mode/owner is not inspected here — confirm on the host.
    can_open = is_root or (exp_gid in process_gids)
    check(can_open, False,
          f" {path} openable by process",
          "" if can_open else f"not in {exp_group}(gid={exp_gid}) — run: usermod -aG {exp_group} <user>")
# 3. Group membership — both unix groups must exist and include this process.
section("Groups")
for grp_name in ("render", "video"):
    try:
        info = grp.getgrnam(grp_name)
    except KeyError:
        # Group absent entirely — device nodes cannot be group-shared.
        check(False, False, f"Group '{grp_name}'", "missing — run: groupadd render / groupadd video")
        continue
    check(True, False, f"Group '{grp_name}'", f"gid={info.gr_gid}")
    in_grp = is_root or (info.gr_gid in process_gids)
    hint = "" if in_grp else f"run: usermod -aG {grp_name} $(whoami) && relogin"
    check(in_grp, False, f" Current process in '{grp_name}'", hint)
# 4. Vulkan — toolchain present, loader runs, and the right driver sees the GPU.
section("Vulkan")
glslc = shutil.which("glslc")
check(bool(glslc), False,
      "glslc shader compiler",
      glslc or "missing — run: apt install glslang-tools (required to build llama.cpp)")
vk_bin = shutil.which("vulkaninfo")
check(bool(vk_bin), False, "vulkaninfo", vk_bin or "missing — apt install vulkan-tools")
if vk_bin:
    # --summary keeps the output small; stderr is discarded so loader warnings
    # do not pollute the text that gets parsed below.
    rc, out, err = run("vulkaninfo --summary 2>/dev/null", timeout=15)
    check(rc == 0 and bool(out), False, "vulkaninfo execution",
          "" if (rc == 0 and out) else (err or "no output"))
    if rc == 0 and out:
        # Driver — must be RADV, not AMDVLK
        drv = re.search(r"driverName\s*=\s*(\S+)", out)
        drv_name = drv.group(1) if drv else ""
        is_radv = "radv" in drv_name.lower()
        # A non-RADV driver is reported as a warning (warned = not is_radv).
        check(is_radv, not is_radv,
              "Vulkan driver is RADV (Mesa)",
              drv_name or "unknown — RADV required; AMDVLK causes llama.cpp instability")
        # Device identity — the ISA name is expected to appear in deviceName.
        dev_m = re.search(r"deviceName\s*=\s*(.+)", out)
        dev_name = dev_m.group(1).strip() if dev_m else ""
        is_1150 = EXPECTED["isa"].upper() in dev_name.upper()
        check(is_1150, False,
              f"GPU identity ({EXPECTED['isa']})",
              dev_name or "unknown")
        # API version — informational only; not scored as pass/warn/fail.
        api_m = re.search(r"apiVersion\s*=\s*(.+)", out)
        if api_m:
            p(INFO, "Vulkan API version", api_m.group(1).strip())
# 5. ROCm / HSA — optional HIP stack; the Vulkan path works without it, so a
#    missing rocminfo is only a warning.
section("ROCm / HSA")
rocminfo = shutil.which("rocminfo")
if not rocminfo:
    check(False, True, "rocminfo",
          "not installed — ROCm HIP unavailable (Vulkan path is fine without it)")
else:
    check(True, False, "rocminfo", rocminfo)
    # Generous timeout: agent enumeration can stall when /dev/kfd is flaky.
    rc, out, err = run("rocminfo", timeout=60)
    if rc == -1 and err == "TIMEOUT":
        check(False, True, "rocminfo execution",
              "timed out after 60s — GPU agent may be slow or /dev/kfd not accessible; retry manually")
    elif rc != 0 or not out:
        check(False, False, "rocminfo execution", err or "no output")
    else:
        check(True, False, "rocminfo execution")
    if rc == 0 and out:
        # ROCk / kfd
        rock_ok = "ROCk module is loaded" in out
        check(rock_ok, False, "ROCk module loaded (/dev/kfd accessible)",
              "" if rock_ok else "/dev/kfd not reachable from this process")
        # GPU agent with correct ISA
        isa_ok = EXPECTED["isa"] in out
        check(isa_ok, False, f"GPU agent ISA ({EXPECTED['isa']})",
              "" if isa_ok else "not found — passthrough or HSA_OVERRIDE_GFX_VERSION wrong")
        # Chip ID — captures the hex form from lines like "Chip ID: 5390(0x150e)".
        chip_m = re.search(r"Chip ID:\s*\d+\((0x[0-9a-fA-F]+)\)", out)
        if chip_m:
            chip = chip_m.group(1).lower()
            check(chip == EXPECTED["chip_id"], False,
                  f"Chip ID ({EXPECTED['chip_id']} = Radeon 890M)", chip)
        # APU unified memory
        apu_ok = "Memory Properties: APU" in out
        check(apu_ok, not apu_ok, "APU unified memory flag",
              "present" if apu_ok else "absent — GTT pool may not be exposed")
        # GPU memory pool size (want ≥ 16 GB = GTT is visible)
        # NOTE(review): assumes "Agent 2" is the GPU agent and that the first
        # "Size:" value after it is in KB — confirm against rocminfo output.
        agent2 = out.find("Agent 2")
        if agent2 != -1:
            sizes = re.findall(r"Size:\s*(\d+)\(", out[agent2:])
            if sizes:
                kb = int(sizes[0])
                gb = kb / (1024 ** 2)  # KB -> GB
                large = gb >= 16
                check(large, not large,
                      "GPU memory pool",
                      f"{gb:.1f} GB {'(GTT visible ✔)' if large else '(only VRAM? GTT may be missing)'}")
        # HSA version cross-check
        gfx_env = os.environ.get("HSA_OVERRIDE_GFX_VERSION", "")
        check(gfx_env == EXPECTED["gfx_version"], False,
              "HSA_OVERRIDE_GFX_VERSION matches ISA",
              gfx_env or "NOT SET")
# 6. llama.cpp Vulkan build — locate the llama-server binary and verify the
#    Vulkan backend is actually linked/compiled into it.
section("llama.cpp (Vulkan build)")
candidates = [
    "/opt/llama.cpp/build/bin/llama-server",
    shutil.which("llama-server") or "",
]
# First existing candidate wins; None if nothing is found.
binary = next((c for c in candidates if c and Path(c).exists()), None)
if not binary:
    check(False, False, "llama-server binary",
          "not found — build: cmake -DGGML_VULKAN=ON && cmake --build build -j$(nproc)")
else:
    check(True, False, "llama-server binary", binary)
    check(os.access(binary, os.X_OK), False, " executable bit set")
    # FIX: quote the path before interpolating it into a shell command line.
    # An unquoted path containing spaces or shell metacharacters would break
    # the command (and is an injection hazard in general).
    qbin = shlex.quote(binary)
    _, ldd, _ = run(f"ldd {qbin} 2>/dev/null")
    has_vk = "libvulkan" in ldd
    check(has_vk, not has_vk, " linked against libvulkan",
          "" if has_vk else "rebuild with -DGGML_VULKAN=ON")
    # 'ggml.vulkan' is a regex: '.' matches the separator ('-' or '_').
    _, cnt, _ = run(f"strings {qbin} 2>/dev/null | grep -c 'ggml.vulkan'")
    compiled_in = cnt.isdigit() and int(cnt) > 0
    if not has_vk:  # only show strings check when ldd wasn't conclusive
        check(compiled_in, not compiled_in, " ggml-vulkan compiled in",
              f"{cnt} references found" if compiled_in else "0 — backend missing")
# ── Summary ────────────────────────────────────────────────────────────────────
# Tally the recorded (passed, warned) pairs and print a colored verdict line.
total = len(results)
passed = 0
warned = 0
for ok, wn in results:
    if ok:
        passed += 1
    elif wn:
        warned += 1
failed = total - passed - warned

print(f"\n{BOLD}{'─' * 58}{R}")
print(f" {GRN}{passed} passed{R} {YLW}{warned} warned{R} {RED}{failed} failed{R} "
      f"{DIM}({total} checks){R}")
if failed:
    print(f" {RED}{BOLD}Fix failures before running inference.{R}")
elif warned:
    print(f" {YLW}{BOLD}Review warnings before running inference.{R}")
else:
    print(f" {GRN}{BOLD}System is ready for Vulkan inference.{R}")
print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment