This is a from-scratch, copy/paste runbook for getting:
- Model: nvidia/Qwen3.5-397B-A17B-NVFP4
- Runtime: SGLang
- GPUs: all 4 GPUs (TP=4)
- Python: 3.12 via uv
- PyTorch: nightly cu130
- Build threading cap: 16
- OS: Ubuntu 24.04.4 LTS
- Kernel: 6.8.0-100-generic
- CPU: AMD Ryzen Threadripper PRO 9985WX 64-Cores (128 threads)
- RAM: 1.0 TiB
- GPUs: 4x NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
- GPU memory: 97887 MiB each
- Compute capability: 12.0
- Driver: 580.126.16
- CUDA toolkit: 13.0.88
- Python: 3.12.11
- Torch used here: 2.12.0.dev20260223+cu130
# --- One-time environment setup ----------------------------------------
# Create the workdir, clone SGLang, build a Python 3.12 uv venv, install
# SGLang (editable), then force PyTorch nightly cu130 wheels on top.
export WORKDIR=/path/to/workdir
mkdir -p "$WORKDIR"
cd "$WORKDIR"
# Clone once
[ -d sglang ] || git clone https://github.com/sgl-project/sglang.git
cd sglang
# Optional: ensure latest main
# git pull --ff-only
cd "$WORKDIR"
# Fresh Python 3.12 virtualenv managed by uv
uv venv --python 3.12 .venv
source .venv/bin/activate
# Install sglang (editable)
uv pip install -e "$WORKDIR/sglang/python"
# Force torch nightly cu130
# (--force-reinstall replaces whatever torch the sglang install pulled in;
# the package list continues on the next line of the runbook)
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu130 --upgrade --force-reinstall \
  torch torchvision torchaudio

These are the final required source patches on this machine:
- CMake policy + CUDA 13 cccl include + FA3 flag behavior in sgl-kernel/CMakeLists.txt
- FA3 import fallback in sgl-kernel/python/sgl_kernel/flash_attn.py
# Apply the source patches: activate the venv and run the inline Python
# patch script (below) from the sglang repo root.
export WORKDIR=/path/to/workdir
source "$WORKDIR/.venv/bin/activate"
cd "$WORKDIR/sglang"
python - <<'PY'
# Patch two SGLang source files in place (run from the sglang repo root):
#   1) sgl-kernel/CMakeLists.txt -- guard CMake policy settings, add the
#      CUDA 13 "cccl" include directory, and make the FA3 build section
#      respect the SGL_KERNEL_ENABLE_FA3 option.
#   2) sgl-kernel/python/sgl_kernel/flash_attn.py -- turn a failed FA3
#      import into flash_ops = None instead of raising ImportError.
# Every edit is a literal-string replace; an unmatched needle is a no-op,
# so re-running the script is safe.
from pathlib import Path
cmake = Path('sgl-kernel/CMakeLists.txt')
s = cmake.read_text()
# CMP0169/CMP0177 only exist in newer CMake; wrap them in POLICY guards so
# an older CMake does not fail on unknown policy IDs.
s = s.replace(
"cmake_policy(SET CMP0169 OLD)\ncmake_policy(SET CMP0177 NEW)\n",
"if(POLICY CMP0169)\n  cmake_policy(SET CMP0169 OLD)\nendif()\nif(POLICY CMP0177)\n  cmake_policy(SET CMP0177 NEW)\nendif()\n"
)
# Anchor: the existing include_directories() block we append after.
needle = """include_directories(
  SYSTEM ${TORCH_INCLUDE_DIRS}
  SYSTEM ${NCCL_INCLUDE_DIR}
  SYSTEM ${CUDAToolkit_INCLUDE_DIRS}
  ${PROJECT_SOURCE_DIR}/include
  ${PROJECT_SOURCE_DIR}/csrc
)
"""
insert = needle + """
# CUDA 13 places <cuda/...> headers under include/cccl.
# Add this include dir when present so third-party deps (e.g. mscclpp)
# can resolve headers like <cuda/atomic>.
list(GET CUDAToolkit_INCLUDE_DIRS 0 CUDA_INCLUDE_DIR_0)
if (EXISTS "${CUDA_INCLUDE_DIR_0}/cccl")
  include_directories(SYSTEM "${CUDA_INCLUDE_DIR_0}/cccl")
endif()
"""
# Insert only once: the "include/cccl" check makes a re-run idempotent.
if needle in s and "include/cccl" not in s:
    s = s.replace(needle, insert)
# Default the FA3 option to ON ...
s = s.replace(
'option(SGL_KERNEL_ENABLE_FA3 "Enable FA3" OFF)',
'option(SGL_KERNEL_ENABLE_FA3 "Enable FA3" ON)'
)
# ... and gate the CUDA>=12.4 FA3 block on the option, so passing
# -DSGL_KERNEL_ENABLE_FA3=OFF at build time actually disables it.
s = s.replace(
'if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4")',
'if ("${CUDA_VERSION}" VERSION_GREATER_EQUAL "12.4" AND SGL_KERNEL_ENABLE_FA3)'
)
cmake.write_text(s)
# Soften the FA3 import failure: downstream code can test flash_ops is None
# instead of the whole package dying on import.
fa = Path('sgl-kernel/python/sgl_kernel/flash_attn.py')
fs = fa.read_text()
fs = fs.replace(
"except:\n    raise ImportError(\n        \"Can not import FA3 in sgl_kernel. Please check your installation.\"\n    )\n",
"except:\n    flash_ops = None\n"
)
fa.write_text(fs)
print('Patched', cmake)
print('Patched', fa)
PY

export WORKDIR=/path/to/workdir
# --- Build the sgl-kernel wheel for this GPU (compute capability 12.0) --
source "$WORKDIR/.venv/bin/activate"
cd "$WORKDIR/sglang/sgl-kernel"
# Point the build at the CUDA 13.0 toolkit
export CUDA_HOME=/usr/local/cuda-13.0
# Target arch 12.0a (the "a" suffix selects arch-specific features;
# matches this machine's compute capability 12.0)
export TORCH_CUDA_ARCH_LIST=12.0a
# Cap build parallelism at 16 jobs (the runbook's stated threading cap)
export MAX_JOBS=16
export CMAKE_BUILD_PARALLEL_LEVEL=16
# FP4 kernels ON; FA3 and pre-SM90 code paths OFF for this build
export CMAKE_ARGS='-DSGL_KERNEL_COMPILE_THREADS=16 -DENABLE_BELOW_SM90=OFF -DSGL_KERNEL_ENABLE_FP4=ON -DSGL_KERNEL_ENABLE_FA3=OFF'
# Reusable out-of-tree build dir; --no-build-isolation uses the venv's torch
uv build --wheel -Cbuild-dir=build-sm120d . --no-build-isolation --verbose
uv pip install --force-reinstall dist/sgl_kernel-*.whl

Important on this model/hardware combo:
- Use --attention-backend triton.
- Pin NVFP4/MoE backends to avoid pathological decode (! / token-0 collapse):
  --moe-runner-backend flashinfer_cutlass
  --fp4-gemm-backend flashinfer_cudnn
# --- Launch the SGLang server across all 4 GPUs (TP=4) ------------------
# Attention/MoE/FP4-GEMM backends are pinned explicitly to avoid the
# decode corruption described above for this checkpoint on this stack.
export WORKDIR=/path/to/workdir
source "$WORKDIR/.venv/bin/activate"
cd "$WORKDIR/sglang"
export CUDA_VISIBLE_DEVICES=0,1,2,3
export OMP_NUM_THREADS=16
python -m sglang.launch_server \
  --model nvidia/Qwen3.5-397B-A17B-NVFP4 \
  --tensor-parallel-size 4 \
  --quantization modelopt_fp4 \
  --trust-remote-code \
  --attention-backend triton \
  --moe-runner-backend flashinfer_cutlass \
  --fp4-gemm-backend flashinfer_cudnn \
  --host 0.0.0.0 --port 30000

export WORKDIR=/path/to/workdir
# Quick health check against the running server (non-thinking mode).
source "$WORKDIR/.venv/bin/activate"
python - <<'PY'
"""Smoke-test the running SGLang server: read /model_info, then send one
deterministic non-thinking chat completion and print the model's reply."""
import json
import urllib.request

BASE = 'http://127.0.0.1:30000'


def _get_json(target, timeout):
    # Fetch a URL or prepared Request and decode the JSON response body.
    with urllib.request.urlopen(target, timeout=timeout) as resp:
        return json.loads(resp.read().decode())


info = _get_json(BASE + '/model_info', 30)
print('model:', info['model_path'])

# Greedy decode (temperature 0) with thinking disabled; tiny token budget.
payload = {
    'model': 'nvidia/Qwen3.5-397B-A17B-NVFP4',
    'messages': [{'role':'user','content':'Reply with exactly: OK'}],
    'max_tokens': 16,
    'temperature': 0.0,
    'chat_template_kwargs': {'enable_thinking': False},
}
chat_request = urllib.request.Request(
    BASE + '/v1/chat/completions',
    data=json.dumps(payload).encode('utf-8'),
    headers={'Content-Type':'application/json'},
)
reply = _get_json(chat_request, 120)
print('response:', reply['choices'][0]['message']['content'])
PY

Thinking-enabled check:
# Thinking-mode check against the running server.
export WORKDIR=/path/to/workdir
source "$WORKDIR/.venv/bin/activate"
python - <<'PY'
"""Thinking-enabled smoke test: same chat endpoint as the health check,
but with enable_thinking=True and a small arithmetic question."""
import json
import urllib.request

# Greedy decode; larger budget so the thinking trace has room to finish.
body = json.dumps({
    'model': 'nvidia/Qwen3.5-397B-A17B-NVFP4',
    'messages': [{'role':'user','content':'What is 17*19? Answer with one number at the end.'}],
    'max_tokens': 128,
    'temperature': 0.0,
    'chat_template_kwargs': {'enable_thinking': True},
}).encode('utf-8')

request = urllib.request.Request(
    'http://127.0.0.1:30000/v1/chat/completions',
    data=body,
    headers={'Content-Type':'application/json'},
)
with urllib.request.urlopen(request, timeout=120) as resp:
    answer = json.loads(resp.read().decode())
print('response:', answer['choices'][0]['message']['content'])
PY

If you see corruption (!!! spam / repeated ! / token id 0 loops), make sure you are not using auto MoE backend selection for this checkpoint on this stack. Pin:
  --moe-runner-backend flashinfer_cutlass
  --fp4-gemm-backend flashinfer_cudnn
Method:
- Input per request: exactly 1000 token IDs ([23066] * 1000)
- Generation target per request: 1000 tokens (max_new_tokens=1000, ignore_eos=true)
- Parallel requests tested: 1, 2, 4, 8, 16, 32
- Endpoint: POST /generate (streaming)
Benchmark execution note:
- The numbers below were measured with the exact method above (1000 input IDs, 1000 generated tokens, parallelism 1/2/4/8/16/32, streaming /generate).
- No repo-local benchmark file is required to interpret these results.
Decoded token throughput:
- Per-request decode tok/s = mean over requests of completion_tokens / (end_time - first_token_time)
- Aggregate decode tok/s = sum(completion_tokens) / (last_end_time - earliest_first_token_time)
- Latest run file: /tmp/nvfp4_fixed_bench_results.json (timestamp 2026-02-23)
| Parallel requests | Per-request decode tok/s | Aggregate decode tok/s |
|---|---|---|
| 1 | 67.76 | 67.76 |
| 2 | 55.94 | 111.56 |
| 4 | 55.69 | 219.59 |
| 8 | 54.02 | 418.93 |
| 16 | 48.92 | 747.14 |
| 32 | 45.33 | 1372.96 |
All benchmark requests completed with exactly 1000 output tokens.
This section is self-contained for readers who cannot access repo files.
Test definition:
- Question count: 50 (26 baseline + 24 harder reasoning)
- Categories (count): Math (10), Logic (3), Pattern (2), Date (3), General (6), Coding (2), Hard Math (2), Hard Algebra (3), Hard Probability (3), Hard Number Theory (4), Hard Discrete (1), Hard Arithmetic (1), Hard Geometry (2), Hard Combinatorics (3), Hard Fractions (1), Hard Linear Algebra (1), Hard Recurrence (1), Hard Percentages (1), Hard Statistics (1)
- Per-question prompt requires final line format: FINAL_ANSWER: ...
- Grading is strict regex matching on extracted final answer (case-insensitive)
- Answer extraction uses FINAL_ANSWER: when present; otherwise falls back to the last non-empty line
Sample prompts from the set:
- Compute 17 * 19. Answer with only the number.
- What day of the week was 2024-02-29? One word.
- Expected value of X^2 for a fair six-sided die X, as a simplest fraction a/b.
- How many nonnegative integer solutions to x+y+z=10? Only number.
Execution note:
- The results below were produced by running the 50-question definition above twice, changing only enable_thinking (false vs true) at max_tokens=8000.
How to enable/disable thinking mode in API calls:
- Disable thinking:
"chat_template_kwargs": {"enable_thinking": false}
- Enable thinking:
"chat_template_kwargs": {"enable_thinking": true}
Test settings used here:
- Endpoint: POST /v1/chat/completions
- Sampling: temperature=0.0, top_p=1.0
- Max output tokens per request: 8000
- Output extraction expects a final line: FINAL_ANSWER: ...
| Mode | Correct | Accuracy | Elapsed | Output JSON |
|---|---|---|---|---|
| Non-thinking | 49 / 50 | 98.0% | 191.3s (~3.2 min) | /tmp/nvfp4_smartness_eval_nonthinking_8k_50q.json |
| Thinking | 48 / 50 | 96.0% | 1623.5s (~27.1 min) | /tmp/nvfp4_smartness_eval_thinking_8k_50q.json |
Recommendation:
- Use non-thinking mode by default on this machine/checkpoint.
- Do not enable thinking mode for normal serving or benchmark runs unless you have a specific task that needs long chain-of-thought style generation.
Why:
- Accuracy was lower with thinking mode in this test (96% vs 98%).
- Runtime was much worse with thinking mode (1623.5s vs 191.3s, about 8.5x slower end-to-end).
- Thinking mode increases long generations and can reduce strict output-format compliance (for this eval, one case failed because no final FINAL_ANSWER line was emitted before long generation behavior).
- For throughput-sensitive or structured-answer workloads, this is a net negative in both speed and reliability.
Notes:
- One miss in non-thinking was a strict-format mismatch (9\pi vs regex expecting 9pi / 9π).
- Thinking mode had that same formatting miss plus one prompt where the model did not emit a final FINAL_ANSWER line before hitting long generation.
Reader comment: "Hey bro, in my case the model's reply is missing the opening tag (but there is a closing one) — have you encountered a similar problem? How do you solve it?" (The literal tag names were lost in rendering; presumably the <think>/</think> markers of thinking mode.)