Created
June 16, 2026 16:18
-
-
Save potto007/0fcfc08bef6c6ed7f941ac6801c0998c to your computer and use it in GitHub Desktop.
Convenience script to run llama-diffusion-server, the binary added in https://github.com/potto007/llama.cpp/commit/73b7ca4a5d94c35978f822b531f2a82c47d446b0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Launch the OpenAI-compatible DiffusionGemma HTTP server (llama-diffusion-server).
# DiffusionGemma can't be served by the llama-server router (non-autoregressive decode), so this runs as its
# own process on its own port.
#
# Usage: [VAR=value ...] ./run-diffusion-server.sh [extra server args...]
# Override any default via env, e.g. PORT=9000 ./run-diffusion-server.sh
#
# Environment (all optional):
#   LLAMA_CPP  llama.cpp checkout used to derive the default BIN path
#   BIN        path to the llama-diffusion-server executable
#   MODEL      path to the GGUF model file
#   HOST/PORT  listen address and port
#   NGL        value passed to -ngl
#   CTX        value passed to -c (context tokens)
#   FA         flash-attn switch: 0 = off, anything else = on
#   ALIAS      value passed to -a (model id reported by /v1/models)
set -euo pipefail

LLAMA_CPP="${LLAMA_CPP:-$HOME/src/llama.cpp}"
BIN="${BIN:-$LLAMA_CPP/build/bin/llama-diffusion-server}"
MODEL="${MODEL:-$HOME/.models/diffusiongemma-26B-A4B-it-Q4_K_M.gguf}"
HOST="${HOST:-127.0.0.1}"
PORT="${PORT:-8088}"
NGL="${NGL:-99}"
# ctx tokens (0 = auto-size). 20000 fits the 32GB card with the output-cap fix
# (~16GB weights + ~9GB activations); the old logits-buffer wall capped this at ~8k.
CTX="${CTX:-20000}"
# flash-attn: REQUIRED for large contexts (without it a non-causal run needs an
# fp32 [n_head,N,N] scores buffer that explodes past ~12k tokens). 1=on, 0=off.
FA="${FA:-1}"
# model id reported by /v1/models; matches the opencode provider
ALIAS="${ALIAS:-diffusiongemma-26b-a4b}"

# Distinguish "never built" from "present but not runnable" so the hint to
# rebuild is only shown when rebuilding would actually help.
if [[ ! -e "$BIN" ]]; then
  echo "error: $BIN not found. Build it with:" >&2
  echo " cmake --build $LLAMA_CPP/build -j --target llama-diffusion-server" >&2
  exit 1
elif [[ ! -f "$BIN" || ! -x "$BIN" ]]; then
  echo "error: $BIN exists but is not an executable file (check BIN=... and its permissions)" >&2
  exit 1
fi

if [[ ! -f "$MODEL" ]]; then
  echo "error: model not found: $MODEL (set MODEL=...)" >&2
  exit 1
fi

# Assemble the command in a single array that is never empty. Expanding a
# separate, possibly-empty flag array would abort with "unbound variable"
# under `set -u` on bash older than 4.4 (e.g. the stock macOS bash 3.2).
server_cmd=("$BIN" -m "$MODEL" --host "$HOST" --port "$PORT" -ngl "$NGL" -c "$CTX" -a "$ALIAS")
if [[ "$FA" != "0" ]]; then
  # NOTE(review): newer upstream llama.cpp builds expect a value after -fa
  # (on|off|auto); confirm this fork's llama-diffusion-server accepts a bare -fa.
  server_cmd+=(-fa)
fi
# extra args are passed straight through, e.g. --show-reasoning or --raw
server_cmd+=("$@")

echo "starting llama-diffusion-server on http://$HOST:$PORT (model=$ALIAS, ngl=$NGL, ctx=$CTX, fa=$FA)"
echo "note: needs ~16GB VRAM for weights + compute; stop/idle the rlm router first if the 32GB card is full."

# exec replaces this shell so signals and the exit status belong to the server itself.
exec "${server_cmd[@]}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment