potto007 · June 16, 2026 16:18
diff --git a/run-diffusion-server.sh b/run-diffusion-server.sh
 #!/usr/bin/env bash
 # Launch the OpenAI-compatible DiffusionGemma HTTP server (llama-diffusion-server).
 # DiffusionGemma can't be served by the llama-server router (non-autoregressive decode), so this runs as its
 # own process on its own port. Override any default via env, e.g.  PORT=9000 ./run-diffusion-server.sh
 #
 # Environment overrides:
 #   LLAMA_CPP  llama.cpp checkout; used for the default BIN path and the build hint
 #   BIN        server executable to exec
 #   MODEL      GGUF model file, passed as -m
 #   HOST/PORT  listen address, passed as --host / --port
 #   NGL        passed as -ngl
 #   CTX        passed as -c
 #   FA         "0" omits -fa; any other value adds it
 #   ALIAS      passed as -a
 # Any command-line arguments are appended verbatim after the generated flags.
 # Exits 1 if BIN is not an executable file or MODEL is not a regular file.
 set -euo pipefail

 LLAMA_CPP="${LLAMA_CPP:-$HOME/src/llama.cpp}"
 BIN="${BIN:-$LLAMA_CPP/build/bin/llama-diffusion-server}"
 MODEL="${MODEL:-$HOME/.models/diffusiongemma-26B-A4B-it-Q4_K_M.gguf}"
 HOST="${HOST:-127.0.0.1}"
 PORT="${PORT:-8088}"
 NGL="${NGL:-99}"
 CTX="${CTX:-20000}"     # ctx tokens (0 = auto-size). 20000 fits the 32GB card with the output-cap fix
                         # (~16GB weights + ~9GB activations); the old logits-buffer wall capped this at ~8k.
 FA="${FA:-1}"           # flash-attn: REQUIRED for large contexts (without it a non-causal run needs an
                         # fp32 [n_head,N,N] scores buffer that explodes past ~12k tokens). 1=on, 0=off.
 ALIAS="${ALIAS:-diffusiongemma-26b-a4b}"   # model id reported by /v1/models; matches the opencode provider

 # -x fails both when the file is missing and when it exists without the execute bit,
 # so the message names both possibilities.
 if [[ ! -x "$BIN" ]]; then
    echo "error: $BIN not found or not executable. Build it with:" >&2
    echo "  cmake --build $LLAMA_CPP/build -j --target llama-diffusion-server" >&2
    exit 1
 fi
 if [[ ! -f "$MODEL" ]]; then
    echo "error: model not found: $MODEL (set MODEL=...)" >&2
    exit 1
 fi

 # Optional flash-attention flag; the array stays empty when FA=0.
 fa_args=()
 if [[ "$FA" != "0" ]]; then
    fa_args+=(-fa)
 fi
 echo "starting llama-diffusion-server on http://$HOST:$PORT  (model=$ALIAS, ngl=$NGL, ctx=$CTX, fa=$FA)"
 echo "note: needs ~16GB VRAM for weights + compute; stop/idle the rlm router first if the 32GB card is full."
 # extra args are passed straight through, e.g. --show-reasoning or --raw
 # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; a bare "${fa_args[@]}" trips
 # `set -u` ("unbound variable") on Bash older than 4.4 when FA=0.
 exec "$BIN" -m "$MODEL" --host "$HOST" --port "$PORT" -ngl "$NGL" -c "$CTX" -a "$ALIAS" ${fa_args[@]+"${fa_args[@]}"} "$@"
	#!/usr/bin/env bash
	# Launch the OpenAI-compatible DiffusionGemma HTTP server (llama-diffusion-server).
	# DiffusionGemma can't be served by the llama-server router (non-autoregressive decode), so this runs as its
	# own process on its own port. Override any default via env, e.g. PORT=9000 ./run-diffusion-server.sh
	#
	# Every setting below can be overridden from the environment; positional arguments
	# are forwarded unchanged to the server after the generated flags.
	# Exits 1 if BIN is not an executable file or MODEL is not a regular file.
	set -euo pipefail

	LLAMA_CPP="${LLAMA_CPP:-$HOME/src/llama.cpp}"
	BIN="${BIN:-$LLAMA_CPP/build/bin/llama-diffusion-server}"
	MODEL="${MODEL:-$HOME/.models/diffusiongemma-26B-A4B-it-Q4_K_M.gguf}"
	HOST="${HOST:-127.0.0.1}"
	PORT="${PORT:-8088}"
	NGL="${NGL:-99}"
	CTX="${CTX:-20000}" # ctx tokens (0 = auto-size). 20000 fits the 32GB card with the output-cap fix
	# (~16GB weights + ~9GB activations); the old logits-buffer wall capped this at ~8k.
	FA="${FA:-1}" # flash-attn: REQUIRED for large contexts (without it a non-causal run needs an
	# fp32 [n_head,N,N] scores buffer that explodes past ~12k tokens). 1=on, 0=off.
	ALIAS="${ALIAS:-diffusiongemma-26b-a4b}" # model id reported by /v1/models; matches the opencode provider

	# The -x test is false for a missing file and for one lacking the execute bit,
	# so the message mentions both.
	if [[ ! -x "$BIN" ]]; then
		echo "error: $BIN not found or not executable. Build it with:" >&2
		echo " cmake --build $LLAMA_CPP/build -j --target llama-diffusion-server" >&2
		exit 1
	fi
	if [[ ! -f "$MODEL" ]]; then
		echo "error: model not found: $MODEL (set MODEL=...)" >&2
		exit 1
	fi

	# Build the full command in one array that is never empty. Expanding a separate,
	# possibly-empty flag array under `set -u` aborts with "unbound variable" on Bash
	# older than 4.4 when FA=0.
	server_cmd=("$BIN" -m "$MODEL" --host "$HOST" --port "$PORT" -ngl "$NGL" -c "$CTX" -a "$ALIAS")
	if [[ "$FA" != "0" ]]; then
		server_cmd+=(-fa)
	fi
	echo "starting llama-diffusion-server on http://$HOST:$PORT (model=$ALIAS, ngl=$NGL, ctx=$CTX, fa=$FA)"
	echo "note: needs ~16GB VRAM for weights + compute; stop/idle the rlm router first if the 32GB card is full."
	# extra args are passed straight through, e.g. --show-reasoning or --raw
	exec "${server_cmd[@]}" "$@"
No results found