Archmonger · June 10, 2026 14:22
diff --git a/Sparkrun-MiniMax-M2.5-230B-MXFP4.yaml b/Sparkrun-MiniMax-M2.5-230B-MXFP4.yaml
 # Sparkrun recipe: MiniMax-M2.5 REAP 139B-A10B (MXFP4_MOE GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    MiniMax is the largest model physically capable of fitting on a single Spark.
    Note: It does run slower than Qwen due to lack of MTP.
    Achieves ~18 tk/s.
  model_dtype: MXFP4
  # NOTE(review): the three architecture values below are identical to the
  # Qwen3.5 recipes -- confirm them against the MiniMax-M2.5 model config.
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  # Parameter count taken from the model name (139B-A10B).
  model_params: 139B

 name: exdysa/MiniMax-M2.5-REAP-139B-A10B-GGUF-MXFP4_MOE
 runtime: llama-cpp
 # NOTE(review): the Qwen3.5 recipes select the quant as `<repo>:<QUANT>`;
 # confirm this hyphenated form resolves to the intended GGUF.
 model: exdysa/MiniMax-M2.5-REAP-139B-A10B-GGUF-MXFP4_MOE
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'q8_0'
  cache_type_v: 'q8_0'
  cache_ram: 1024
  cache_reuse: 128
  context_size: 196608
  context_checkpoints: 10
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'ngram-cache'

 env:
  TZ: America/Los_Angeles
diff --git a/Sparkrun-Qwen3.5-122B-A10B-MTP-MXFP4.yaml b/Sparkrun-Qwen3.5-122B-A10B-MTP-MXFP4.yaml
 # Sparkrun recipe: Qwen3.5 122B-A10B MTP (MXFP4_MOE GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    Maximized speed variant of Qwen3.5 122B for 1 user -> 1 spark.
    Strict focus on TTFT and token generation speed, at the cost
    of some intelligence.
    Achieves ~40 tk/s.
  model_dtype: MXFP4
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  model_params: 122B

 name: unsloth/Qwen3.5-122B-A10B-MTP-GGUF-MXFP4_MOE
 runtime: llama-cpp
 model: unsloth/Qwen3.5-122B-A10B-MTP-GGUF:MXFP4_MOE
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --spec-draft-n-max {spec_draft_n_max} \
    --spec-draft-n-min {spec_draft_n_min} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'f16'
  cache_type_v: 'f16'
  cache_ram: 2048
  cache_reuse: 128
  context_size: 262144
  context_checkpoints: 24
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'draft-mtp'
  spec_draft_n_max: 3
  # NOTE(review): 0.6 is fractional, but the flag it feeds
  # (--spec-draft-n-min) reads like a token count -- confirm that a
  # probability-threshold flag was not intended.
  spec_draft_n_min: 0.6

 env:
  TZ: America/Los_Angeles
diff --git a/Sparkrun-Qwen3.5-122B-A10B-MTP-Q5-K-M.yaml b/Sparkrun-Qwen3.5-122B-A10B-MTP-Q5-K-M.yaml
 # Sparkrun recipe: Qwen3.5 122B-A10B MTP (Q5_K_M GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    Maximized intelligence variant of Qwen3.5 122B for 1 user -> 1 spark.
    Strict focus on using the least lobotomized quant possible that will
    fit on a spark, alongside Qwen's full context size. While it is possible
    to fit smarter quants on a Spark, you would need to sacrifice context size.
    A strong attempt was made to retain TTFT and token generation speed via
    optimized flags.
    This recipe pushes the DGX Spark platform to the limits, and just *barely*
    does not cause OOM. My spark's idle RAM usage is 3 GB with no models loaded.
    If your idle RAM usage is higher than mine, you might hit OOM.
    Achieves ~32 tk/s.
  model_dtype: Q5_K_M
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  model_params: 122B

 name: unsloth/Qwen3.5-122B-A10B-MTP-GGUF-Q5_K_M
 runtime: llama-cpp
 model: unsloth/Qwen3.5-122B-A10B-MTP-GGUF:Q5_K_M
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 # The web-UI flag is spelled --no-ui, as in the MiniMax and MXFP4 recipes.
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --spec-draft-n-max {spec_draft_n_max} \
    --spec-draft-n-min {spec_draft_n_min} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'q8_0'
  cache_type_v: 'q8_0'
  cache_ram: 1024
  cache_reuse: 128
  context_size: 262144
  context_checkpoints: 12
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'draft-mtp'
  spec_draft_n_max: 3
  # NOTE(review): 0.6 is fractional, but the flag it feeds
  # (--spec-draft-n-min) reads like a token count -- confirm that a
  # probability-threshold flag was not intended.
  spec_draft_n_min: 0.6

 env:
  TZ: America/Los_Angeles
 # Sparkrun recipe: MiniMax-M2.5 REAP 139B-A10B (MXFP4_MOE GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    MiniMax is the largest model physically capable of fitting on a single Spark.
    Note: It does run slower than Qwen due to lack of MTP.
    Achieves ~18 tk/s.
  model_dtype: MXFP4
  # NOTE(review): the three architecture values below are identical to the
  # Qwen3.5 recipes -- confirm them against the MiniMax-M2.5 model config.
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  # Parameter count taken from the model name (139B-A10B).
  model_params: 139B

 name: exdysa/MiniMax-M2.5-REAP-139B-A10B-GGUF-MXFP4_MOE
 runtime: llama-cpp
 # NOTE(review): the Qwen3.5 recipes select the quant as `<repo>:<QUANT>`;
 # confirm this hyphenated form resolves to the intended GGUF.
 model: exdysa/MiniMax-M2.5-REAP-139B-A10B-GGUF-MXFP4_MOE
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'q8_0'
  cache_type_v: 'q8_0'
  cache_ram: 1024
  cache_reuse: 128
  context_size: 196608
  context_checkpoints: 10
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'ngram-cache'

 env:
  TZ: America/Los_Angeles
 # Sparkrun recipe: Qwen3.5 122B-A10B MTP (MXFP4_MOE GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    Maximized speed variant of Qwen3.5 122B for 1 user -> 1 spark.
    Strict focus on TTFT and token generation speed, at the cost
    of some intelligence.
    Achieves ~40 tk/s.
  model_dtype: MXFP4
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  model_params: 122B

 name: unsloth/Qwen3.5-122B-A10B-MTP-GGUF-MXFP4_MOE
 runtime: llama-cpp
 model: unsloth/Qwen3.5-122B-A10B-MTP-GGUF:MXFP4_MOE
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --spec-draft-n-max {spec_draft_n_max} \
    --spec-draft-n-min {spec_draft_n_min} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'f16'
  cache_type_v: 'f16'
  cache_ram: 2048
  cache_reuse: 128
  context_size: 262144
  context_checkpoints: 24
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'draft-mtp'
  spec_draft_n_max: 3
  # NOTE(review): 0.6 is fractional, but the flag it feeds
  # (--spec-draft-n-min) reads like a token count -- confirm that a
  # probability-threshold flag was not intended.
  spec_draft_n_min: 0.6

 env:
  TZ: America/Los_Angeles
 # Sparkrun recipe: Qwen3.5 122B-A10B MTP (Q5_K_M GGUF) served by
 # llama-server on a single node (solo_only, max_nodes: 1).
 metadata:
  description: |
    Maximized intelligence variant of Qwen3.5 122B for 1 user -> 1 spark.
    Strict focus on using the least lobotomized quant possible that will
    fit on a spark, alongside Qwen's full context size. While it is possible
    to fit smarter quants on a Spark, you would need to sacrifice context size.
    A strong attempt was made to retain TTFT and token generation speed via
    optimized flags.
    This recipe pushes the DGX Spark platform to the limits, and just *barely*
    does not cause OOM. My spark's idle RAM usage is 3 GB with no models loaded.
    If your idle RAM usage is higher than mine, you might hit OOM.
    Achieves ~32 tk/s.
  model_dtype: Q5_K_M
  num_kv_heads: 2
  head_dim: 256
  num_layers: 48
  model_params: 122B

 name: unsloth/Qwen3.5-122B-A10B-MTP-GGUF-Q5_K_M
 runtime: llama-cpp
 model: unsloth/Qwen3.5-122B-A10B-MTP-GGUF:Q5_K_M
 recipe_version: '1'
 solo_only: true
 max_nodes: 1
 container: scitrera/dgx-spark-llama-cpp:latest

 # Each {placeholder} below has a matching key under `defaults`
 # ({model} corresponds to the top-level `model` key).
 # The web-UI flag is spelled --no-ui, as in the MiniMax and MXFP4 recipes.
 command: |
  llama-server \
    -m {model} \
    --host {host} \
    --port {port} \
    --n-gpu-layers {n_gpu_layers} \
    --batch-size {batch_size} \
    --ubatch-size {ubatch_size} \
    --cache-type-k {cache_type_k} \
    --cache-type-v {cache_type_v} \
    --no-mmap \
    --keep {keep} \
    --cache-prompt \
    --cache-reuse {cache_reuse} \
    --parallel {parallel} \
    --spec-type {spec_type} \
    --spec-draft-n-max {spec_draft_n_max} \
    --spec-draft-n-min {spec_draft_n_min} \
    --no-ui \
    --threads {threads} \
    --flash-attn {flash_attn} \
    --ctx-size {context_size} \
    --ctx-checkpoints {context_checkpoints} \
    --cache-ram {cache_ram}

 # Values for the {placeholder} fields used in `command`.
 defaults:
  host: '0.0.0.0'
  port: 8000
  n_gpu_layers: 99
  batch_size: 4096
  ubatch_size: 2048
  cache_type_k: 'q8_0'
  cache_type_v: 'q8_0'
  cache_ram: 1024
  cache_reuse: 128
  context_size: 262144
  context_checkpoints: 12
  # Quoted so YAML 1.1 parsers keep the string 'on' instead of boolean true.
  flash_attn: 'on'
  keep: -1
  parallel: 1
  threads: 1
  spec_type: 'draft-mtp'
  spec_draft_n_max: 3
  # NOTE(review): 0.6 is fractional, but the flag it feeds
  # (--spec-draft-n-min) reads like a token count -- confirm that a
  # probability-threshold flag was not intended.
  spec_draft_n_min: 0.6

 env:
  TZ: America/Los_Angeles