@julien-c
Created May 13, 2026 18:22
pi-mono extension: llama.cpp provider with dynamic model + context window discovery
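
// Registers a "llama-cpp" provider that talks to a local llama-server instance
// (default http://localhost:8080/v1, override with LLAMA_BASE_URL). Models and
// their context windows are discovered at runtime from /v1/models and /props.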
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import { Type } from "typebox";
import { Compile } from "typebox/compile";
const DEFAULT_BASE_URL = "http://localhost:8080/v1";
const PROPS_TIMEOUT_MS = 120_000;
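
// Minimal schema for the llama-server /v1/models response: only the fields we
// read (model id, load status, input modalities) are validated.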
const ModelsResponseSchema = Type.Object({
  data: Type.Optional(
    Type.Array(
      Type.Object({
        id: Type.String(),
        status: Type.Optional(
          Type.Object({
            value: Type.Optional(Type.String()),
          }),
        ),
        architecture: Type.Optional(
          Type.Object({
            input_modalities: Type.Optional(Type.Array(Type.String())),
          }),
        ),
      }),
    ),
  ),
});
const validateModelsResponse = Compile(ModelsResponseSchema);
type LlamaModel = NonNullable<Parameters<ExtensionAPI["registerProvider"]>[1]["models"]>[number];
export default async function (pi: ExtensionAPI) {
  let currentModels: LlamaModel[] = [];
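
  // /llama-version: run `llama-server --version` and surface the version line in the UI.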
  pi.registerCommand("llama-version", {
    description: "Print llama-server --version output",
    handler: async (_args, ctx) => {
      const result = await pi.exec("llama-server", ["--version"]);
      const output = `${result.stderr ?? ""}\n${result.stdout ?? ""}`;
      const versionLine = output
        .split("\n")
        .map((l) => l.trim())
        .find((l) => /^version:\s/i.test(l));
      ctx.ui.notify(
        versionLine ?? `llama-server exited with code ${result.code}`,
        versionLine ? "info" : "error",
      );
    },
  });
  const baseUrl = (process.env.LLAMA_BASE_URL ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
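
  // Query /v1/models and (re)register the "llama-cpp" provider with whatever models the
  // server reports. Context windows discovered earlier are carried over between refreshes.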
  async function refreshProvider(): Promise<void> {
    try {
      const response = await fetch(`${baseUrl}/models`);
      if (!response.ok) {
        console.warn(`[llama-cpp] ${baseUrl}/models returned ${response.status}`);
        return;
      }
      const payload: unknown = await response.json();
      if (!validateModelsResponse.Check(payload)) {
        const errors = [...validateModelsResponse.Errors(payload)]
          .map((e) => `${e.path} ${e.message}`)
          .join("; ");
        console.warn(`[llama-cpp] invalid /models response: ${errors}`);
        return;
      }
      const previousById = new Map(currentModels.map((m) => [m.id, m]));
      currentModels = (payload.data ?? []).map((model) => {
        const isLoaded = model.status?.value === "loaded";
        const modalities = model.architecture?.input_modalities ?? ["text"];
        const input = modalities.filter((m): m is "text" | "image" => m === "text" || m === "image");
        const suffixes: string[] = [];
        if (input.includes("image")) suffixes.push("(image)");
        if (isLoaded) suffixes.push("(loaded ✅)");
        return {
          id: model.id,
          name: suffixes.length > 0 ? `${model.id} ${suffixes.join(" ")}` : model.id,
          input,
          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
          contextWindow: previousById.get(model.id)?.contextWindow ?? 0,
          // maxTokens: -1,
        } as LlamaModel;
      });
      if (currentModels.length === 0) {
        console.warn(`[llama-cpp] no models returned from ${baseUrl}/models`);
        return;
      }
      pi.registerProvider("llama-cpp", {
        name: "llama.cpp",
        baseUrl,
        apiKey: "LLAMA_API_KEY",
        api: "openai-completions",
        models: currentModels,
      });
    } catch (error) {
      console.warn(`[llama-cpp] failed to reach ${baseUrl}/models: ${(error as Error).message}`);
    }
  }
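
  // Context-window discovery: /props reports default_generation_settings.n_ctx for a given
  // model. Each model is probed at most once, and concurrent probes are deduplicated.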
  const discoveredContext = new Set<string>();
  const pendingContext = new Set<string>();
  async function discoverContextWindow(modelId: string): Promise<void> {
    if (discoveredContext.has(modelId) || pendingContext.has(modelId)) return;
    const model = currentModels.find((m) => m.id === modelId);
    if (!model) return;
    pendingContext.add(modelId);
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), PROPS_TIMEOUT_MS);
    const propsBase = `${baseUrl.replace(/\/v1$/, "")}/props?model=${encodeURIComponent(modelId)}&autoload=`;
    try {
      let response = await fetch(`${propsBase}false`, { signal: controller.signal });
      if (response.status === 400 || response.status === 404) {
        response = await fetch(`${propsBase}true`, { signal: controller.signal });
      }
      if (!response.ok) {
        console.warn(`[llama-cpp] /props for ${modelId} returned ${response.status}`);
        return;
      }
      const data = (await response.json()) as { default_generation_settings?: { n_ctx?: number } };
      const nCtx = data.default_generation_settings?.n_ctx;
      if (typeof nCtx === "number" && nCtx > 0) {
        model.contextWindow = nCtx;
        discoveredContext.add(modelId);
        pi.registerProvider("llama-cpp", {
          name: "llama.cpp",
          baseUrl,
          apiKey: "LLAMA_API_KEY",
          api: "openai-completions",
          models: currentModels,
        });
        console.log(`[llama-cpp] contextWindow=${nCtx} for ${modelId}`);
      }
    } catch (error) {
      const err = error as Error;
      const msg = err.name === "AbortError" ? "timeout" : err.message;
      console.warn(`[llama-cpp] /props for ${modelId} failed: ${msg}`);
    } finally {
      clearTimeout(timer);
      pendingContext.delete(modelId);
    }
  }
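
  // Discover the model list once at startup.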
  await refreshProvider();
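
  // Refresh the model list whenever the user opens the model picker (/model or /models).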
pi.on("input", async (event) => {
const trimmed = event.text.trim().toLowerCase();
if (trimmed === "/model" || trimmed === "/models") {
await refreshProvider();
}
});
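
  // When a llama.cpp model is selected, look up its context window in the background.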
pi.on("model_select", (event) => {
if (event.model.provider !== "llama-cpp") return;
void discoverContextWindow(event.model.id);
});
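
  // Before each provider request, kick off discovery for the requested model
  // (a no-op once its context window is already known).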
pi.on("before_provider_request", (event) => {
const modelId = (event.payload as { model?: unknown })?.model;
if (typeof modelId === "string") {
void discoverContextWindow(modelId);
}
});
}