@scmanjarrez
Created September 10, 2025 12:33
Script to benchmark LLMs running under Ollama, based on https://github.com/MinhNgyuen/llm-benchmark and updated to fix errors
import argparse
from types import SimpleNamespace

from ollama import ChatResponse, Client


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse:
    last_element = None
    if verbose:
        # Stream the response so tokens are printed as they arrive;
        # the final chunk carries the timing/eval statistics.
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
    if not last_element:
        print("System Error: No response received from ollama")
        return None
    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))
    return last_element


def nanosec_to_sec(nanosec):
    # Ollama reports all durations in nanoseconds
    return nanosec / 1_000_000_000


def inference_stats(model_response: ChatResponse):
    # Use properties for calculations
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )
    print(
        f"""
----------------------------------------------------
{model_response.model}
\tPrompt eval: {prompt_ts:.2f} t/s
\tResponse: {response_ts:.2f} t/s
\tTotal: {total_ts:.2f} t/s
Stats:
\tPrompt tokens: {model_response.prompt_eval_count}
\tResponse tokens: {model_response.eval_count}
\tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
\tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
\tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
\tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
"""
    )


def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return
    # Sum counts and durations across runs so inference_stats
    # reports the overall throughput for the model.
    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names


def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )
    model_names = get_benchmark_models(skip_models)
    benchmarks = {}
    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed runs so the per-run and averaged stats
                # below do not crash on a None response
                continue
            responses.append(response)
            if verbose:
                print(f"Response: {response.message.content}")
            inference_stats(response)
        benchmarks[model_name] = responses
    for model_name, responses in benchmarks.items():
        average_stats(responses)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="list of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Ollama host to benchmark against, as host:port (default: localhost:11434).",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="list of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )
    args = parser.parse_args()
    CLIENT = Client(
        host=args.remote,
    )
    main()
# Example usage:
# python benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
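#
# Setup note (not part of the original gist; the host below is a placeholder):
# the script only depends on the PyPI "ollama" client package and an Ollama
# server reachable at the address passed via --remote.
#   pip install ollama
#   python benchmark.py --remote 192.168.1.50:11434 --skip-models llama2:latest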