@scmanjarrez
Created September 10, 2025 12:33
Script to benchmark LLMs running under Ollama, based on https://github.com/MinhNgyuen/llm-benchmark and updated to fix errors
import argparse
from types import SimpleNamespace

from ollama import ChatResponse, Client


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse:
    last_element = None
    if verbose:
        # Stream the response so tokens are printed as they arrive;
        # the final chunk carries the timing/eval statistics.
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
    if not last_element:
        print("System Error: No response received from ollama")
        return None
    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))
    return last_element


def nanosec_to_sec(nanosec):
    # Ollama reports all durations in nanoseconds
    return nanosec / 1_000_000_000


def inference_stats(model_response: ChatResponse):
    # Use properties for calculations
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )
    print(
        f"""
----------------------------------------------------
{model_response.model}
\tPrompt eval: {prompt_ts:.2f} t/s
\tResponse: {response_ts:.2f} t/s
\tTotal: {total_ts:.2f} t/s
Stats:
\tPrompt tokens: {model_response.prompt_eval_count}
\tResponse tokens: {model_response.eval_count}
\tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
\tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
\tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
\tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
"""
    )


def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return
    # Sum counts and durations across runs so inference_stats
    # reports the overall throughput for the model.
    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names


def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )
    model_names = get_benchmark_models(skip_models)
    benchmarks = {}
    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed runs so the per-run and averaged stats
                # below do not crash on a None response
                continue
            responses.append(response)
            if verbose:
                print(f"Response: {response.message.content}")
            inference_stats(response)
        benchmarks[model_name] = responses
    for model_name, responses in benchmarks.items():
        average_stats(responses)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="list of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Ollama host to benchmark against, as host:port (default: localhost:11434).",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="list of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )
    args = parser.parse_args()
    CLIENT = Client(
        host=args.remote,
    )
    main()
# Example usage:
# python benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
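#
# Setup note (not part of the original gist; the host below is a placeholder):
# the script only depends on the PyPI "ollama" client package and an Ollama
# server reachable at the address passed via --remote.
#   pip install ollama
#   python benchmark.py --remote 192.168.1.50:11434 --skip-models llama2:latest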