Script to benchmark LLMs served through Ollama, based on https://github.com/MinhNgyuen/llm-benchmark and updated to fix errors.
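The only third-party dependency is the ollama Python client (the `from ollama import ChatResponse, Client` import below, typically installed with `pip install ollama`), and a running Ollama server is assumed; by default the script connects to localhost:11434, which can be overridden with the -r/--remote flag.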
import argparse
from types import SimpleNamespace

from ollama import ChatResponse, Client


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse | None:
    """Send a single prompt to a model and return the final ChatResponse, or None on failure."""
    last_element = None
    if verbose:
        # Stream the answer so tokens are printed as they arrive; the final
        # chunk carries the token counts and timing statistics.
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
    if not last_element:
        print("System Error: No response received from ollama")
        return None
    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))
    return last_element


def nanosec_to_sec(nanosec):
    return nanosec / 1_000_000_000

def inference_stats(model_response: ChatResponse):
    # Use properties for calculations
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )

    print(
        f"""
        ----------------------------------------------------
        {model_response.model}
        \tPrompt eval: {prompt_ts:.2f} t/s
        \tResponse: {response_ts:.2f} t/s
        \tTotal: {total_ts:.2f} t/s
        Stats:
        \tPrompt tokens: {model_response.prompt_eval_count}
        \tResponse tokens: {model_response.eval_count}
        \tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
        \tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
        \tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
        \tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
        ----------------------------------------------------
        """
    )

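# Worked example of the inference_stats throughput formula above (illustrative
# numbers, not measured data): a response with eval_count=512 tokens and
# eval_duration=4_000_000_000 ns (4 s) is reported as 512 / 4 = 128.00 t/s;
# the prompt and total rates follow the same pattern.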
def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return

    # Aggregate token counts and durations across runs, then reuse
    # inference_stats to print the combined throughput.
    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names

def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )

    model_names = get_benchmark_models(skip_models)
    benchmarks = {}

    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed runs so averaging does not crash on None entries.
                continue
            responses.append(response)
            if verbose:
                print(f"Response: {response.message.content}")
                inference_stats(response)
        benchmarks[model_name] = responses

    for model_name, responses in benchmarks.items():
        average_stats(responses)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="List of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Host and port of the Ollama server to benchmark (default: localhost:11434).",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="List of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )
    args = parser.parse_args()

    CLIENT = Client(
        host=args.remote,
    )
    main()

# Example usage:
# python benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
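# To benchmark a remote Ollama server instead of the local default, pass its host and
# port with -r/--remote (the address below is illustrative, not from the original gist):
# python benchmark.py -r 192.168.1.50:11434 -p "Why is the sky blue?"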