Created
August 15, 2025 00:02
-
-
Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.
A simple script to give a summarization of a tau2 run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
tau2_summarizer.py
------------------
Summarize Tau²-Bench result files (JSON) for Dual-Control, Retail runs.

- Reads a Tau² result JSON (created via `--save-to ...`)
- Aggregates per-task token usage and durations
- Computes Pass@1, totals, and normalized "per interaction" metrics (Dual = 2 * tasks)
- Optionally computes costs if prices are provided (USD per 1M tokens for input/output)
- Emits console summary and optional CSV exports (details + summary)

Usage:
    python tau2_summarizer.py results.json
    python tau2_summarizer.py results.json --summary-csv summary.csv --details-csv details.csv
    python tau2_summarizer.py results.json --input-ppm 0.60 --output-ppm 2.20
    python tau2_summarizer.py results.json --euro-rate 0.92 --input-ppm 0.60 --output-ppm 2.20

Notes:
- Pass@1: success if reward >= 1.0 (or fallback: simulation["success"] True)
- duration is expected to be in seconds in Tau² output
"""
import argparse | |
import csv | |
import json | |
import sys | |
from pathlib import Path | |
from typing import Any, Dict, List | |
def load_results_file(filepath: Path) -> Dict[str, Any]:
    """Load a result file and return its top-level JSON object.

    Args:
        filepath: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The decoded top-level JSON object as a dict.

    Raises:
        SystemExit: If the file does not exist, cannot be read or decoded,
            does not contain valid JSON, or its top-level value is not a
            JSON object. The exit message names the offending path.
    """
    try:
        with filepath.open('r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit(f"❌ Datei nicht gefunden: {filepath}")
    except json.JSONDecodeError:
        sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
    except (OSError, UnicodeDecodeError) as err:
        # Directory instead of file, missing permissions, non-UTF-8 bytes, ...
        sys.exit(f"❌ Datei konnte nicht gelesen werden: {filepath} ({err})")
    if not isinstance(data, dict):
        # Callers use dict access on the result; a list/scalar at the top
        # level is therefore reported as an invalid file right here.
        sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
    return data
def _int(x) -> int:
    """Coerce *x* to ``int``, returning 0 when it cannot be converted.

    Args:
        x: Any value; typically a token count read from decoded JSON, which
            may be missing (``None``) or malformed.

    Returns:
        ``int(x)`` on success, otherwise 0.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or containers; ValueError: non-numeric strings and
        # NaN; OverflowError: +/- infinity. Anything else is a real bug and
        # should surface instead of being turned into a silent 0.
        return 0
def extract_usage_tokens(msg: Dict[str, Any]) -> Dict[str, int]:
    """Extract input/output token counts from a single message.

    The usage mapping is looked up under ``msg["usage"]`` first and
    ``msg["token_usage"]`` second; the first one that is a non-empty dict
    wins. Inside that mapping the input count is read from
    ``"prompt_tokens"`` (falling back to ``"input_tokens"``) and the output
    count from ``"completion_tokens"`` (falling back to ``"output_tokens"``).

    Args:
        msg: One message entry of a simulation. Values that are not dicts
            are treated as carrying no usage information.

    Returns:
        A dict with the integer keys ``"prompt_tokens"`` and
        ``"completion_tokens"``; counts that are missing or not convertible
        are reported as 0.
    """
    usage: Dict[str, Any] = {}
    if isinstance(msg, dict):
        for key in ("usage", "token_usage"):
            candidate = msg.get(key)
            # Skip null, empty, and non-dict values so that an empty "usage"
            # does not hide a populated "token_usage", and so that a
            # malformed entry cannot raise AttributeError below.
            if isinstance(candidate, dict) and candidate:
                usage = candidate
                break

    prompt = _int(usage.get("prompt_tokens")) or _int(usage.get("input_tokens"))
    completion = _int(usage.get("completion_tokens")) or _int(usage.get("output_tokens"))
    return {"prompt_tokens": prompt, "completion_tokens": completion}
def extract_sim_stats(sim: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate token usage, duration, and success for one simulation.

    Args:
        sim: One entry of the result file's ``"simulations"`` list.

    Returns:
        A dict with the keys ``task_id``, ``input_tokens``,
        ``output_tokens``, ``total_tokens``, ``duration_s``, ``success``
        (1 or 0), ``reward``, ``missing_usage_msgs`` (number of messages for
        which both token counts were 0) and ``messages_count``.
    """
    msg_list = sim.get('messages') or []
    in_tok = 0
    out_tok = 0
    missing_usage = 0
    for msg in msg_list:
        toks = extract_usage_tokens(msg)
        if toks["prompt_tokens"] == 0 and toks["completion_tokens"] == 0:
            # No usage info for this message — that's OK; it counts as zero.
            missing_usage += 1
        in_tok += toks["prompt_tokens"]
        out_tok += toks["completion_tokens"]

    # Expected to be in seconds; a missing/null duration counts as 0.
    duration = float(sim.get('duration') or 0.0)

    # "reward_info" may be present but null; dict.get's default only covers a
    # missing key, so normalise explicitly before reading the reward.
    reward_info = sim.get('reward_info')
    if not isinstance(reward_info, dict):
        reward_info = {}
    reward = float(reward_info.get('reward') or 0.0)

    # Pass@1 success detection: prefer reward >= 1.0, fall back to the
    # boolean "success" flag.
    ok = 1 if (reward >= 1.0 or sim.get("success") is True) else 0

    return {
        'task_id': sim.get('task_id'),
        'input_tokens': in_tok,
        'output_tokens': out_tok,
        'total_tokens': in_tok + out_tok,
        'duration_s': duration,
        'success': ok,
        'reward': reward,
        'missing_usage_msgs': missing_usage,
        'messages_count': len(msg_list),
    }
def calc_costs_usd(input_tokens: int, output_tokens: int, input_ppm: float, output_ppm: float) -> float:
    """Compute the total USD cost from token counts and per-million prices.

    Args:
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.
        input_ppm: Price in USD per one million input tokens.
        output_ppm: Price in USD per one million output tokens.

    Returns:
        Input cost plus output cost, in USD.
    """
    tokens_per_million = 1_000_000.0
    input_cost = (input_tokens / tokens_per_million) * input_ppm
    output_cost = (output_tokens / tokens_per_million) * output_ppm
    return input_cost + output_cost
def human_int(n: float) -> str:
    """Format *n* as a whole number with "." as the thousands separator.

    Args:
        n: The number to format; it is rounded to zero decimal places.

    Returns:
        The formatted string, e.g. ``"1.234.567"`` for ``1234567``.
    """
    grouped = format(n, ",.0f")
    # With zero decimals the only separators present are the thousands commas.
    return ".".join(grouped.split(","))
def main() -> None:
    """Command-line entry point: summarize one result file.

    Parses the CLI arguments, loads the result JSON, aggregates the
    per-simulation statistics, prints a console summary and, when requested,
    writes a per-task details CSV and a one-row summary CSV.

    Costs are only computed when both ``--input-ppm`` and ``--output-ppm``
    are given; EUR figures additionally require a non-zero ``--euro-rate``.
    Per-interaction figures are omitted (left blank in the CSV) when the
    interaction count is 0.

    Raises:
        SystemExit: If the result file contains no simulations, or (via
            argparse) if the arguments are invalid.
    """
    ap = argparse.ArgumentParser(description="Summarize Tau²-Bench results (Retail, Dual-Control)")
    ap.add_argument("results_json", type=Path, help="Pfad zur Tau² Ergebnisdatei (JSON, via --save-to erzeugt)")
    ap.add_argument("--summary-csv", type=Path, help="Pfad für aggregierte Zusammenfassung als CSV", default=None)
    ap.add_argument("--details-csv", type=Path, help="Pfad für Detailtabelle pro Task als CSV", default=None)
    ap.add_argument("--input-ppm", type=float, default=None, help="Preis USD / 1M Input-Token (optional)")
    ap.add_argument("--output-ppm", type=float, default=None, help="Preis USD / 1M Output-Token (optional)")
    ap.add_argument("--euro-rate", type=float, default=None, help="USD→EUR Umrechnungsfaktor (optional, z.B. 0.92)")
    ap.add_argument("--interactions", type=int, default=None,
                    help="Override der Interaktionszahl (Default: 2 * #Tasks im Dual-Setup)")
    args = ap.parse_args()

    if args.interactions is not None and args.interactions < 0:
        # A negative divisor would yield meaningless per-interaction figures.
        ap.error("--interactions darf nicht negativ sein")

    data = load_results_file(args.results_json)
    sims = data.get("simulations", [])
    if not sims:
        sys.exit("❌ Keine Simulationen in der Datei gefunden.")

    # Extract per-task stats
    rows: List[Dict[str, Any]] = [extract_sim_stats(sim) for sim in sims]

    # Totals
    total_input = sum(r["input_tokens"] for r in rows)
    total_output = sum(r["output_tokens"] for r in rows)
    total_tokens = total_input + total_output
    total_duration_s = sum(r["duration_s"] for r in rows)
    num_tasks = len(rows)
    num_success = sum(r["success"] for r in rows)
    pass_at_1 = (num_success / num_tasks * 100.0) if num_tasks else 0.0

    # Dual-control normalization: two interactions per task unless overridden.
    interactions = args.interactions if args.interactions is not None else (2 * num_tasks)
    time_per_interaction = (total_duration_s / interactions) if interactions else None

    # Optional cost computation
    usd_cost = None
    eur_cost = None
    cost_per_interaction_usd = None
    cost_per_interaction_eur = None
    have_input_price = args.input_ppm is not None
    have_output_price = args.output_ppm is not None
    if have_input_price and have_output_price:
        usd_cost = calc_costs_usd(total_input, total_output, args.input_ppm, args.output_ppm)
        cost_per_interaction_usd = usd_cost / interactions if interactions else None
        if args.euro_rate:
            eur_cost = usd_cost * args.euro_rate
            cost_per_interaction_eur = eur_cost / interactions if interactions else None
    elif have_input_price or have_output_price:
        # Do not silently drop a price the user supplied.
        print("⚠️ Kosten werden nur berechnet, wenn --input-ppm und --output-ppm beide angegeben sind.",
              file=sys.stderr)

    # The nested metadata objects may be missing or null in the file.
    info = data.get("info") or {}
    agent_info = (info.get("agent_info") or {}) if isinstance(info, dict) else {}
    model_name = agent_info.get("llm", "unknown") if isinstance(agent_info, dict) else "unknown"

    # Console summary
    print("📊 Tau²-Bench Zusammenfassung")
    print(f"Modell: {model_name}")
    print(f"Tasks: {num_tasks}")
    print(f"Pass@1: {pass_at_1:.1f} % ({num_success}/{num_tasks})")
    print(f"Input-Token gesamt: {human_int(total_input)}")
    print(f"Output-Token gesamt: {human_int(total_output)}")
    print(f"Gesamt-Token: {human_int(total_tokens)}")
    print(f"Laufzeit gesamt [s]: {total_duration_s:.2f}")
    print(f"Interaktionen (Dual): {interactions}")
    if time_per_interaction is not None:
        print(f"Zeit / Interaktion [s]: {time_per_interaction:.3f}")

    # Hint about messages without usage data (if relevant)
    missing_msgs = sum(r["missing_usage_msgs"] for r in rows)
    if missing_msgs:
        print(f"ℹ️ Hinweise: {missing_msgs} Nachrichten ohne Nutzungsdaten (usage) – Token wurden dort als 0 gezählt.")

    if usd_cost is not None:
        print(f"Kosten gesamt [USD]: ${usd_cost:.4f}")
        if cost_per_interaction_usd is not None:
            print(f"Kosten / Interaktion: ${cost_per_interaction_usd:.6f} (USD)")
    if eur_cost is not None:
        print(f"Kosten gesamt [EUR]: €{eur_cost:.4f} (Rate {args.euro_rate})")
        if cost_per_interaction_eur is not None:
            print(f"Kosten / Interaktion: €{cost_per_interaction_eur:.6f}")

    # Optional CSV exports
    if args.details_csv:
        with args.details_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["task_id", "input_tokens", "output_tokens", "total_tokens",
                        "duration_s", "success", "reward", "messages_count", "missing_usage_msgs"])
            for r in rows:
                w.writerow([r["task_id"], r["input_tokens"], r["output_tokens"], r["total_tokens"],
                            r["duration_s"], r["success"], r["reward"], r["messages_count"],
                            r["missing_usage_msgs"]])
        print(f"💾 Details CSV gespeichert: {args.details_csv}")

    if args.summary_csv:
        with args.summary_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            headers = [
                "model", "tasks", "pass_at_1_percent",
                "input_tokens", "output_tokens", "total_tokens",
                "duration_total_s", "interactions_dual", "time_per_interaction_s",
            ]
            row = [
                model_name, num_tasks, f"{pass_at_1:.1f}",
                total_input, total_output, total_tokens,
                f"{total_duration_s:.2f}", interactions,
                f"{time_per_interaction:.6f}" if time_per_interaction is not None else "",
            ]
            if usd_cost is not None:
                headers += ["cost_total_usd", "cost_per_interaction_usd"]
                # The per-interaction value is None when interactions == 0;
                # write an empty cell instead of formatting None.
                row += [
                    f"{usd_cost:.6f}",
                    f"{cost_per_interaction_usd:.6f}" if cost_per_interaction_usd is not None else "",
                ]
            if eur_cost is not None:
                headers += ["cost_total_eur", "cost_per_interaction_eur"]
                row += [
                    f"{eur_cost:.6f}",
                    f"{cost_per_interaction_eur:.6f}" if cost_per_interaction_eur is not None else "",
                ]
            w.writerow(headers)
            w.writerow(row)
        print(f"💾 Summary CSV gespeichert: {args.summary_csv}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment