Skip to content

Instantly share code, notes, and snippets.

@bizrockman
Created August 15, 2025 00:02
Show Gist options
  • Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.
Save bizrockman/155e8f6830d6daf03b2328022d1fda9b to your computer and use it in GitHub Desktop.
A simple script that summarizes the results of a tau2 (Tau²-Bench) run
#!/usr/bin/env python3
"""
tau2_summarizer.py
------------------
Summarize Tau²-Bench result files (JSON) for Dual-Control, Retail runs.
- Reads a Tau² result JSON (created via `--save-to ...`)
- Aggregates per-task token usage and durations
- Computes Pass@1, totals, and normalized "per interaction" metrics (Dual = 2 * tasks)
- Optionally computes costs if prices are provided (USD per 1M tokens for input/output)
- Emits console summary and optional CSV exports (details + summary)
Usage:
python tau2_summarizer.py results.json
python tau2_summarizer.py results.json --summary-csv summary.csv --details-csv details.csv
python tau2_summarizer.py results.json --input-ppm 0.60 --output-ppm 2.20
python tau2_summarizer.py results.json --euro-rate 0.92 --input-ppm 0.60 --output-ppm 2.20
Notes:
- Pass@1: a simulation counts as a success if its reward >= 1.0 (fallback: simulation["success"] is True)
- duration is expected to be in seconds in Tau² output
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Any, Dict, List
def load_results_file(filepath: Path) -> Dict[str, Any]:
    """Load a Tau² result file and return its top-level JSON object.

    Args:
        filepath: Path to the JSON result file.

    Returns:
        The decoded top-level JSON object as a dict.

    Raises:
        SystemExit: If the file does not exist, cannot be read, is not
            valid UTF-8 encoded JSON, or its top-level value is not a
            JSON object.
    """
    try:
        with filepath.open('r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        sys.exit(f"❌ Datei nicht gefunden: {filepath}")
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are just another form of "not a valid JSON file".
        sys.exit(f"❌ Ungültiges JSON-Format: {filepath}")
    except OSError as err:
        # E.g. permission denied or the path is a directory.
        sys.exit(f"❌ Datei konnte nicht gelesen werden: {filepath} ({err})")
    # Callers use dict access on the result, so reject lists/scalars here
    # with a clear message instead of failing later with AttributeError.
    if not isinstance(data, dict):
        sys.exit(f"❌ Ungültiges JSON-Format (Objekt auf oberster Ebene erwartet): {filepath}")
    return data
def _int(x) -> int:
try:
return int(x)
except Exception:
return 0
def extract_usage_tokens(msg: Dict[str, Any]) -> Dict[str, int]:
    """Extract input/output token counts from a single message.

    The usage container is looked up under ``msg["usage"]`` first and
    ``msg["token_usage"]`` second; the first one that is a non-empty dict
    is used. Within it, ``prompt_tokens`` / ``input_tokens`` and
    ``completion_tokens`` / ``output_tokens`` are accepted as key names.

    Args:
        msg: One message record. Anything that is not a dict is treated
            as carrying no usage information.

    Returns:
        ``{"prompt_tokens": int, "completion_tokens": int}``; both values
        are 0 when no usable usage data is present.
    """
    usage: Dict[str, Any] = {}
    if isinstance(msg, dict):
        for key in ("usage", "token_usage"):
            candidate = msg.get(key)
            # Skip missing, empty or malformed (non-dict) containers so a
            # bad value under one key neither crashes nor hides the other.
            if isinstance(candidate, dict) and candidate:
                usage = candidate
                break

    # `_int` yields 0 for missing/invalid values, so `or` falls through to
    # the alternate key name and finally to 0.
    prompt = _int(usage.get("prompt_tokens")) or _int(usage.get("input_tokens"))
    completion = _int(usage.get("completion_tokens")) or _int(usage.get("output_tokens"))
    return {"prompt_tokens": prompt, "completion_tokens": completion}
def extract_sim_stats(sim: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate token usage, duration and success for one simulation.

    Args:
        sim: One entry of the result file's ``"simulations"`` list.

    Returns:
        A dict with the keys ``task_id``, ``input_tokens``,
        ``output_tokens``, ``total_tokens``, ``duration_s``, ``success``
        (1 or 0), ``reward``, ``missing_usage_msgs`` and ``messages_count``.
    """
    msg_list = sim.get('messages') or []
    in_tok = 0
    out_tok = 0
    missing_usage = 0
    for msg in msg_list:
        toks = extract_usage_tokens(msg)
        if toks["prompt_tokens"] == 0 and toks["completion_tokens"] == 0:
            # No usage info for this message; it simply contributes zero.
            missing_usage += 1
        in_tok += toks["prompt_tokens"]
        out_tok += toks["completion_tokens"]

    duration = float(sim.get('duration') or 0.0)  # seconds

    # `reward_info` may be present but null; `.get('reward_info', {})` would
    # then return None and crash, so normalise anything non-dict to {}.
    reward_info = sim.get('reward_info')
    if not isinstance(reward_info, dict):
        reward_info = {}
    reward = float(reward_info.get('reward') or 0.0)

    # Pass@1 success detection: prefer reward >= 1.0, fall back to the
    # boolean "success" flag.
    ok = 1 if (reward >= 1.0 or sim.get("success") is True) else 0

    return {
        'task_id': sim.get('task_id'),
        'input_tokens': in_tok,
        'output_tokens': out_tok,
        'total_tokens': in_tok + out_tok,
        'duration_s': duration,
        'success': ok,
        'reward': reward,
        'missing_usage_msgs': missing_usage,
        'messages_count': len(msg_list),
    }
def calc_costs_usd(input_tokens: int, output_tokens: int, input_ppm: float, output_ppm: float) -> float:
    """Compute the total USD cost of a run.

    ``input_ppm`` and ``output_ppm`` are prices per one million tokens;
    the result is the input cost plus the output cost.
    """
    tokens_per_million = 1_000_000.0
    input_cost = (input_tokens / tokens_per_million) * input_ppm
    output_cost = (output_tokens / tokens_per_million) * output_ppm
    return input_cost + output_cost
def human_int(n: float) -> str:
    """Format *n* with no decimals and '.' as the thousands separator."""
    grouped = format(n, ",.0f")
    return ".".join(grouped.split(","))
def main() -> None:
    """Command-line entry point: summarize one Tau²-Bench result file.

    Parses the CLI arguments, loads the result JSON, aggregates token
    usage, durations and Pass@1 over all simulations, prints a summary to
    stdout and optionally writes a per-task details CSV and a one-row
    summary CSV.

    Costs are only computed when both ``--input-ppm`` and ``--output-ppm``
    are given; EUR figures additionally require a non-zero ``--euro-rate``.
    Per-interaction values are omitted (empty CSV cell) when the
    interaction count is 0.

    Raises:
        SystemExit: If the file contains no simulations.
    """

    def fmt6(value) -> str:
        # Per-interaction values are None when interactions == 0; emit an
        # empty cell instead of crashing on f"{None:.6f}".
        return "" if value is None else f"{value:.6f}"

    ap = argparse.ArgumentParser(description="Summarize Tau²-Bench results (Retail, Dual-Control)")
    ap.add_argument("results_json", type=Path, help="Pfad zur Tau² Ergebnisdatei (JSON, via --save-to erzeugt)")
    ap.add_argument("--summary-csv", type=Path, help="Pfad für aggregierte Zusammenfassung als CSV", default=None)
    ap.add_argument("--details-csv", type=Path, help="Pfad für Detailtabelle pro Task als CSV", default=None)
    ap.add_argument("--input-ppm", type=float, default=None, help="Preis USD / 1M Input-Token (optional)")
    ap.add_argument("--output-ppm", type=float, default=None, help="Preis USD / 1M Output-Token (optional)")
    ap.add_argument("--euro-rate", type=float, default=None, help="USD→EUR Umrechnungsfaktor (optional, z.B. 0.92)")
    ap.add_argument("--interactions", type=int, default=None,
                    help="Override der Interaktionszahl (Default: 2 * #Tasks im Dual-Setup)")
    args = ap.parse_args()

    data = load_results_file(args.results_json)
    sims = data.get("simulations", [])
    if not sims:
        sys.exit("❌ Keine Simulationen in der Datei gefunden.")

    # Extract per-task stats
    rows: List[Dict[str, Any]] = [extract_sim_stats(sim) for sim in sims]

    # Totals
    total_input = sum(r["input_tokens"] for r in rows)
    total_output = sum(r["output_tokens"] for r in rows)
    total_tokens = total_input + total_output
    total_duration_s = sum(r["duration_s"] for r in rows)
    num_tasks = len(rows)
    num_success = sum(r["success"] for r in rows)
    pass_at_1 = (num_success / num_tasks * 100.0) if num_tasks else 0.0

    # Dual-control normalization: two interactions per task unless overridden.
    interactions = args.interactions if args.interactions is not None else (2 * num_tasks)
    time_per_interaction = (total_duration_s / interactions) if interactions else None

    # Optional cost computation
    usd_cost = None
    eur_cost = None
    cost_per_interaction_usd = None
    cost_per_interaction_eur = None
    if args.input_ppm is not None and args.output_ppm is not None:
        usd_cost = calc_costs_usd(total_input, total_output, args.input_ppm, args.output_ppm)
        cost_per_interaction_usd = usd_cost / interactions if interactions else None
        if args.euro_rate:
            eur_cost = usd_cost * args.euro_rate
            cost_per_interaction_eur = eur_cost / interactions if interactions else None

    # "info" / "agent_info" may be missing or null; fall back to "unknown"
    # instead of raising AttributeError on None.
    info = data.get("info")
    agent_info = info.get("agent_info") if isinstance(info, dict) else None
    model_name = agent_info.get("llm", "unknown") if isinstance(agent_info, dict) else "unknown"

    # Console summary
    print("📊 Tau²-Bench Zusammenfassung")
    print(f"Modell: {model_name}")
    print(f"Tasks: {num_tasks}")
    print(f"Pass@1: {pass_at_1:.1f} % ({num_success}/{num_tasks})")
    print(f"Input-Token gesamt: {human_int(total_input)}")
    print(f"Output-Token gesamt: {human_int(total_output)}")
    print(f"Gesamt-Token: {human_int(total_tokens)}")
    print(f"Laufzeit gesamt [s]: {total_duration_s:.2f}")
    print(f"Interaktionen (Dual): {interactions}")
    if time_per_interaction is not None:
        print(f"Zeit / Interaktion [s]: {time_per_interaction:.3f}")

    # Hint about messages without usage data (if any)
    missing_msgs = sum(r["missing_usage_msgs"] for r in rows)
    if missing_msgs:
        print(f"ℹ️ Hinweise: {missing_msgs} Nachrichten ohne Nutzungsdaten (usage) – Token wurden dort als 0 gezählt.")

    if usd_cost is not None:
        print(f"Kosten gesamt [USD]: ${usd_cost:.4f}")
        if cost_per_interaction_usd is not None:
            print(f"Kosten / Interaktion: ${cost_per_interaction_usd:.6f} (USD)")
    if eur_cost is not None:
        print(f"Kosten gesamt [EUR]: €{eur_cost:.4f} (Rate {args.euro_rate})")
        if cost_per_interaction_eur is not None:
            print(f"Kosten / Interaktion: €{cost_per_interaction_eur:.6f}")

    # Optional CSV exports
    if args.details_csv:
        with args.details_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["task_id", "input_tokens", "output_tokens", "total_tokens",
                        "duration_s", "success", "reward", "messages_count", "missing_usage_msgs"])
            for r in rows:
                w.writerow([r["task_id"], r["input_tokens"], r["output_tokens"], r["total_tokens"],
                            r["duration_s"], r["success"], r["reward"], r["messages_count"],
                            r["missing_usage_msgs"]])
        print(f"💾 Details CSV gespeichert: {args.details_csv}")

    if args.summary_csv:
        with args.summary_csv.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            headers = [
                "model", "tasks", "pass_at_1_percent",
                "input_tokens", "output_tokens", "total_tokens",
                "duration_total_s", "interactions_dual", "time_per_interaction_s",
            ]
            row = [
                model_name, num_tasks, f"{pass_at_1:.1f}",
                total_input, total_output, total_tokens,
                f"{total_duration_s:.2f}", interactions,
                fmt6(time_per_interaction),
            ]
            # Cost columns only exist when the corresponding totals were computed.
            if usd_cost is not None:
                headers += ["cost_total_usd", "cost_per_interaction_usd"]
                row += [f"{usd_cost:.6f}", fmt6(cost_per_interaction_usd)]
            if eur_cost is not None:
                headers += ["cost_total_eur", "cost_per_interaction_eur"]
                row += [f"{eur_cost:.6f}", fmt6(cost_per_interaction_eur)]
            w.writerow(headers)
            w.writerow(row)
        print(f"💾 Summary CSV gespeichert: {args.summary_csv}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment