Update benchmark: add progress callback and Rich terminal charts

This commit is contained in:
Joao Moura
2026-05-12 17:56:19 -04:00
committed by alex-clawd
parent 4c33de86a9
commit 813173c85f
2 changed files with 315 additions and 60 deletions

View File

@@ -7,7 +7,7 @@ import json
import re
import time
from pathlib import Path
from typing import Any
from typing import Any, Callable
from pydantic import BaseModel, Field
@@ -154,6 +154,7 @@ async def run_benchmark(
cases: list[BenchmarkCase],
models: list[str] | None = None,
judge_model: str = "openai/gpt-4o-mini",
on_progress: Callable[[dict[str, Any]], None] | None = None,
) -> dict[str, list[BenchmarkResult]]:
"""Run benchmark cases against an agent definition, optionally across multiple models.
@@ -162,6 +163,7 @@ async def run_benchmark(
cases: List of benchmark cases to run.
models: Optional list of model identifiers to compare. If None, uses agent's default.
judge_model: Model to use for LLM judge evaluation.
on_progress: Optional callback receiving progress dicts with a "type" key.
Returns:
Dict mapping model name to list of BenchmarkResult.
@@ -171,13 +173,19 @@ async def run_benchmark(
if models is None or len(models) == 0:
models = [defn.get("llm", "default")]
def _emit(event: dict[str, Any]) -> None:
if on_progress:
on_progress(event)
results_by_model: dict[str, list[BenchmarkResult]] = {}
for model in models:
for mi, model in enumerate(models):
model_results: list[BenchmarkResult] = []
_emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)})
for i, case in enumerate(cases):
# Override the model and disable memory for benchmark runs
_emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input})
bench_defn = dict(defn)
if model != "default":
bench_defn["llm"] = model
@@ -187,17 +195,17 @@ async def run_benchmark(
try:
agent = _load_agent(bench_defn)
except Exception as e:
model_results.append(
BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=f"[Agent creation error: {e}]",
model=model,
passed=False,
score=0.0,
)
result = BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=f"[Agent creation error: {e}]",
model=model,
passed=False,
score=0.0,
)
model_results.append(result)
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
continue
start_ms = _current_time_ms()
@@ -212,55 +220,57 @@ async def run_benchmark(
except Exception as e:
elapsed_ms = _current_time_ms() - start_ms
model_results.append(
BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=f"[Error: {e}]",
model=model,
passed=False,
score=0.0,
response_time_ms=elapsed_ms,
)
result = BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=f"[Error: {e}]",
model=model,
passed=False,
score=0.0,
response_time_ms=elapsed_ms,
)
model_results.append(result)
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
continue
# Evaluate
passed = False
score = 0.0
if case.expected is not None:
passed, score = _check_expected(case.expected, actual)
if case.criteria is not None:
_emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)})
criteria_passed, criteria_score = await _judge_with_llm(
case.criteria, case.input, actual, judge_model
)
if case.expected is not None:
# Combine: both must pass, average scores
passed = passed and criteria_passed
score = (score + criteria_score) / 2.0
else:
passed = criteria_passed
score = criteria_score
model_results.append(
BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=actual,
model=model,
passed=passed,
score=score,
input_tokens=input_tokens,
output_tokens=output_tokens,
response_time_ms=elapsed_ms,
cost=cost,
)
result = BenchmarkResult(
case_index=i,
input=case.input,
expected=case.expected,
actual=actual,
model=model,
passed=passed,
score=score,
input_tokens=input_tokens,
output_tokens=output_tokens,
response_time_ms=elapsed_ms,
cost=cost,
)
model_results.append(result)
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms})
results_by_model[model] = model_results
total_passed = sum(1 for r in model_results if r.passed)
avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
_emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score})
return results_by_model
@@ -378,3 +388,167 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]])
lines.append(f"Best model: {best_model} (avg score: {best_score:.2f})")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Rich-based terminal charts
# ---------------------------------------------------------------------------
def _score_color(score: float) -> str:
if score >= 0.7:
return "green"
if score >= 0.4:
return "yellow"
return "red"
def _score_bar(score: float, width: int = 20) -> str:
clamped = max(0.0, min(1.0, score))
filled = round(clamped * width)
empty = width - filled
color = _score_color(score)
bar = f"[{color}]{'' * filled}[/{color}]"
if empty:
bar += f"[dim]{'' * empty}[/dim]"
return bar
def _fmt_tokens(n: int) -> str:
if n >= 1_000_000:
return f"{n / 1_000_000:.1f}M"
if n >= 1_000:
return f"{n / 1_000:.1f}k"
return str(n)
def _fmt_cost(cost: float | None) -> str:
if cost is None:
return ""
if cost < 0.01:
return f"${cost:.4f}"
return f"${cost:.2f}"
def print_results_chart(
    results: list[BenchmarkResult],
    console: Any | None = None,
) -> None:
    """Print a per-case results panel for one model's benchmark run.

    Each case becomes one row: case index, (possibly truncated) input, score
    bar, numeric score, PASS/FAIL badge, response time, and cost when at
    least one result carries a cost. A summary line with pass count, average
    score, total time, total tokens and (if positive) total cost follows.

    Args:
        results: Results to display. The panel title uses the ``model`` of
            the first result, so the list is expected to hold a single
            model's results.
        console: Object used for output (its ``width`` and ``print`` are
            used). A new ``rich.console.Console`` is created when ``None``.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.panel import Panel

    if console is None:
        console = Console()

    if not results:
        console.print("[dim]No results to display.[/]")
        return

    model = results[0].model
    has_cost = any(r.cost is not None for r in results)

    inner_w = max(console.width - 4, 60)
    bar_w = 12
    # Fixed columns around the input text: lead(2) + index(2) + gap(2) +
    # gap(2) + bar + space(1) + score(4) + gap(2) + badge(4) + gap(2) + time(6).
    overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6
    if has_cost:
        overhead += 2 + 7  # gap(2) + right-aligned cost(7)
    input_w = max(15, inner_w - overhead)

    rows: list[str] = []
    for r in results:
        text = r.input
        # Only truncate when the text does not fit; mark the cut with an
        # ellipsis so the result is exactly input_w characters wide.
        if len(text) > input_w:
            text = text[: input_w - 1] + "…"
        padding = " " * max(0, input_w - len(text))
        # Escape after measuring: the input is free-form text and must not be
        # interpreted as Rich markup (e.g. "[bold]" or a stray "[/]").
        inp_pad = escape(text) + padding

        bar = _score_bar(r.score, bar_w)
        badge = "[green]PASS[/green]" if r.passed else "[red]FAIL[/red]"
        time_s = f"{r.response_time_ms / 1000:>5.1f}s"
        cost_part = f"  [dim]{_fmt_cost(r.cost):>7}[/dim]" if has_cost else ""
        # Two-space gutters, as budgeted in `overhead` above.
        rows.append(
            f"  [dim]{r.case_index:>2}[/dim]  {inp_pad}  {bar} {r.score:.2f}"
            f"  {badge}  [dim]{time_s}[/dim]{cost_part}"
        )

    n = len(results)
    passed = sum(1 for r in results if r.passed)
    avg = sum(r.score for r in results) / n
    total_time = sum(r.response_time_ms for r in results) / 1000
    total_in = sum(r.input_tokens for r in results)
    total_out = sum(r.output_tokens for r in results)
    total_cost = sum(r.cost for r in results if r.cost is not None)

    color = _score_color(avg)
    summary_parts = [
        f"[{color}]{passed}/{n} passed[/]",
        f"avg [{color}]{avg:.2f}[/]",
        f"[dim]{total_time:.1f}s[/]",
        # Input and output totals are separated so they cannot run together.
        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]",
    ]
    if total_cost > 0:
        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]")

    body = "\n".join(rows) + "\n\n " + " · ".join(summary_parts)

    panel = Panel(
        body,
        title=f"[bold cyan]{escape(model)}[/]",
        title_align="left",
        border_style="cyan",
        padding=(1, 0),
        expand=False,
    )
    console.print(panel)
def print_comparison_chart(
    results_by_model: dict[str, list[BenchmarkResult]],
    console: Any | None = None,
) -> None:
    """Print a panel comparing average scores across models.

    Each model becomes one row: name (truncated to at most 28 characters),
    score bar for the average score, numeric average, passed/total count and
    total response time. When more than one model is present, the first
    model with the highest average score is marked with a star.

    Args:
        results_by_model: Mapping of model name to that model's results.
        console: Object used for output (its ``width`` and ``print`` are
            used). A new ``rich.console.Console`` is created when ``None``.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.panel import Panel

    if console is None:
        console = Console()

    if not results_by_model:
        console.print("[dim]No results to compare.[/]")
        return

    inner_w = max(console.width - 4, 60)
    # Width reserved for the columns to the right of the bar.
    fixed_right = 1 + 4 + 2 + 5 + 2 + 6 + 4

    models_data: list[tuple[str, int, int, float, float]] = []
    best_model = ""
    best_score = -1.0
    for model, results in results_by_model.items():
        n = len(results)
        passed = sum(1 for r in results if r.passed)
        avg = sum(r.score for r in results) / n if n else 0.0
        total_time = sum(r.response_time_ms for r in results) / 1000
        models_data.append((model, passed, n, avg, total_time))
        # Strict comparison: on ties the earliest model keeps the star.
        if avg > best_score:
            best_score = avg
            best_model = model

    max_name_len = min(max(len(m) for m, *_ in models_data), 28)
    bar_width = max(12, inner_w - max_name_len - fixed_right - 4)
    bar_width = min(bar_width, 30)

    lines: list[str] = []
    for model, passed, n, avg, total_time in models_data:
        shown = model
        # Mark truncation with an ellipsis so the cut name still fills the
        # column and is visibly abbreviated.
        if len(shown) > max_name_len:
            shown = shown[: max_name_len - 1] + "…"
        # Pad before escaping so backslashes added by escape() do not count
        # toward the column width; escaping keeps bracketed model names from
        # being parsed as Rich markup.
        name = escape(shown.ljust(max_name_len))

        bar = _score_bar(avg, bar_width)
        pass_color = _score_color(avg)
        star = " [bold green]★[/]" if model == best_model and len(models_data) > 1 else ""
        lines.append(
            f" {name} {bar} {avg:.2f} "
            f"[{pass_color}]{passed}/{n}[/] "
            f"[dim]{total_time:>5.1f}s[/]"
            f"{star}"
        )

    body = "\n".join(lines)

    panel = Panel(
        body,
        title="[bold]Model Comparison[/]",
        title_align="left",
        border_style="dim",
        padding=(1, 1),
        expand=False,
    )
    console.print(panel)

View File

@@ -235,17 +235,27 @@ def _train_new_agents(agent_files: list, n_iterations: int) -> None:
click.secho(f" Error loading agent {agent_name}: {e}", fg="red")
continue
from rich.console import Console as _Console
_console = _Console()
for iteration in range(n_iterations):
click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan")
for case in cases:
for ci, case in enumerate(cases):
user_input = case.input
click.echo(f"\n Input: {user_input}")
snippet = user_input[:60] + ("" if len(user_input) > 60 else "")
_console.print(f"\n \\[{ci + 1}/{len(cases)}] {snippet}")
try:
response = asyncio.run(agent.amessage(user_input))
import time as _time
_t0 = _time.monotonic()
with _console.status("[cyan] Running…[/]", spinner="dots"):
response = asyncio.run(agent.amessage(user_input))
_elapsed = _time.monotonic() - _t0
_console.print(f" [green]✓[/] done ({_elapsed:.1f}s)")
click.echo(f" Response: {response.content[:500]}")
except Exception as e:
click.secho(f" Error: {e}", fg="red")
_console.print(f" [red]✗[/] error: {e}")
continue
if case.criteria:
@@ -533,6 +543,70 @@ def test(
evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file)
def _make_benchmark_progress():
    """Create a progress callback with Rich spinner animation.

    The returned callable takes one event dict and dispatches on its
    ``"type"`` key:

    * ``"model_start"``: prints a header using ``model``, ``model_index``,
      ``total_models`` and ``total_cases``.
    * ``"case_start"``: prints the case counter and an input snippet (first
      60 characters) from ``case_index``, ``total_cases`` and ``input``,
      then starts a transient "running" spinner.
    * ``"judging"``: switches the active spinner's text to "judging".
    * ``"case_done"``: stops the spinner and prints ERROR/PASS/FAIL using
      ``time_ms``, ``error`` (optional), ``passed`` and ``score``.
    * ``"model_done"``: prints the summary from ``passed``, ``total`` and
      ``avg_score``.

    Events with any other type are ignored.

    Returns:
        The progress callback. It shares one console and at most one live
        spinner across calls.
    """
    from rich.console import Console
    from rich.live import Live
    from rich.markup import escape
    from rich.spinner import Spinner

    console = Console()
    # Holds the currently running Live spinner, if any, so later events can
    # update or stop it.
    state: dict = {"live": None}

    def _stop_live():
        """Stop and forget the active spinner, if there is one."""
        if state["live"] is not None:
            state["live"].stop()
            state["live"] = None

    def progress(event: dict) -> None:
        """Render a single benchmark progress event."""
        t = event["type"]

        if t == "model_start":
            _stop_live()
            # Model names are escaped so brackets are printed literally.
            label = escape(event["model"])
            if event["total_models"] > 1:
                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
            console.print(f"\n[bold cyan]▶ {label}[/] [dim]({event['total_cases']} cases)[/]")

        elif t == "case_start":
            _stop_live()
            idx = event["case_index"] + 1
            total = event["total_cases"]
            text = event["input"]
            # Escape the free-form input so it is not parsed as Rich markup,
            # and flag truncation with an ellipsis.
            snippet = escape(text[:60]) + ("…" if len(text) > 60 else "")
            console.print(f" [dim]\\[{idx}/{total}][/] {snippet}")
            state["live"] = Live(
                Spinner("dots", text=" running…", style="cyan"),
                console=console,
                transient=True,
            )
            state["live"].start()

        elif t == "judging":
            if state["live"] is not None:
                state["live"].update(
                    Spinner("dots", text=" judging…", style="cyan")
                )

        elif t == "case_done":
            _stop_live()
            elapsed_s = event["time_ms"] / 1000
            if event.get("error"):
                console.print(f" [red]✗ ERROR[/red] ({elapsed_s:.1f}s)")
            elif event["passed"]:
                console.print(f" [green]✓ PASS[/green] score={event['score']:.2f} ({elapsed_s:.1f}s)")
            else:
                console.print(f" [red]✗ FAIL[/red] score={event['score']:.2f} ({elapsed_s:.1f}s)")

        elif t == "model_done":
            _stop_live()
            p, tot, avg = event["passed"], event["total"], event["avg_score"]
            color = "green" if p == tot else ("yellow" if p > 0 else "red")
            console.print(f" [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")

    return progress
def _test_new_agents(
agent_files: list,
n_iterations: int,
@@ -544,14 +618,16 @@ def _test_new_agents(
import asyncio
from pathlib import Path
from rich.console import Console as _RichConsole
from crewai_cli.benchmark import (
format_results_table,
load_benchmark_cases,
print_results_chart,
run_benchmark,
)
_con = _RichConsole()
tests_dir = Path("tests")
# Fallback for projects created before the rename
if not tests_dir.is_dir() and Path("benchmarks").is_dir():
tests_dir = Path("benchmarks")
all_passed = True
@@ -584,6 +660,7 @@ def _test_new_agents(
cases=cases,
models=model_list,
judge_model=judge_model,
on_progress=_make_benchmark_progress(),
)
)
except Exception as e:
@@ -594,19 +671,19 @@ def _test_new_agents(
agents_tested += 1
for model_name, results in results_by_model.items():
click.echo(format_results_table(results))
_con.print()
print_results_chart(results, console=_con)
failed = [r for r in results if r.score < threshold]
if failed:
all_passed = False
click.secho(
f" FAILED: {len(failed)}/{len(results)} cases below threshold ({threshold})",
fg="red",
_con.print(
f"\n [red bold]FAILED: {len(failed)}/{len(results)} "
f"cases below threshold ({threshold})[/]"
)
else:
click.secho(
f" PASSED: all {len(results)} cases >= {threshold}",
fg="green",
_con.print(
f"\n [green bold]PASSED: all {len(results)} cases >= {threshold}[/]"
)
click.echo()
@@ -1372,13 +1449,17 @@ def benchmark(
uv_args.extend(["-m", m])
_relaunch_via_uv(uv_args)
from rich.console import Console as _RichConsole
from crewai_cli.benchmark import (
format_comparison_table,
format_results_table,
load_benchmark_cases,
print_comparison_chart,
print_results_chart,
run_benchmark,
)
_con = _RichConsole()
try:
cases = load_benchmark_cases(cases_path)
except (FileNotFoundError, ValueError) as e:
@@ -1401,20 +1482,20 @@ def benchmark(
cases=cases,
models=model_list,
judge_model=judge_model,
on_progress=_make_benchmark_progress(),
)
)
except Exception as e:
click.secho(f"Error running benchmark: {e}", fg="red")
raise SystemExit(1) from e
# Print results for each model
for model, results in results_by_model.items():
click.echo(format_results_table(results))
click.echo()
_con.print()
print_results_chart(results, console=_con)
# Print comparison if multiple models
if len(results_by_model) > 1:
click.echo(format_comparison_table(results_by_model))
_con.print()
print_comparison_chart(results_by_model, console=_con)
if __name__ == "__main__":