mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-07-04 06:29:22 +00:00
Update benchmark
This commit is contained in:
@@ -7,7 +7,7 @@ import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Callable
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
@@ -154,6 +154,7 @@ async def run_benchmark(
|
||||
cases: list[BenchmarkCase],
|
||||
models: list[str] | None = None,
|
||||
judge_model: str = "openai/gpt-4o-mini",
|
||||
on_progress: Callable[[dict[str, Any]], None] | None = None,
|
||||
) -> dict[str, list[BenchmarkResult]]:
|
||||
"""Run benchmark cases against an agent definition, optionally across multiple models.
|
||||
|
||||
@@ -162,6 +163,7 @@ async def run_benchmark(
|
||||
cases: List of benchmark cases to run.
|
||||
models: Optional list of model identifiers to compare. If None, uses agent's default.
|
||||
judge_model: Model to use for LLM judge evaluation.
|
||||
on_progress: Optional callback receiving progress dicts with a "type" key.
|
||||
|
||||
Returns:
|
||||
Dict mapping model name to list of BenchmarkResult.
|
||||
@@ -171,13 +173,19 @@ async def run_benchmark(
|
||||
if models is None or len(models) == 0:
|
||||
models = [defn.get("llm", "default")]
|
||||
|
||||
def _emit(event: dict[str, Any]) -> None:
|
||||
if on_progress:
|
||||
on_progress(event)
|
||||
|
||||
results_by_model: dict[str, list[BenchmarkResult]] = {}
|
||||
|
||||
for model in models:
|
||||
for mi, model in enumerate(models):
|
||||
model_results: list[BenchmarkResult] = []
|
||||
_emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)})
|
||||
|
||||
for i, case in enumerate(cases):
|
||||
# Override the model and disable memory for benchmark runs
|
||||
_emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input})
|
||||
|
||||
bench_defn = dict(defn)
|
||||
if model != "default":
|
||||
bench_defn["llm"] = model
|
||||
@@ -187,17 +195,17 @@ async def run_benchmark(
|
||||
try:
|
||||
agent = _load_agent(bench_defn)
|
||||
except Exception as e:
|
||||
model_results.append(
|
||||
BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=f"[Agent creation error: {e}]",
|
||||
model=model,
|
||||
passed=False,
|
||||
score=0.0,
|
||||
)
|
||||
result = BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=f"[Agent creation error: {e}]",
|
||||
model=model,
|
||||
passed=False,
|
||||
score=0.0,
|
||||
)
|
||||
model_results.append(result)
|
||||
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
|
||||
continue
|
||||
|
||||
start_ms = _current_time_ms()
|
||||
@@ -212,55 +220,57 @@ async def run_benchmark(
|
||||
|
||||
except Exception as e:
|
||||
elapsed_ms = _current_time_ms() - start_ms
|
||||
model_results.append(
|
||||
BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=f"[Error: {e}]",
|
||||
model=model,
|
||||
passed=False,
|
||||
score=0.0,
|
||||
response_time_ms=elapsed_ms,
|
||||
)
|
||||
result = BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=f"[Error: {e}]",
|
||||
model=model,
|
||||
passed=False,
|
||||
score=0.0,
|
||||
response_time_ms=elapsed_ms,
|
||||
)
|
||||
model_results.append(result)
|
||||
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
|
||||
continue
|
||||
|
||||
# Evaluate
|
||||
passed = False
|
||||
score = 0.0
|
||||
|
||||
if case.expected is not None:
|
||||
passed, score = _check_expected(case.expected, actual)
|
||||
if case.criteria is not None:
|
||||
_emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)})
|
||||
criteria_passed, criteria_score = await _judge_with_llm(
|
||||
case.criteria, case.input, actual, judge_model
|
||||
)
|
||||
if case.expected is not None:
|
||||
# Combine: both must pass, average scores
|
||||
passed = passed and criteria_passed
|
||||
score = (score + criteria_score) / 2.0
|
||||
else:
|
||||
passed = criteria_passed
|
||||
score = criteria_score
|
||||
|
||||
model_results.append(
|
||||
BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=actual,
|
||||
model=model,
|
||||
passed=passed,
|
||||
score=score,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
response_time_ms=elapsed_ms,
|
||||
cost=cost,
|
||||
)
|
||||
result = BenchmarkResult(
|
||||
case_index=i,
|
||||
input=case.input,
|
||||
expected=case.expected,
|
||||
actual=actual,
|
||||
model=model,
|
||||
passed=passed,
|
||||
score=score,
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
response_time_ms=elapsed_ms,
|
||||
cost=cost,
|
||||
)
|
||||
model_results.append(result)
|
||||
_emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms})
|
||||
|
||||
results_by_model[model] = model_results
|
||||
total_passed = sum(1 for r in model_results if r.passed)
|
||||
avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
|
||||
_emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score})
|
||||
|
||||
return results_by_model
|
||||
|
||||
@@ -378,3 +388,167 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]])
|
||||
lines.append(f"Best model: {best_model} (avg score: {best_score:.2f})")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rich-based terminal charts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _score_color(score: float) -> str:
|
||||
if score >= 0.7:
|
||||
return "green"
|
||||
if score >= 0.4:
|
||||
return "yellow"
|
||||
return "red"
|
||||
|
||||
|
||||
def _score_bar(score: float, width: int = 20) -> str:
    """Build a Rich-markup progress bar for ``score``.

    Args:
        score: Score to draw; it is clamped to the range 0.0-1.0 for the
            bar length, while the color is chosen from the unclamped value.
        width: Total number of bar cells (filled plus empty).

    Returns:
        A markup string with the filled cells in the score color, followed
        by dimmed empty cells when the bar is not completely full.
    """
    fraction = min(1.0, score)
    fraction = max(0.0, fraction)
    n_filled = round(fraction * width)
    n_empty = width - n_filled
    tint = _score_color(score)

    segments = [f"[{tint}]{'█' * n_filled}[/{tint}]"]
    if n_empty:
        segments.append(f"[dim]{'░' * n_empty}[/dim]")
    return "".join(segments)
|
||||
|
||||
|
||||
def _fmt_tokens(n: int) -> str:
|
||||
if n >= 1_000_000:
|
||||
return f"{n / 1_000_000:.1f}M"
|
||||
if n >= 1_000:
|
||||
return f"{n / 1_000:.1f}k"
|
||||
return str(n)
|
||||
|
||||
|
||||
def _fmt_cost(cost: float | None) -> str:
|
||||
if cost is None:
|
||||
return ""
|
||||
if cost < 0.01:
|
||||
return f"${cost:.4f}"
|
||||
return f"${cost:.2f}"
|
||||
|
||||
|
||||
def print_results_chart(
    results: list[BenchmarkResult],
    console: Any | None = None,
) -> None:
    """Print one model's benchmark results as a Rich panel.

    Each result becomes one row (case index, input text, score bar, score,
    PASS/FAIL badge, response time and, when any result has a cost, the
    cost), followed by a summary line with totals.

    Args:
        results: Results to display. The model name shown in the panel
            title is taken from the first entry.
        console: Rich console to print to; a new one is created when None.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.panel import Panel

    if console is None:
        console = Console()

    if not results:
        console.print("[dim]No results to display.[/]")
        return

    model = results[0].model
    has_cost = any(r.cost is not None for r in results)

    # Width budget: everything except the input column has a fixed width,
    # so the input column receives whatever is left (at least 15 cells).
    inner_w = max(console.width - 4, 60)
    bar_w = 12
    overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6
    if has_cost:
        overhead += 2 + 7
    input_w = max(15, inner_w - overhead)

    rows: list[str] = []
    for r in results:
        # Collapse newlines/tabs so a multi-line input stays on a single row.
        text = " ".join(r.input.split())
        # Truncate only when the text does not fit; an exact fit is kept whole.
        if len(text) > input_w:
            text = text[: input_w - 1] + "…"
        # Pad on the visible text first, then escape: the input is user data
        # and must not be interpreted as Rich markup (e.g. "[bold]" or "[/]").
        inp_pad = escape(text.ljust(input_w))
        bar = _score_bar(r.score, bar_w)
        badge = "[green]PASS[/green]" if r.passed else "[red]FAIL[/red]"
        time_s = f"{r.response_time_ms / 1000:>5.1f}s"
        cost_part = f" [dim]{_fmt_cost(r.cost):>7}[/dim]" if has_cost else ""
        rows.append(
            f" [dim]{r.case_index:>2}[/dim] {inp_pad} {bar} {r.score:.2f} {badge} [dim]{time_s}[/dim]{cost_part}"
        )

    n = len(results)
    passed = sum(1 for r in results if r.passed)
    avg = sum(r.score for r in results) / n
    total_time = sum(r.response_time_ms for r in results) / 1000
    total_in = sum(r.input_tokens for r in results)
    total_out = sum(r.output_tokens for r in results)
    total_cost = sum(r.cost for r in results if r.cost is not None)

    color = _score_color(avg)
    summary_parts = [
        f"[{color}]{passed}/{n} passed[/]",
        f"avg [{color}]{avg:.2f}[/]",
        f"[dim]{total_time:.1f}s[/]",
        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]",
    ]
    # The total cost is only worth showing when something was actually charged.
    if total_cost > 0:
        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]")

    body = "\n".join(rows) + "\n\n " + " · ".join(summary_parts)
    panel = Panel(
        body,
        # The model identifier is caller-supplied text, so escape it as well.
        title=f"[bold cyan]{escape(model)}[/]",
        title_align="left",
        border_style="cyan",
        padding=(1, 0),
        expand=False,
    )
    console.print(panel)
|
||||
|
||||
|
||||
def print_comparison_chart(
    results_by_model: dict[str, list[BenchmarkResult]],
    console: Any | None = None,
) -> None:
    """Print a Rich panel comparing average benchmark scores across models.

    Each model gets one row with its name, a score bar, the average score,
    the passed/total count and the total response time. When more than one
    model is present, the model with the highest average score is starred
    (the first one wins a tie).

    Args:
        results_by_model: Mapping of model name to that model's results.
        console: Rich console to print to; a new one is created when None.
    """
    from rich.console import Console
    from rich.markup import escape
    from rich.panel import Panel

    if console is None:
        console = Console()

    if not results_by_model:
        console.print("[dim]No results to compare.[/]")
        return

    inner_w = max(console.width - 4, 60)
    fixed_right = 1 + 4 + 2 + 5 + 2 + 6 + 4
    models_data: list[tuple[str, int, int, float, float]] = []
    best_model = ""
    best_score = -1.0

    for model, results in results_by_model.items():
        n = len(results)
        passed = sum(1 for r in results if r.passed)
        # A model with no results gets an average of 0.0 instead of dividing by zero.
        avg = sum(r.score for r in results) / n if n else 0.0
        total_time = sum(r.response_time_ms for r in results) / 1000
        models_data.append((model, passed, n, avg, total_time))
        if avg > best_score:
            best_score = avg
            best_model = model

    # The name column is as wide as the longest name, capped at 28 cells; the
    # bar takes the remaining width, kept between 12 and 30 cells.
    max_name_len = min(max(len(m) for m, *_ in models_data), 28)
    bar_width = max(12, inner_w - max_name_len - fixed_right - 4)
    bar_width = min(bar_width, 30)

    lines: list[str] = []
    for model, passed, n, avg, total_time in models_data:
        shown = model[: max_name_len - 1] + "…" if len(model) > max_name_len else model
        # Pad on the visible text first, then escape so a model identifier
        # containing brackets is never interpreted as Rich markup.
        name = escape(shown.ljust(max_name_len))
        bar = _score_bar(avg, bar_width)
        pass_color = _score_color(avg)
        star = " [bold green]★[/]" if model == best_model and len(models_data) > 1 else ""
        lines.append(
            f" {name} {bar} {avg:.2f} "
            f"[{pass_color}]{passed}/{n}[/] "
            f"[dim]{total_time:>5.1f}s[/]"
            f"{star}"
        )

    body = "\n".join(lines)
    panel = Panel(
        body,
        title="[bold]Model Comparison[/]",
        title_align="left",
        border_style="dim",
        padding=(1, 1),
        expand=False,
    )
    console.print(panel)
|
||||
|
||||
@@ -235,17 +235,27 @@ def _train_new_agents(agent_files: list, n_iterations: int) -> None:
|
||||
click.secho(f" Error loading agent {agent_name}: {e}", fg="red")
|
||||
continue
|
||||
|
||||
from rich.console import Console as _Console
|
||||
|
||||
_console = _Console()
|
||||
|
||||
for iteration in range(n_iterations):
|
||||
click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan")
|
||||
for case in cases:
|
||||
for ci, case in enumerate(cases):
|
||||
user_input = case.input
|
||||
click.echo(f"\n Input: {user_input}")
|
||||
snippet = user_input[:60] + ("…" if len(user_input) > 60 else "")
|
||||
_console.print(f"\n \\[{ci + 1}/{len(cases)}] {snippet}")
|
||||
|
||||
try:
|
||||
response = asyncio.run(agent.amessage(user_input))
|
||||
import time as _time
|
||||
_t0 = _time.monotonic()
|
||||
with _console.status("[cyan] Running…[/]", spinner="dots"):
|
||||
response = asyncio.run(agent.amessage(user_input))
|
||||
_elapsed = _time.monotonic() - _t0
|
||||
_console.print(f" [green]✓[/] done ({_elapsed:.1f}s)")
|
||||
click.echo(f" Response: {response.content[:500]}")
|
||||
except Exception as e:
|
||||
click.secho(f" Error: {e}", fg="red")
|
||||
_console.print(f" [red]✗[/] error: {e}")
|
||||
continue
|
||||
|
||||
if case.criteria:
|
||||
@@ -533,6 +543,70 @@ def test(
|
||||
evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file)
|
||||
|
||||
|
||||
def _make_benchmark_progress():
    """Create a progress callback with Rich spinner animation.

    Returns:
        A function accepting one progress event dict. The event's "type" key
        selects the output: "model_start" prints a model header,
        "case_start" prints the case line and starts a transient spinner,
        "judging" relabels the running spinner, "case_done" stops the
        spinner and prints the outcome, and "model_done" prints the model
        summary. Other event types are ignored.
    """
    from rich.console import Console
    from rich.live import Live
    from rich.markup import escape
    from rich.spinner import Spinner

    console = Console()
    # Holds the currently running Live spinner (or None) across callback calls.
    state: dict = {"live": None}

    def _stop_live():
        """Stop and forget the active spinner, if there is one."""
        if state["live"]:
            state["live"].stop()
            state["live"] = None

    def progress(event: dict) -> None:
        """Render a single benchmark progress event to the console."""
        t = event["type"]

        if t == "model_start":
            _stop_live()
            # Model identifiers are caller-supplied text; escape them so
            # brackets are not interpreted as Rich markup.
            label = escape(event["model"])
            if event["total_models"] > 1:
                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
            console.print(f"\n[bold cyan]▶ {label}[/] [dim]({event['total_cases']} cases)[/]")

        elif t == "case_start":
            _stop_live()
            idx = event["case_index"] + 1
            total = event["total_cases"]
            raw_input = event["input"]
            snippet = raw_input[:60] + ("…" if len(raw_input) > 60 else "")
            # The case input is arbitrary user text and must be escaped.
            console.print(f" [dim]\\[{idx}/{total}][/] {escape(snippet)}")
            state["live"] = Live(
                Spinner("dots", text=" running…", style="cyan"),
                console=console,
                transient=True,
            )
            state["live"].start()

        elif t == "judging":
            if state["live"]:
                state["live"].update(
                    Spinner("dots", text=" judging…", style="cyan")
                )

        elif t == "case_done":
            _stop_live()
            elapsed_s = event["time_ms"] / 1000
            if event.get("error"):
                # Show the error text too; otherwise the reason for the
                # failure never reaches the terminal.
                console.print(
                    f" [red]✗ ERROR[/red] ({elapsed_s:.1f}s) "
                    f"[dim]{escape(str(event['error']))}[/dim]"
                )
            elif event["passed"]:
                console.print(f" [green]✓ PASS[/green] score={event['score']:.2f} ({elapsed_s:.1f}s)")
            else:
                console.print(f" [red]✗ FAIL[/red] score={event['score']:.2f} ({elapsed_s:.1f}s)")

        elif t == "model_done":
            _stop_live()
            p, tot, avg = event["passed"], event["total"], event["avg_score"]
            color = "green" if p == tot else ("yellow" if p > 0 else "red")
            console.print(f" [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")

    return progress
|
||||
|
||||
|
||||
def _test_new_agents(
|
||||
agent_files: list,
|
||||
n_iterations: int,
|
||||
@@ -544,14 +618,16 @@ def _test_new_agents(
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
from rich.console import Console as _RichConsole
|
||||
|
||||
from crewai_cli.benchmark import (
|
||||
format_results_table,
|
||||
load_benchmark_cases,
|
||||
print_results_chart,
|
||||
run_benchmark,
|
||||
)
|
||||
|
||||
_con = _RichConsole()
|
||||
tests_dir = Path("tests")
|
||||
# Fallback for projects created before the rename
|
||||
if not tests_dir.is_dir() and Path("benchmarks").is_dir():
|
||||
tests_dir = Path("benchmarks")
|
||||
all_passed = True
|
||||
@@ -584,6 +660,7 @@ def _test_new_agents(
|
||||
cases=cases,
|
||||
models=model_list,
|
||||
judge_model=judge_model,
|
||||
on_progress=_make_benchmark_progress(),
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -594,19 +671,19 @@ def _test_new_agents(
|
||||
agents_tested += 1
|
||||
|
||||
for model_name, results in results_by_model.items():
|
||||
click.echo(format_results_table(results))
|
||||
_con.print()
|
||||
print_results_chart(results, console=_con)
|
||||
|
||||
failed = [r for r in results if r.score < threshold]
|
||||
if failed:
|
||||
all_passed = False
|
||||
click.secho(
|
||||
f" FAILED: {len(failed)}/{len(results)} cases below threshold ({threshold})",
|
||||
fg="red",
|
||||
_con.print(
|
||||
f"\n [red bold]FAILED: {len(failed)}/{len(results)} "
|
||||
f"cases below threshold ({threshold})[/]"
|
||||
)
|
||||
else:
|
||||
click.secho(
|
||||
f" PASSED: all {len(results)} cases >= {threshold}",
|
||||
fg="green",
|
||||
_con.print(
|
||||
f"\n [green bold]PASSED: all {len(results)} cases >= {threshold}[/]"
|
||||
)
|
||||
|
||||
click.echo()
|
||||
@@ -1372,13 +1449,17 @@ def benchmark(
|
||||
uv_args.extend(["-m", m])
|
||||
_relaunch_via_uv(uv_args)
|
||||
|
||||
from rich.console import Console as _RichConsole
|
||||
|
||||
from crewai_cli.benchmark import (
|
||||
format_comparison_table,
|
||||
format_results_table,
|
||||
load_benchmark_cases,
|
||||
print_comparison_chart,
|
||||
print_results_chart,
|
||||
run_benchmark,
|
||||
)
|
||||
|
||||
_con = _RichConsole()
|
||||
|
||||
try:
|
||||
cases = load_benchmark_cases(cases_path)
|
||||
except (FileNotFoundError, ValueError) as e:
|
||||
@@ -1401,20 +1482,20 @@ def benchmark(
|
||||
cases=cases,
|
||||
models=model_list,
|
||||
judge_model=judge_model,
|
||||
on_progress=_make_benchmark_progress(),
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
click.secho(f"Error running benchmark: {e}", fg="red")
|
||||
raise SystemExit(1) from e
|
||||
|
||||
# Print results for each model
|
||||
for model, results in results_by_model.items():
|
||||
click.echo(format_results_table(results))
|
||||
click.echo()
|
||||
_con.print()
|
||||
print_results_chart(results, console=_con)
|
||||
|
||||
# Print comparison if multiple models
|
||||
if len(results_by_model) > 1:
|
||||
click.echo(format_comparison_table(results_by_model))
|
||||
_con.print()
|
||||
print_comparison_chart(results_by_model, console=_con)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user