Update benchmark

2026-07-05 23:19:22 +00:00 · 2026-05-12 17:56:19 -04:00
parent 4c33de86a9
commit 813173c85f
2 changed files with 315 additions and 60 deletions
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -7,7 +7,7 @@ import json
 import re
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable

 from pydantic import BaseModel, Field

@@ -154,6 +154,7 @@ async def run_benchmark(
    cases: list[BenchmarkCase],
    models: list[str] | None = None,
    judge_model: str = "openai/gpt-4o-mini",
+    on_progress: Callable[[dict[str, Any]], None] | None = None,
 ) -> dict[str, list[BenchmarkResult]]:
    """Run benchmark cases against an agent definition, optionally across multiple models.

@@ -162,6 +163,7 @@ async def run_benchmark(
        cases: List of benchmark cases to run.
        models: Optional list of model identifiers to compare. If None, uses agent's default.
        judge_model: Model to use for LLM judge evaluation.
+        on_progress: Optional callback receiving progress dicts with a "type" key.

    Returns:
        Dict mapping model name to list of BenchmarkResult.
@@ -171,13 +173,19 @@ async def run_benchmark(
    if models is None or len(models) == 0:
        models = [defn.get("llm", "default")]

+    def _emit(event: dict[str, Any]) -> None:
+        if on_progress:
+            on_progress(event)
+
    results_by_model: dict[str, list[BenchmarkResult]] = {}

-    for model in models:
+    for mi, model in enumerate(models):
        model_results: list[BenchmarkResult] = []
+        _emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)})

        for i, case in enumerate(cases):
-            # Override the model and disable memory for benchmark runs
+            _emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input})
+
            bench_defn = dict(defn)
            if model != "default":
                bench_defn["llm"] = model
@@ -187,17 +195,17 @@ async def run_benchmark(
            try:
                agent = _load_agent(bench_defn)
            except Exception as e:
-                model_results.append(
-                    BenchmarkResult(
-                        case_index=i,
-                        input=case.input,
-                        expected=case.expected,
-                        actual=f"[Agent creation error: {e}]",
-                        model=model,
-                        passed=False,
-                        score=0.0,
-                    )
+                result = BenchmarkResult(
+                    case_index=i,
+                    input=case.input,
+                    expected=case.expected,
+                    actual=f"[Agent creation error: {e}]",
+                    model=model,
+                    passed=False,
+                    score=0.0,
                )
+                model_results.append(result)
+                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
                continue

            start_ms = _current_time_ms()
@@ -212,55 +220,57 @@ async def run_benchmark(

            except Exception as e:
                elapsed_ms = _current_time_ms() - start_ms
-                model_results.append(
-                    BenchmarkResult(
-                        case_index=i,
-                        input=case.input,
-                        expected=case.expected,
-                        actual=f"[Error: {e}]",
-                        model=model,
-                        passed=False,
-                        score=0.0,
-                        response_time_ms=elapsed_ms,
-                    )
+                result = BenchmarkResult(
+                    case_index=i,
+                    input=case.input,
+                    expected=case.expected,
+                    actual=f"[Error: {e}]",
+                    model=model,
+                    passed=False,
+                    score=0.0,
+                    response_time_ms=elapsed_ms,
                )
+                model_results.append(result)
+                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
                continue

-            # Evaluate
            passed = False
            score = 0.0

            if case.expected is not None:
                passed, score = _check_expected(case.expected, actual)
            if case.criteria is not None:
+                _emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)})
                criteria_passed, criteria_score = await _judge_with_llm(
                    case.criteria, case.input, actual, judge_model
                )
                if case.expected is not None:
-                    # Combine: both must pass, average scores
                    passed = passed and criteria_passed
                    score = (score + criteria_score) / 2.0
                else:
                    passed = criteria_passed
                    score = criteria_score

-            model_results.append(
-                BenchmarkResult(
-                    case_index=i,
-                    input=case.input,
-                    expected=case.expected,
-                    actual=actual,
-                    model=model,
-                    passed=passed,
-                    score=score,
-                    input_tokens=input_tokens,
-                    output_tokens=output_tokens,
-                    response_time_ms=elapsed_ms,
-                    cost=cost,
-                )
+            result = BenchmarkResult(
+                case_index=i,
+                input=case.input,
+                expected=case.expected,
+                actual=actual,
+                model=model,
+                passed=passed,
+                score=score,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                response_time_ms=elapsed_ms,
+                cost=cost,
            )
+            model_results.append(result)
+            _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms})

        results_by_model[model] = model_results
+        total_passed = sum(1 for r in model_results if r.passed)
+        avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
+        _emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score})

    return results_by_model

@@ -378,3 +388,167 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]])
        lines.append(f"Best model: {best_model} (avg score: {best_score:.2f})")

    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Rich-based terminal charts
+# ---------------------------------------------------------------------------
+
+def _score_color(score: float) -> str:
+    """Pick the Rich color for a score: green >= 0.7, yellow >= 0.4, else red."""
+    for floor, color in ((0.7, "green"), (0.4, "yellow")):
+        if score >= floor:
+            return color
+    return "red"
+
+
+def _score_bar(score: float, width: int = 20) -> str:
+    """Build a Rich-markup bar of `width` cells, filled in proportion to `score`.
+
+    The fill ratio is clamped to [0, 1]; the color is chosen from the raw score.
+    """
+    hue = _score_color(score)
+    n_full = round(max(0.0, min(1.0, score)) * width)
+    rest = f"[dim]{'░' * (width - n_full)}[/dim]" if width != n_full else ""
+    return f"[{hue}]{'█' * n_full}[/{hue}]" + rest
+
+
+def _fmt_tokens(n: int) -> str:
+    """Abbreviate a token count ("950", "1.5k", "2.0M"); never emits "1000.0k"."""
+    thousands = f"{n / 1_000:.1f}"
+    if n >= 1_000_000 or thousands == "1000.0":  # rounding reached the M range
+        return f"{n / 1_000_000:.1f}M"
+    return f"{thousands}k" if n >= 1_000 else str(n)
+
+
+def _fmt_cost(cost: float | None) -> str:
+    """Render a dollar cost: empty when None, 4 decimals below one cent, else 2."""
+    if cost is None:
+        return ""
+    decimals = 4 if cost < 0.01 else 2
+    return f"${cost:.{decimals}f}"
+
+
+def print_results_chart(
+    results: list[BenchmarkResult],
+    console: Any | None = None,
+) -> None:
+    """Print one model's per-case results as a Rich panel ending in a summary line.
+
+    Args:
+        results: Results of a single model run; results[0].model titles the panel.
+        console: Rich console to print to; a new Console is created when None.
+    """
+    from rich.console import Console
+    from rich.markup import escape
+    from rich.panel import Panel
+
+    console = Console() if console is None else console
+    if not results:
+        console.print("[dim]No results to display.[/]")
+        return
+
+    has_cost = any(r.cost is not None for r in results)
+    inner_w = max(console.width - 4, 60)
+    bar_w = 12
+    overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6 + (2 + 7 if has_cost else 0)
+    input_w = max(15, inner_w - overhead)
+    title = f"[bold cyan]{escape(results[0].model)}[/]"
+
+    rows: list[str] = []
+    for r in results:
+        # Collapse whitespace so a multi-line input cannot break the one-row layout.
+        text = " ".join(r.input.split())
+        if len(text) > input_w:  # text of exactly input_w chars fits untruncated
+            text = text[: input_w - 1] + "…"
+        # Pad on visible length first, then escape "[...]" so Rich prints it literally.
+        inp_pad = escape(text.ljust(input_w))
+        bar = _score_bar(r.score, bar_w)
+        badge = "[green]PASS[/green]" if r.passed else "[red]FAIL[/red]"
+        time_s = f"{r.response_time_ms / 1000:>5.1f}s"
+        cost_part = f"  [dim]{_fmt_cost(r.cost):>7}[/dim]" if has_cost else ""
+        rows.append(
+            f"  [dim]{r.case_index:>2}[/dim]  {inp_pad}  {bar} {r.score:.2f}  {badge}  [dim]{time_s}[/dim]{cost_part}"
+        )
+
+    n = len(results)
+    passed = sum(1 for r in results if r.passed)
+    avg = sum(r.score for r in results) / n
+    total_time = sum(r.response_time_ms for r in results) / 1000
+    total_in = sum(r.input_tokens for r in results)
+    total_out = sum(r.output_tokens for r in results)
+    total_cost = sum(r.cost for r in results if r.cost is not None)
+
+    color = _score_color(avg)
+    summary_parts = [
+        f"[{color}]{passed}/{n} passed[/]",
+        f"avg [{color}]{avg:.2f}[/]",
+        f"[dim]{total_time:.1f}s[/]",
+        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]",
+    ]
+    if total_cost > 0:
+        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]")
+
+    body = "\n".join(rows) + "\n\n  " + "  ·  ".join(summary_parts)
+    console.print(
+        Panel(body, title=title, title_align="left", border_style="cyan", padding=(1, 0), expand=False)
+    )
+
+
+def print_comparison_chart(
+    results_by_model: dict[str, list[BenchmarkResult]],
+    console: Any | None = None,
+) -> None:
+    """Print a Rich panel with one score bar per model for side-by-side comparison.
+
+    Each row shows the model name (truncated to 28 chars), its average score,
+    pass count and total time; with several models the top average gets a star.
+    """
+    from rich.console import Console
+    from rich.panel import Panel
+
+    console = console or Console()
+    if not results_by_model:
+        console.print("[dim]No results to compare.[/]")
+        return
+
+    stats: list[tuple[str, int, int, float, float]] = []
+    for label, runs in results_by_model.items():
+        count = len(runs)
+        ok = len([r for r in runs if r.passed])
+        mean = sum(r.score for r in runs) / count if count else 0.0
+        seconds = sum(r.response_time_ms for r in runs) / 1000
+        stats.append((label, ok, count, mean, seconds))
+
+    # First model with the strictly highest average wins; "" if none beats -1.0.
+    winner, top = "", -1.0
+    for label, _, _, mean, _ in stats:
+        if mean > top:
+            winner, top = label, mean
+
+    name_w = min(28, max(len(entry[0]) for entry in stats))
+    usable_w = max(console.width - 4, 60)
+    right_w = 1 + 4 + 2 + 5 + 2 + 6 + 4
+    bar_w = min(30, max(12, usable_w - name_w - right_w - 4))
+
+    rows: list[str] = []
+    for label, ok, count, mean, seconds in stats:
+        shown = label if len(label) <= name_w else label[: name_w - 1] + "…"
+        tint = _score_color(mean)
+        star = " [bold green]★[/]" if len(stats) > 1 and label == winner else ""
+        rows.append(
+            f"  {shown.ljust(name_w)}  {_score_bar(mean, bar_w)} {mean:.2f}  "
+            f"[{tint}]{ok}/{count}[/]  "
+            f"[dim]{seconds:>5.1f}s[/]"
+            f"{star}"
+        )
+
+    chart = Panel(
+        "\n".join(rows),
+        title="[bold]Model Comparison[/]",
+        title_align="left",
+        border_style="dim",
+        padding=(1, 1),
+        expand=False,
+    )
+    console.print(chart)
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -235,17 +235,27 @@ def _train_new_agents(agent_files: list, n_iterations: int) -> None:
            click.secho(f"  Error loading agent {agent_name}: {e}", fg="red")
            continue

+        from rich.console import Console as _Console
+
+        _console = _Console()
+
        for iteration in range(n_iterations):
            click.secho(f"\n  Iteration {iteration + 1}/{n_iterations}", fg="cyan")
-            for case in cases:
+            for ci, case in enumerate(cases):
                user_input = case.input
-                click.echo(f"\n  Input: {user_input}")
+                snippet = user_input[:60] + ("…" if len(user_input) > 60 else "")
+                _console.print(f"\n  \\[{ci + 1}/{len(cases)}] {snippet}")

                try:
-                    response = asyncio.run(agent.amessage(user_input))
+                    import time as _time
+                    _t0 = _time.monotonic()
+                    with _console.status("[cyan]  Running…[/]", spinner="dots"):
+                        response = asyncio.run(agent.amessage(user_input))
+                    _elapsed = _time.monotonic() - _t0
+                    _console.print(f"  [green]✓[/] done ({_elapsed:.1f}s)")
                    click.echo(f"  Response: {response.content[:500]}")
                except Exception as e:
-                    click.secho(f"  Error: {e}", fg="red")
+                    _console.print(f"  [red]✗[/] error: {e}")
                    continue

                if case.criteria:
@@ -533,6 +543,70 @@ def test(
        evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file)


+def _make_benchmark_progress():
+    """Create an `on_progress` callback for run_benchmark that renders Rich output.
+
+    The callback prints one line per event and shows a transient spinner while a
+    case is running or being judged. Event dicts are dispatched on event["type"].
+    """
+    from rich.console import Console
+    from rich.live import Live
+    from rich.markup import escape
+    from rich.spinner import Spinner
+
+    console = Console()
+    state: dict = {"live": None}
+
+    def _stop_live():
+        if state["live"]:
+            state["live"].stop()
+            state["live"] = None
+
+    def progress(event: dict) -> None:
+        t = event["type"]
+
+        if t == "model_start":
+            _stop_live()
+            label = escape(event["model"])
+            if event["total_models"] > 1:
+                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
+            console.print(f"\n[bold cyan]▶ {label}[/]  [dim]({event['total_cases']} cases)[/]")
+
+        elif t == "case_start":
+            _stop_live()
+            idx = event["case_index"] + 1
+            total = event["total_cases"]
+            snippet = event["input"][:60] + ("…" if len(event["input"]) > 60 else "")
+            # Escape: benchmark inputs are arbitrary text and may contain "[...]" markup.
+            console.print(f"  [dim]\\[{idx}/{total}][/] {escape(snippet)}")
+            state["live"] = Live(
+                Spinner("dots", text="  running…", style="cyan"), console=console, transient=True
+            )
+            state["live"].start()
+
+        elif t == "judging":
+            if state["live"]:
+                state["live"].update(Spinner("dots", text="  judging…", style="cyan"))
+
+        elif t == "case_done":
+            _stop_live()
+            elapsed_s = event["time_ms"] / 1000
+            if event.get("error"):
+                console.print(f"    [red]✗ ERROR[/red]  ({elapsed_s:.1f}s)  [dim]{escape(str(event['error']))}[/dim]")
+            elif event["passed"]:
+                console.print(f"    [green]✓ PASS[/green]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
+            else:
+                console.print(f"    [red]✗ FAIL[/red]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
+
+        elif t == "model_done":
+            _stop_live()
+            p, tot, avg = event["passed"], event["total"], event["avg_score"]
+            color = "green" if p == tot else ("yellow" if p > 0 else "red")
+            console.print(f"  [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")
+
+    return progress
+
+
 def _test_new_agents(
    agent_files: list,
    n_iterations: int,
@@ -544,14 +618,16 @@ def _test_new_agents(
    import asyncio
    from pathlib import Path

+    from rich.console import Console as _RichConsole
+
    from crewai_cli.benchmark import (
-        format_results_table,
        load_benchmark_cases,
+        print_results_chart,
        run_benchmark,
    )

+    _con = _RichConsole()
    tests_dir = Path("tests")
-    # Fallback for projects created before the rename
    if not tests_dir.is_dir() and Path("benchmarks").is_dir():
        tests_dir = Path("benchmarks")
    all_passed = True
@@ -584,6 +660,7 @@ def _test_new_agents(
                    cases=cases,
                    models=model_list,
                    judge_model=judge_model,
+                    on_progress=_make_benchmark_progress(),
                )
            )
        except Exception as e:
@@ -594,19 +671,19 @@ def _test_new_agents(
        agents_tested += 1

        for model_name, results in results_by_model.items():
-            click.echo(format_results_table(results))
+            _con.print()
+            print_results_chart(results, console=_con)

            failed = [r for r in results if r.score < threshold]
            if failed:
                all_passed = False
-                click.secho(
-                    f"  FAILED: {len(failed)}/{len(results)} cases below threshold ({threshold})",
-                    fg="red",
+                _con.print(
+                    f"\n  [red bold]FAILED: {len(failed)}/{len(results)} "
+                    f"cases below threshold ({threshold})[/]"
                )
            else:
-                click.secho(
-                    f"  PASSED: all {len(results)} cases >= {threshold}",
-                    fg="green",
+                _con.print(
+                    f"\n  [green bold]PASSED: all {len(results)} cases >= {threshold}[/]"
                )

    click.echo()
@@ -1372,13 +1449,17 @@ def benchmark(
            uv_args.extend(["-m", m])
        _relaunch_via_uv(uv_args)

+    from rich.console import Console as _RichConsole
+
    from crewai_cli.benchmark import (
-        format_comparison_table,
-        format_results_table,
        load_benchmark_cases,
+        print_comparison_chart,
+        print_results_chart,
        run_benchmark,
    )

+    _con = _RichConsole()
+
    try:
        cases = load_benchmark_cases(cases_path)
    except (FileNotFoundError, ValueError) as e:
@@ -1401,20 +1482,20 @@ def benchmark(
                cases=cases,
                models=model_list,
                judge_model=judge_model,
+                on_progress=_make_benchmark_progress(),
            )
        )
    except Exception as e:
        click.secho(f"Error running benchmark: {e}", fg="red")
        raise SystemExit(1) from e

-    # Print results for each model
    for model, results in results_by_model.items():
-        click.echo(format_results_table(results))
-        click.echo()
+        _con.print()
+        print_results_chart(results, console=_con)

-    # Print comparison if multiple models
    if len(results_by_model) > 1:
-        click.echo(format_comparison_table(results_by_model))
+        _con.print()
+        print_comparison_chart(results_by_model, console=_con)


 if __name__ == "__main__":