From 813173c85f3c593ebea05954e0c14aeeb3d483d5 Mon Sep 17 00:00:00 2001
From: Joao Moura <joaomdmoura@gmail.com>
Date: Tue, 12 May 2026 17:56:19 -0400
Subject: [PATCH] Add progress events and Rich charts to benchmark output

---
 lib/cli/src/crewai_cli/benchmark.py | 254 +++++++++++++++++++++++-----
 lib/cli/src/crewai_cli/cli.py       | 121 ++++++++++---
 2 files changed, 315 insertions(+), 60 deletions(-)

diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py
index a911d394e..c4a87465f 100644
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -7,7 +7,7 @@ import json
 import re
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable
 
 from pydantic import BaseModel, Field
 
@@ -154,6 +154,7 @@ async def run_benchmark(
     cases: list[BenchmarkCase],
     models: list[str] | None = None,
     judge_model: str = "openai/gpt-4o-mini",
+    on_progress: Callable[[dict[str, Any]], None] | None = None,
 ) -> dict[str, list[BenchmarkResult]]:
     """Run benchmark cases against an agent definition, optionally across multiple models.
 
@@ -162,6 +163,7 @@ async def run_benchmark(
         cases: List of benchmark cases to run.
         models: Optional list of model identifiers to compare. If None, uses agent's default.
         judge_model: Model to use for LLM judge evaluation.
+        on_progress: Optional callback receiving progress dicts with a "type" key.
 
     Returns:
         Dict mapping model name to list of BenchmarkResult.
@@ -171,13 +173,19 @@ async def run_benchmark(
     if models is None or len(models) == 0:
         models = [defn.get("llm", "default")]
 
+    def _emit(event: dict[str, Any]) -> None:
+        if on_progress:
+            on_progress(event)
+
     results_by_model: dict[str, list[BenchmarkResult]] = {}
 
-    for model in models:
+    for mi, model in enumerate(models):
         model_results: list[BenchmarkResult] = []
+        _emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)})
 
         for i, case in enumerate(cases):
-            # Override the model and disable memory for benchmark runs
+            _emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input})
+
             bench_defn = dict(defn)
             if model != "default":
                 bench_defn["llm"] = model
@@ -187,17 +195,17 @@ async def run_benchmark(
             try:
                 agent = _load_agent(bench_defn)
             except Exception as e:
-                model_results.append(
-                    BenchmarkResult(
-                        case_index=i,
-                        input=case.input,
-                        expected=case.expected,
-                        actual=f"[Agent creation error: {e}]",
-                        model=model,
-                        passed=False,
-                        score=0.0,
-                    )
+                result = BenchmarkResult(
+                    case_index=i,
+                    input=case.input,
+                    expected=case.expected,
+                    actual=f"[Agent creation error: {e}]",
+                    model=model,
+                    passed=False,
+                    score=0.0,
                 )
+                model_results.append(result)
+                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
                 continue
 
             start_ms = _current_time_ms()
@@ -212,55 +220,57 @@ async def run_benchmark(
 
             except Exception as e:
                 elapsed_ms = _current_time_ms() - start_ms
-                model_results.append(
-                    BenchmarkResult(
-                        case_index=i,
-                        input=case.input,
-                        expected=case.expected,
-                        actual=f"[Error: {e}]",
-                        model=model,
-                        passed=False,
-                        score=0.0,
-                        response_time_ms=elapsed_ms,
-                    )
+                result = BenchmarkResult(
+                    case_index=i,
+                    input=case.input,
+                    expected=case.expected,
+                    actual=f"[Error: {e}]",
+                    model=model,
+                    passed=False,
+                    score=0.0,
+                    response_time_ms=elapsed_ms,
                 )
+                model_results.append(result)
+                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
                 continue
 
-            # Evaluate
             passed = False
             score = 0.0
 
             if case.expected is not None:
                 passed, score = _check_expected(case.expected, actual)
             if case.criteria is not None:
+                _emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)})
                 criteria_passed, criteria_score = await _judge_with_llm(
                     case.criteria, case.input, actual, judge_model
                 )
                 if case.expected is not None:
-                    # Combine: both must pass, average scores
                     passed = passed and criteria_passed
                     score = (score + criteria_score) / 2.0
                 else:
                     passed = criteria_passed
                     score = criteria_score
 
-            model_results.append(
-                BenchmarkResult(
-                    case_index=i,
-                    input=case.input,
-                    expected=case.expected,
-                    actual=actual,
-                    model=model,
-                    passed=passed,
-                    score=score,
-                    input_tokens=input_tokens,
-                    output_tokens=output_tokens,
-                    response_time_ms=elapsed_ms,
-                    cost=cost,
-                )
+            result = BenchmarkResult(
+                case_index=i,
+                input=case.input,
+                expected=case.expected,
+                actual=actual,
+                model=model,
+                passed=passed,
+                score=score,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                response_time_ms=elapsed_ms,
+                cost=cost,
             )
+            model_results.append(result)
+            _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms})
 
         results_by_model[model] = model_results
+        total_passed = sum(1 for r in model_results if r.passed)
+        avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
+        _emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score})
 
     return results_by_model
 
@@ -378,3 +388,167 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]])
         lines.append(f"Best model: {best_model} (avg score: {best_score:.2f})")
 
     return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Rich-based terminal charts
+# ---------------------------------------------------------------------------
+
+def _score_color(score: float) -> str:
+    """Map a score to a Rich color name: green >= 0.7, yellow >= 0.4, else red."""
+    for floor, name in ((0.7, "green"), (0.4, "yellow")):
+        if score >= floor:
+            return name
+    return "red"
+
+
+def _score_bar(score: float, width: int = 20) -> str:
+    """Render score as a Rich-markup bar of `width` cells; the fill is clamped to 0..1."""
+    tint = _score_color(score)  # color follows the raw, unclamped score
+    solid = round(max(0.0, min(1.0, score)) * width)
+    hollow = width - solid
+    segments = [f"[{tint}]{'█' * solid}[/{tint}]"]
+    if hollow:
+        segments.append(f"[dim]{'░' * hollow}[/dim]")
+    return "".join(segments)
+
+
+def _fmt_tokens(n: int) -> str:
+    """Abbreviate a token count, e.g. 950 -> '950', 1500 -> '1.5k', 2_000_000 -> '2.0M'."""
+    # Counts that would round up to '1000.0k' are shown in the M tier instead.
+    if n >= 999_950:
+        return f"{n / 1_000_000:.1f}M"
+    return f"{n / 1_000:.1f}k" if n >= 1_000 else str(n)
+
+
+def _fmt_cost(cost: float | None) -> str:
+    """Format a dollar cost; sub-cent amounts get 4 decimals, None becomes ''."""
+    if cost is None:
+        return ""
+    decimals = 4 if cost < 0.01 else 2
+    return f"${cost:.{decimals}f}"
+
+
+def print_results_chart(results: list[BenchmarkResult], console: Any | None = None) -> None:
+    """Print one model's per-case results and a totals line in a Rich panel.
+
+    Args:
+        results: Results for one model; the panel title uses results[0].model.
+        console: Rich console to print to; a new Console is created when None.
+    """
+    from rich.console import Console
+    from rich.markup import escape
+    from rich.panel import Panel
+
+    if console is None:
+        console = Console()
+    if not results:
+        console.print("[dim]No results to display.[/]")
+        return
+
+    has_cost = any(r.cost is not None for r in results)
+    inner_w = max(console.width - 4, 60)
+    bar_w = 12
+    overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6
+    if has_cost:
+        overhead += 2 + 7
+    input_w = max(15, inner_w - overhead)
+
+    rows: list[str] = []
+    for r in results:
+        # Flatten whitespace so a multi-line input cannot break the row layout.
+        text = " ".join(r.input.split())
+        if len(text) > input_w:
+            text = text[: input_w - 1] + "…"
+        # Pad the visible text, then escape it so brackets in user input are
+        # shown literally instead of being parsed as Rich markup.
+        inp_pad = escape(text.ljust(input_w))
+        bar = _score_bar(r.score, bar_w)
+        badge = "[green]PASS[/green]" if r.passed else "[red]FAIL[/red]"
+        time_s = f"{r.response_time_ms / 1000:>5.1f}s"
+        cost_part = f"  [dim]{_fmt_cost(r.cost):>7}[/dim]" if has_cost else ""
+        rows.append(
+            f"  [dim]{r.case_index:>2}[/dim]  {inp_pad}  {bar} {r.score:.2f}  {badge}  [dim]{time_s}[/dim]{cost_part}"
+        )
+
+    n = len(results)
+    passed = sum(1 for r in results if r.passed)
+    avg = sum(r.score for r in results) / n
+    total_time = sum(r.response_time_ms for r in results) / 1000
+    total_in = sum(r.input_tokens for r in results)
+    total_out = sum(r.output_tokens for r in results)
+    total_cost = sum(r.cost for r in results if r.cost is not None)
+
+    color = _score_color(avg)
+    summary_parts = [
+        f"[{color}]{passed}/{n} passed[/]",
+        f"avg [{color}]{avg:.2f}[/]",
+        f"[dim]{total_time:.1f}s[/]",
+        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]",
+    ]
+    if total_cost > 0:
+        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]")
+
+    body = "\n".join(rows) + "\n\n  " + "  ·  ".join(summary_parts)
+    title = f"[bold cyan]{escape(str(results[0].model))}[/]"
+    panel = Panel(body, title=title, title_align="left", border_style="cyan", padding=(1, 0), expand=False)
+    console.print(panel)
+
+
+def print_comparison_chart(
+    results_by_model: dict[str, list[BenchmarkResult]],
+    console: Any | None = None,
+) -> None:
+    """Print a per-model comparison panel; with 2+ models the best average is starred.
+
+    Args:
+        results_by_model: Mapping of model name to that model's results.
+        console: Rich console to print to; a new Console is created when None.
+    """
+    from rich.console import Console
+    from rich.markup import escape
+    from rich.panel import Panel
+
+    if console is None:
+        console = Console()
+    if not results_by_model:
+        console.print("[dim]No results to compare.[/]")
+        return
+
+    inner_w = max(console.width - 4, 60)
+    fixed_right = 1 + 4 + 2 + 5 + 2 + 6 + 4
+    models_data: list[tuple[str, int, int, float, float]] = []
+    best_model = ""
+    best_score = -1.0
+
+    for model, results in results_by_model.items():
+        n = len(results)
+        passed = sum(1 for r in results if r.passed)
+        avg = sum(r.score for r in results) / n if n else 0.0
+        total_time = sum(r.response_time_ms for r in results) / 1000
+        models_data.append((model, passed, n, avg, total_time))
+        if avg > best_score:  # strict ">" keeps the earliest model on ties
+            best_score = avg
+            best_model = model
+
+    max_name_len = min(max(len(m) for m, *_ in models_data), 28)
+    bar_width = max(12, inner_w - max_name_len - fixed_right - 4)
+    bar_width = min(bar_width, 30)
+
+    lines: list[str] = []
+    for model, passed, n, avg, total_time in models_data:
+        shown = model[: max_name_len - 1] + "…" if len(model) > max_name_len else model
+        name = escape(shown.ljust(max_name_len))  # pad first; escape so brackets are not markup
+        bar = _score_bar(avg, bar_width)
+        pass_color = _score_color(avg)
+        star = " [bold green]★[/]" if model == best_model and len(models_data) > 1 else ""
+        lines.append(
+            f"  {name}  {bar} {avg:.2f}  "
+            f"[{pass_color}]{passed}/{n}[/]  "
+            f"[dim]{total_time:>5.1f}s[/]"
+            f"{star}"
+        )
+
+    body = "\n".join(lines)
+    panel = Panel(body, title="[bold]Model Comparison[/]", title_align="left", border_style="dim", padding=(1, 1), expand=False)
+    console.print(panel)
diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index 7065bfe2e..771b2edfa 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -235,17 +235,27 @@ def _train_new_agents(agent_files: list, n_iterations: int) -> None:
             click.secho(f"  Error loading agent {agent_name}: {e}", fg="red")
             continue
 
+        from rich.console import Console as _Console
+
+        _console = _Console()
+
         for iteration in range(n_iterations):
             click.secho(f"\n  Iteration {iteration + 1}/{n_iterations}", fg="cyan")
-            for case in cases:
+            for ci, case in enumerate(cases):
                 user_input = case.input
-                click.echo(f"\n  Input: {user_input}")
+                snippet = user_input[:60] + ("…" if len(user_input) > 60 else "")
+                _console.print(f"\n  \\[{ci + 1}/{len(cases)}] {snippet}")
 
                 try:
-                    response = asyncio.run(agent.amessage(user_input))
+                    import time as _time
+                    _t0 = _time.monotonic()
+                    with _console.status("[cyan]  Running…[/]", spinner="dots"):
+                        response = asyncio.run(agent.amessage(user_input))
+                    _elapsed = _time.monotonic() - _t0
+                    _console.print(f"  [green]✓[/] done ({_elapsed:.1f}s)")
                     click.echo(f"  Response: {response.content[:500]}")
                 except Exception as e:
-                    click.secho(f"  Error: {e}", fg="red")
+                    _console.print(f"  [red]✗[/] error: {e}")
                     continue
 
                 if case.criteria:
@@ -533,6 +543,70 @@ def test(
         evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file)
 
 
+def _make_benchmark_progress():
+    """Create a progress callback that renders run_benchmark events with Rich.
+
+    The returned callable takes one event dict and dispatches on event["type"]:
+    "model_start", "case_start", "judging", "case_done" or "model_done".
+    User-supplied text (model names, inputs, errors) is escaped before printing.
+    """
+    from rich.console import Console
+    from rich.live import Live
+    from rich.markup import escape
+    from rich.spinner import Spinner
+
+    console = Console()
+    state: dict = {"live": None}
+
+    def _stop_live():
+        if state["live"]:
+            state["live"].stop()
+            state["live"] = None
+
+    def progress(event: dict) -> None:
+        t = event["type"]
+
+        if t == "model_start":
+            _stop_live()
+            label = escape(str(event["model"]))
+            if event["total_models"] > 1:
+                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
+            console.print(f"\n[bold cyan]▶ {label}[/]  [dim]({event['total_cases']} cases)[/]")
+
+        elif t == "case_start":
+            _stop_live()
+            idx = event["case_index"] + 1
+            total = event["total_cases"]
+            snippet = event["input"][:60] + ("…" if len(event["input"]) > 60 else "")
+            # Escape after truncating so brackets in the input print literally.
+            console.print(f"  [dim]\\[{idx}/{total}][/] {escape(snippet)}")
+            spinner = Spinner("dots", text="  running…", style="cyan")
+            state["live"] = Live(spinner, console=console, transient=True)
+            state["live"].start()
+
+        elif t == "judging":
+            if state["live"]:
+                state["live"].update(Spinner("dots", text="  judging…", style="cyan"))
+
+        elif t == "case_done":
+            _stop_live()
+            elapsed_s = event["time_ms"] / 1000
+            if event.get("error") is not None:  # an empty message still counts as an error
+                console.print(f"    [red]✗ ERROR[/red]  ({elapsed_s:.1f}s)  [dim]{escape(str(event['error']))}[/dim]")
+            elif event["passed"]:
+                console.print(f"    [green]✓ PASS[/green]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
+            else:
+                console.print(f"    [red]✗ FAIL[/red]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
+
+        elif t == "model_done":
+            _stop_live()
+            p, tot, avg = event["passed"], event["total"], event["avg_score"]
+            color = "green" if p == tot else ("yellow" if p > 0 else "red")
+            console.print(f"  [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")
+
+    return progress
+
+
 def _test_new_agents(
     agent_files: list,
     n_iterations: int,
@@ -544,14 +618,16 @@ def _test_new_agents(
     import asyncio
     from pathlib import Path
 
+    from rich.console import Console as _RichConsole
+
     from crewai_cli.benchmark import (
-        format_results_table,
         load_benchmark_cases,
+        print_results_chart,
         run_benchmark,
     )
 
+    _con = _RichConsole()
     tests_dir = Path("tests")
-    # Fallback for projects created before the rename
     if not tests_dir.is_dir() and Path("benchmarks").is_dir():
         tests_dir = Path("benchmarks")
     all_passed = True
@@ -584,6 +660,7 @@ def _test_new_agents(
                     cases=cases,
                     models=model_list,
                     judge_model=judge_model,
+                    on_progress=_make_benchmark_progress(),
                 )
             )
         except Exception as e:
@@ -594,19 +671,19 @@ def _test_new_agents(
         agents_tested += 1
 
         for model_name, results in results_by_model.items():
-            click.echo(format_results_table(results))
+            _con.print()
+            print_results_chart(results, console=_con)
 
             failed = [r for r in results if r.score < threshold]
             if failed:
                 all_passed = False
-                click.secho(
-                    f"  FAILED: {len(failed)}/{len(results)} cases below threshold ({threshold})",
-                    fg="red",
+                _con.print(
+                    f"\n  [red bold]FAILED: {len(failed)}/{len(results)} "
+                    f"cases below threshold ({threshold})[/]"
                 )
             else:
-                click.secho(
-                    f"  PASSED: all {len(results)} cases >= {threshold}",
-                    fg="green",
+                _con.print(
+                    f"\n  [green bold]PASSED: all {len(results)} cases >= {threshold}[/]"
                 )
 
     click.echo()
@@ -1372,13 +1449,17 @@ def benchmark(
             uv_args.extend(["-m", m])
         _relaunch_via_uv(uv_args)
 
+    from rich.console import Console as _RichConsole
+
     from crewai_cli.benchmark import (
-        format_comparison_table,
-        format_results_table,
         load_benchmark_cases,
+        print_comparison_chart,
+        print_results_chart,
         run_benchmark,
     )
 
+    _con = _RichConsole()
+
     try:
         cases = load_benchmark_cases(cases_path)
     except (FileNotFoundError, ValueError) as e:
@@ -1401,20 +1482,20 @@ def benchmark(
                 cases=cases,
                 models=model_list,
                 judge_model=judge_model,
+                on_progress=_make_benchmark_progress(),
             )
         )
     except Exception as e:
         click.secho(f"Error running benchmark: {e}", fg="red")
         raise SystemExit(1) from e
 
-    # Print results for each model
     for model, results in results_by_model.items():
-        click.echo(format_results_table(results))
-        click.echo()
+        _con.print()
+        print_results_chart(results, console=_con)
 
-    # Print comparison if multiple models
     if len(results_by_model) > 1:
-        click.echo(format_comparison_table(results_by_model))
+        _con.print()
+        print_comparison_chart(results_by_model, console=_con)
 
 
 if __name__ == "__main__":