From 813173c85f3c593ebea05954e0c14aeeb3d483d5 Mon Sep 17 00:00:00 2001 From: Joao Moura Date: Tue, 12 May 2026 17:56:19 -0400 Subject: [PATCH] Update benchmark --- lib/cli/src/crewai_cli/benchmark.py | 254 +++++++++++++++++++++++----- lib/cli/src/crewai_cli/cli.py | 121 ++++++++++--- 2 files changed, 315 insertions(+), 60 deletions(-) diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py index a911d394e..c4a87465f 100644 --- a/lib/cli/src/crewai_cli/benchmark.py +++ b/lib/cli/src/crewai_cli/benchmark.py @@ -7,7 +7,7 @@ import json import re import time from pathlib import Path -from typing import Any +from typing import Any, Callable from pydantic import BaseModel, Field @@ -154,6 +154,7 @@ async def run_benchmark( cases: list[BenchmarkCase], models: list[str] | None = None, judge_model: str = "openai/gpt-4o-mini", + on_progress: Callable[[dict[str, Any]], None] | None = None, ) -> dict[str, list[BenchmarkResult]]: """Run benchmark cases against an agent definition, optionally across multiple models. @@ -162,6 +163,7 @@ async def run_benchmark( cases: List of benchmark cases to run. models: Optional list of model identifiers to compare. If None, uses agent's default. judge_model: Model to use for LLM judge evaluation. + on_progress: Optional callback receiving progress dicts with a "type" key. Returns: Dict mapping model name to list of BenchmarkResult. 
@@ -171,13 +173,19 @@ async def run_benchmark( if models is None or len(models) == 0: models = [defn.get("llm", "default")] + def _emit(event: dict[str, Any]) -> None: + if on_progress: + on_progress(event) + results_by_model: dict[str, list[BenchmarkResult]] = {} - for model in models: + for mi, model in enumerate(models): model_results: list[BenchmarkResult] = [] + _emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)}) for i, case in enumerate(cases): - # Override the model and disable memory for benchmark runs + _emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input}) + bench_defn = dict(defn) if model != "default": bench_defn["llm"] = model @@ -187,17 +195,17 @@ async def run_benchmark( try: agent = _load_agent(bench_defn) except Exception as e: - model_results.append( - BenchmarkResult( - case_index=i, - input=case.input, - expected=case.expected, - actual=f"[Agent creation error: {e}]", - model=model, - passed=False, - score=0.0, - ) + result = BenchmarkResult( + case_index=i, + input=case.input, + expected=case.expected, + actual=f"[Agent creation error: {e}]", + model=model, + passed=False, + score=0.0, ) + model_results.append(result) + _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)}) continue start_ms = _current_time_ms() @@ -212,55 +220,57 @@ async def run_benchmark( except Exception as e: elapsed_ms = _current_time_ms() - start_ms - model_results.append( - BenchmarkResult( - case_index=i, - input=case.input, - expected=case.expected, - actual=f"[Error: {e}]", - model=model, - passed=False, - score=0.0, - response_time_ms=elapsed_ms, - ) + result = BenchmarkResult( + case_index=i, + input=case.input, + expected=case.expected, + actual=f"[Error: {e}]", + model=model, + passed=False, + score=0.0, + response_time_ms=elapsed_ms, ) + 
model_results.append(result) + _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)}) continue - # Evaluate passed = False score = 0.0 if case.expected is not None: passed, score = _check_expected(case.expected, actual) if case.criteria is not None: + _emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)}) criteria_passed, criteria_score = await _judge_with_llm( case.criteria, case.input, actual, judge_model ) if case.expected is not None: - # Combine: both must pass, average scores passed = passed and criteria_passed score = (score + criteria_score) / 2.0 else: passed = criteria_passed score = criteria_score - model_results.append( - BenchmarkResult( - case_index=i, - input=case.input, - expected=case.expected, - actual=actual, - model=model, - passed=passed, - score=score, - input_tokens=input_tokens, - output_tokens=output_tokens, - response_time_ms=elapsed_ms, - cost=cost, - ) + result = BenchmarkResult( + case_index=i, + input=case.input, + expected=case.expected, + actual=actual, + model=model, + passed=passed, + score=score, + input_tokens=input_tokens, + output_tokens=output_tokens, + response_time_ms=elapsed_ms, + cost=cost, ) + model_results.append(result) + _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms}) results_by_model[model] = model_results + total_passed = sum(1 for r in model_results if r.passed) + avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0 + _emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score}) return results_by_model @@ -378,3 +388,167 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]]) lines.append(f"Best model: {best_model} (avg score: {best_score:.2f})") return 
"\n".join(lines) + + +# --------------------------------------------------------------------------- +# Rich-based terminal charts +# --------------------------------------------------------------------------- + +def _score_color(score: float) -> str: + if score >= 0.7: + return "green" + if score >= 0.4: + return "yellow" + return "red" + + +def _score_bar(score: float, width: int = 20) -> str: + clamped = max(0.0, min(1.0, score)) + filled = round(clamped * width) + empty = width - filled + color = _score_color(score) + bar = f"[{color}]{'█' * filled}[/{color}]" + if empty: + bar += f"[dim]{'░' * empty}[/dim]" + return bar + + +def _fmt_tokens(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + if n >= 1_000: + return f"{n / 1_000:.1f}k" + return str(n) + + +def _fmt_cost(cost: float | None) -> str: + if cost is None: + return "" + if cost < 0.01: + return f"${cost:.4f}" + return f"${cost:.2f}" + + +def print_results_chart( + results: list[BenchmarkResult], + console: Any | None = None, +) -> None: + from rich.console import Console + from rich.panel import Panel + + if not console: + console = Console() + + if not results: + console.print("[dim]No results to display.[/]") + return + + model = results[0].model + has_cost = any(r.cost is not None for r in results) + + inner_w = max(console.width - 4, 60) + bar_w = 12 + overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6 + if has_cost: + overhead += 2 + 7 + input_w = max(15, inner_w - overhead) + + rows: list[str] = [] + for r in results: + inp = r.input[:input_w - 1] + "…" if len(r.input) >= input_w else r.input + inp_pad = inp + " " * max(0, input_w - len(inp)) + bar = _score_bar(r.score, bar_w) + badge = "[green]PASS[/green]" if r.passed else "[red]FAIL[/red]" + time_s = f"{r.response_time_ms / 1000:>5.1f}s" + cost_part = f" [dim]{_fmt_cost(r.cost):>7}[/dim]" if has_cost else "" + rows.append( + f" [dim]{r.case_index:>2}[/dim] {inp_pad} {bar} {r.score:.2f} {badge} 
[dim]{time_s}[/dim]{cost_part}" + ) + + n = len(results) + passed = sum(1 for r in results if r.passed) + avg = sum(r.score for r in results) / n + total_time = sum(r.response_time_ms for r in results) / 1000 + total_in = sum(r.input_tokens for r in results) + total_out = sum(r.output_tokens for r in results) + total_cost = sum(r.cost for r in results if r.cost is not None) + + color = _score_color(avg) + summary_parts = [ + f"[{color}]{passed}/{n} passed[/]", + f"avg [{color}]{avg:.2f}[/]", + f"[dim]{total_time:.1f}s[/]", + f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]", + ] + if total_cost > 0: + summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]") + + body = "\n".join(rows) + "\n\n " + " · ".join(summary_parts) + panel = Panel( + body, + title=f"[bold cyan]{model}[/]", + title_align="left", + border_style="cyan", + padding=(1, 0), + expand=False, + ) + console.print(panel) + + +def print_comparison_chart( + results_by_model: dict[str, list[BenchmarkResult]], + console: Any | None = None, +) -> None: + from rich.console import Console + from rich.panel import Panel + + if not console: + console = Console() + + if not results_by_model: + console.print("[dim]No results to compare.[/]") + return + + inner_w = max(console.width - 4, 60) + fixed_right = 1 + 4 + 2 + 5 + 2 + 6 + 4 + models_data: list[tuple[str, int, int, float, float]] = [] + best_model = "" + best_score = -1.0 + + for model, results in results_by_model.items(): + n = len(results) + passed = sum(1 for r in results if r.passed) + avg = sum(r.score for r in results) / n if n else 0.0 + total_time = sum(r.response_time_ms for r in results) / 1000 + models_data.append((model, passed, n, avg, total_time)) + if avg > best_score: + best_score = avg + best_model = model + + max_name_len = min(max(len(m) for m, *_ in models_data), 28) + bar_width = max(12, inner_w - max_name_len - fixed_right - 4) + bar_width = min(bar_width, 30) + + lines: list[str] = [] + for model, passed, n, avg, 
total_time in models_data: + name = (model[:max_name_len - 1] + "…" if len(model) > max_name_len else model).ljust(max_name_len) + bar = _score_bar(avg, bar_width) + pass_color = _score_color(avg) + star = " [bold green]★[/]" if model == best_model and len(models_data) > 1 else "" + lines.append( + f" {name} {bar} {avg:.2f} " + f"[{pass_color}]{passed}/{n}[/] " + f"[dim]{total_time:>5.1f}s[/]" + f"{star}" + ) + + body = "\n".join(lines) + panel = Panel( + body, + title="[bold]Model Comparison[/]", + title_align="left", + border_style="dim", + padding=(1, 1), + expand=False, + ) + console.print(panel) diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py index 7065bfe2e..771b2edfa 100644 --- a/lib/cli/src/crewai_cli/cli.py +++ b/lib/cli/src/crewai_cli/cli.py @@ -235,17 +235,27 @@ def _train_new_agents(agent_files: list, n_iterations: int) -> None: click.secho(f" Error loading agent {agent_name}: {e}", fg="red") continue + from rich.console import Console as _Console + + _console = _Console() + for iteration in range(n_iterations): click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan") - for case in cases: + for ci, case in enumerate(cases): user_input = case.input - click.echo(f"\n Input: {user_input}") + snippet = user_input[:60] + ("…" if len(user_input) > 60 else "") + _console.print(f"\n \\[{ci + 1}/{len(cases)}] {snippet}") try: - response = asyncio.run(agent.amessage(user_input)) + import time as _time + _t0 = _time.monotonic() + with _console.status("[cyan] Running…[/]", spinner="dots"): + response = asyncio.run(agent.amessage(user_input)) + _elapsed = _time.monotonic() - _t0 + _console.print(f" [green]✓[/] done ({_elapsed:.1f}s)") click.echo(f" Response: {response.content[:500]}") except Exception as e: - click.secho(f" Error: {e}", fg="red") + _console.print(f" [red]✗[/] error: {e}") continue if case.criteria: @@ -533,6 +543,70 @@ def test( evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file) 
def _make_benchmark_progress():
    """Create an ``on_progress`` callback that renders benchmark progress.

    The returned callable accepts the event dicts emitted by
    ``run_benchmark`` (keyed by ``"type"``) and prints them with Rich:

    * ``model_start`` - header line with the model name and case count.
    * ``case_start``  - case index plus an input snippet, then a transient
      "running" spinner.
    * ``judging``     - switches the active spinner text to "judging".
    * ``case_done``   - stops the spinner and prints PASS / FAIL / ERROR.
    * ``model_done``  - per-model pass count and average score.

    Unknown event types are ignored. All event-supplied text is escaped
    before being embedded in Rich markup, because benchmark inputs and
    error messages routinely contain ``[...]``.
    """
    from rich.console import Console
    from rich.live import Live
    from rich.markup import escape
    from rich.spinner import Spinner

    console = Console()
    # Holds the currently running Live spinner (or None) across callbacks.
    state: dict = {"live": None}

    def _stop_live() -> None:
        """Stop and forget the active spinner, if there is one."""
        live = state["live"]
        if live is not None:
            live.stop()
            state["live"] = None

    def _snippet(text, limit: int = 60) -> str:
        """Return *text* on one line, cut to *limit* chars, markup-escaped."""
        flat = " ".join(str(text).split())
        if len(flat) > limit:
            flat = flat[:limit] + "…"
        return escape(flat)

    def progress(event: dict) -> None:
        t = event["type"]

        if t == "model_start":
            _stop_live()
            label = escape(event["model"])
            if event["total_models"] > 1:
                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
            console.print(f"\n[bold cyan]▶ {label}[/] [dim]({event['total_cases']} cases)[/]")

        elif t == "case_start":
            _stop_live()
            idx = event["case_index"] + 1
            total = event["total_cases"]
            console.print(f"  [dim]\\[{idx}/{total}][/] {_snippet(event['input'])}")
            state["live"] = Live(
                Spinner("dots", text="    running…", style="cyan"),
                console=console,
                transient=True,
            )
            state["live"].start()

        elif t == "judging":
            if state["live"] is not None:
                state["live"].update(
                    Spinner("dots", text="    judging…", style="cyan")
                )

        elif t == "case_done":
            _stop_live()
            elapsed_s = event["time_ms"] / 1000
            error = event.get("error")
            # Test for presence, not truthiness: str(exc) is "" for an
            # exception raised without a message, and that is still an error.
            if error is not None:
                detail = _snippet(error, 80)
                suffix = f" [dim]{detail}[/dim]" if detail else ""
                console.print(f"    [red]✗ ERROR[/red] ({elapsed_s:.1f}s){suffix}")
            elif event["passed"]:
                console.print(f"    [green]✓ PASS[/green] score={event['score']:.2f} ({elapsed_s:.1f}s)")
            else:
                console.print(f"    [red]✗ FAIL[/red] score={event['score']:.2f} ({elapsed_s:.1f}s)")

        elif t == "model_done":
            _stop_live()
            p, tot, avg = event["passed"], event["total"], event["avg_score"]
            color = "green" if p == tot else ("yellow" if p > 0 else "red")
            console.print(f"  [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")

    return progress
Console as _RichConsole + from crewai_cli.benchmark import ( - format_results_table, load_benchmark_cases, + print_results_chart, run_benchmark, ) + _con = _RichConsole() tests_dir = Path("tests") - # Fallback for projects created before the rename if not tests_dir.is_dir() and Path("benchmarks").is_dir(): tests_dir = Path("benchmarks") all_passed = True @@ -584,6 +660,7 @@ def _test_new_agents( cases=cases, models=model_list, judge_model=judge_model, + on_progress=_make_benchmark_progress(), ) ) except Exception as e: @@ -594,19 +671,19 @@ def _test_new_agents( agents_tested += 1 for model_name, results in results_by_model.items(): - click.echo(format_results_table(results)) + _con.print() + print_results_chart(results, console=_con) failed = [r for r in results if r.score < threshold] if failed: all_passed = False - click.secho( - f" FAILED: {len(failed)}/{len(results)} cases below threshold ({threshold})", - fg="red", + _con.print( + f"\n [red bold]FAILED: {len(failed)}/{len(results)} " + f"cases below threshold ({threshold})[/]" ) else: - click.secho( - f" PASSED: all {len(results)} cases >= {threshold}", - fg="green", + _con.print( + f"\n [green bold]PASSED: all {len(results)} cases >= {threshold}[/]" ) click.echo() @@ -1372,13 +1449,17 @@ def benchmark( uv_args.extend(["-m", m]) _relaunch_via_uv(uv_args) + from rich.console import Console as _RichConsole + from crewai_cli.benchmark import ( - format_comparison_table, - format_results_table, load_benchmark_cases, + print_comparison_chart, + print_results_chart, run_benchmark, ) + _con = _RichConsole() + try: cases = load_benchmark_cases(cases_path) except (FileNotFoundError, ValueError) as e: @@ -1401,20 +1482,20 @@ def benchmark( cases=cases, models=model_list, judge_model=judge_model, + on_progress=_make_benchmark_progress(), ) ) except Exception as e: click.secho(f"Error running benchmark: {e}", fg="red") raise SystemExit(1) from e - # Print results for each model for model, results in 
results_by_model.items(): - click.echo(format_results_table(results)) - click.echo() + _con.print() + print_results_chart(results, console=_con) - # Print comparison if multiple models if len(results_by_model) > 1: - click.echo(format_comparison_table(results_by_model)) + _con.print() + print_comparison_chart(results_by_model, console=_con) if __name__ == "__main__":