From f723e69410dc67a010461a76f90b60bb44119778 Mon Sep 17 00:00:00 2001
From: alex-clawd <alex@crewai.com>
Date: Wed, 13 May 2026 21:37:40 -0700
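Subject: [PATCH] fix: address bugbot review comments and CI failures

agent_tui.py: collapse three wrapped expressions onto single lines and
drop the `# type: ignore[arg-type]` on the `asyncio.wait_for(anext(...))`
call.

benchmark.py: import `cast` and cast the `tools` argument of the first
`judge_llm.call` in `_judge_with_llm` (with a local `BaseTool` import
for the cast target); rewrap the fallback `judge_llm.call`.

cli.py: annotate `test(n_iterations)` as `int | None`; start the
progress display inside the `try` block in `_test_new_agents`; in
`benchmark`, call `print_results_chart` when exactly one model produced
results; remaining changes are line rewrapping.
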
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 lib/cli/src/crewai_cli/agent_tui.py | 12 ++-----
 lib/cli/src/crewai_cli/benchmark.py | 12 ++++---
 lib/cli/src/crewai_cli/cli.py       | 51 +++++++++++++++++++----------
 3 files changed, 44 insertions(+), 31 deletions(-)
diff --git a/lib/cli/src/crewai_cli/agent_tui.py b/lib/cli/src/crewai_cli/agent_tui.py
index e2b234c94..e14684f29 100644
--- a/lib/cli/src/crewai_cli/agent_tui.py
+++ b/lib/cli/src/crewai_cli/agent_tui.py
@@ -304,9 +304,7 @@ class ThinkingIndicator(Static):
         for step in self._steps:
             lines.append(step)
         status_esc = _safe_render(self._current_status)
-        current = (
-            f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
-        )
+        current = f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
         if self._tokens:
             current += f"  {self._tokens}"
         lines.append(current)
@@ -1507,9 +1505,7 @@ class AgentTUI(App[None]):
             while True:
                 try:
                     timeout = 180.0 if first_chunk else 120.0
-                    chunk = await asyncio.wait_for(
-                        anext(stream), timeout=timeout  # type: ignore[arg-type]
-                    )
+                    chunk = await asyncio.wait_for(anext(stream), timeout=timeout)
                     first_chunk = False
                 except StopAsyncIteration:
                     break
@@ -1552,9 +1548,7 @@ class AgentTUI(App[None]):
                 if getattr(response, "input_tokens", 0) or getattr(
                     response, "output_tokens", 0
                 ):
-                    meta_parts.append(
-                        f"~{response.output_tokens or 0:,} tokens"
-                    )
+                    meta_parts.append(f"~{response.output_tokens or 0:,} tokens")
                 if getattr(response, "response_time_ms", 0):
                     meta_parts.append(f"{response.response_time_ms / 1000:.1f}s")
             metadata = " · ".join(meta_parts)
diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py
index 5288cd62b..e8432b1bd 100644
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -8,7 +8,7 @@ import json
 from pathlib import Path
 import re
 import time
-from typing import Any
+from typing import Any, cast
 
 from pydantic import BaseModel
 
@@ -190,9 +190,11 @@ async def _judge_with_llm(
     )
 
     try:
+        from crewai.tools import BaseTool as _BaseTool
+
         response = judge_llm.call(
             messages=[{"role": "user", "content": prompt}],
-            tools=[_JUDGE_TOOL],
+            tools=cast("list[dict[str, _BaseTool]]", [_JUDGE_TOOL]),
             available_functions={"submit_evaluation": lambda **kw: kw},
         )
         result = _parse_judge_result(response)
@@ -208,9 +210,11 @@ async def _judge_with_llm(
             f"Input: {input_text}\n\n"
             f"Response: {actual}\n\n"
             f"Evaluation criteria: {criteria}\n\n"
-            "Respond with ONLY a JSON object: {\"score\": <float>, \"passed\": <bool>}"
+            'Respond with ONLY a JSON object: {"score": <float>, "passed": <bool>}'
+        )
+        response = judge_llm.call(
+            messages=[{"role": "user", "content": fallback_prompt}]
         )
-        response = judge_llm.call(messages=[{"role": "user", "content": fallback_prompt}])
         result = _parse_judge_result(response)
         if result is not None:
             return result
diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index f638a29f1..f77694518 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -532,7 +532,7 @@ def memory(
     help="Show agent execution details (tool calls, LLM responses, errors).",
 )
 def test(
-    n_iterations: int,
+    n_iterations: int | None,
     model: str | None,
     trained_agents_file: str | None,
     threshold: float | None,
@@ -612,7 +612,9 @@ def test(
         click.echo(
             f"Testing the crew for {legacy_iterations} iterations with model {crew_model}"
         )
-        evaluate_crew(legacy_iterations, crew_model, trained_agents_file=trained_agents_file)
+        evaluate_crew(
+            legacy_iterations, crew_model, trained_agents_file=trained_agents_file
+        )
 
 
 def _read_config(*keys: str) -> Any:
@@ -802,7 +804,10 @@ class _BenchmarkLiveProgress:
         parts: list[Any] = []
         if self._n_iterations > 1:
             parts.append(
-                Text(f"  Iteration {self._current_iteration + 1}/{self._n_iterations}", style="cyan")
+                Text(
+                    f"  Iteration {self._current_iteration + 1}/{self._n_iterations}",
+                    style="cyan",
+                )
             )
 
         table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1), expand=False)
@@ -914,7 +919,11 @@ def _test_new_agents(
     model_list = [model] if model else None
 
     # Progress display — prefix model key with agent name
-    progress = None if verbose else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
+    progress = (
+        None
+        if verbose
+        else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
+    )
 
     def _make_progress_cb(agent_name: str) -> Any:
         def _cb(event: dict[str, Any]) -> None:
@@ -973,11 +982,11 @@ def _test_new_agents(
     iter_marks: list[str] = []
 
     for iteration in range(n_iterations):
-        if not verbose:
-            if progress is None:
-                raise RuntimeError("progress must not be None in non-verbose mode")
-            progress.start(iteration=iteration)
         try:
+            if not verbose:
+                if progress is None:
+                    raise RuntimeError("progress must not be None in non-verbose mode")
+                progress.start(iteration=iteration)
             with ArtifactsSandbox():
                 if verbose:
                     with VerboseBenchmarkOutput():
@@ -1029,15 +1038,17 @@ def _test_new_agents(
         avg_time = sum(r.response_time_ms for r in results) / 1000 / n_iter
         avg_cost = sum(r.cost or 0.0 for r in results) / n_iter
 
-        rows.append({
-            "label": f"{agent_name}/{model_key}",
-            "passed": passed_count == total,
-            "ratio": f"{pass_per_iter}/{cases_per_iter}",
-            "score": avg_score,
-            "time": f"{avg_time:.1f}s",
-            "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
-            "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
-        })
+        rows.append(
+            {
+                "label": f"{agent_name}/{model_key}",
+                "passed": passed_count == total,
+                "ratio": f"{pass_per_iter}/{cases_per_iter}",
+                "score": avg_score,
+                "time": f"{avg_time:.1f}s",
+                "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
+                "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
+            }
+        )
 
     w_label = max((len(r["label"]) for r in rows), default=0)
     w_ratio = max((len(r["ratio"]) for r in rows), default=0)
@@ -1907,6 +1918,7 @@ def benchmark(
     from crewai_cli.benchmark import (
         load_benchmark_cases,
         print_comparison_chart,
+        print_results_chart,
         run_benchmark,
     )
 
@@ -1980,7 +1992,10 @@ def benchmark(
             progress.stop()
         _loop.close()
 
-    if len(results_by_model) > 1:
+    if len(results_by_model) == 1:
+        _single_results = next(iter(results_by_model.values()))
+        print_results_chart(_single_results, console=_con)
+    elif len(results_by_model) > 1:
         _con.print()
         print_comparison_chart(results_by_model, console=_con)