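fix: address bugbot review comments and CI failures

- agent_tui.py: collapse over-wrapped expressions onto one line and drop the
  `# type: ignore[arg-type]` on the `asyncio.wait_for(anext(stream), ...)` call.
- benchmark.py: import `cast` and cast the judge tool list passed to
  `judge_llm.call(tools=...)`; reformat the fallback prompt and call.
- cli.py: annotate `test(n_iterations)` as `int | None`; move the
  non-verbose `progress.start(...)` call inside the `try` block of the
  iteration loop; import `print_results_chart` and print it when the
  benchmark produced results for exactly one model; reformat long lines.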

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-02 21:58:11 +00:00 · 2026-05-13 21:37:40 -07:00
parent ef39974bd8
commit f723e69410
3 changed files with 44 additions and 31 deletions
--- a/lib/cli/src/crewai_cli/agent_tui.py
+++ b/lib/cli/src/crewai_cli/agent_tui.py
@@ -304,9 +304,7 @@ class ThinkingIndicator(Static):
        for step in self._steps:
            lines.append(step)
        status_esc = _safe_render(self._current_status)
-        current = (
-            f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
-        )
+        current = f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
        if self._tokens:
            current += f"  {self._tokens}"
        lines.append(current)
@@ -1507,9 +1505,7 @@ class AgentTUI(App[None]):
            while True:
                try:
                    timeout = 180.0 if first_chunk else 120.0
-                    chunk = await asyncio.wait_for(
-                        anext(stream), timeout=timeout  # type: ignore[arg-type]
-                    )
+                    chunk = await asyncio.wait_for(anext(stream), timeout=timeout)
                    first_chunk = False
                except StopAsyncIteration:
                    break
@@ -1552,9 +1548,7 @@ class AgentTUI(App[None]):
                if getattr(response, "input_tokens", 0) or getattr(
                    response, "output_tokens", 0
                ):
-                    meta_parts.append(
-                        f"~{response.output_tokens or 0:,} tokens"
-                    )
+                    meta_parts.append(f"~{response.output_tokens or 0:,} tokens")
                if getattr(response, "response_time_ms", 0):
                    meta_parts.append(f"{response.response_time_ms / 1000:.1f}s")
            metadata = " · ".join(meta_parts)
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -8,7 +8,7 @@ import json
 from pathlib import Path
 import re
 import time
-from typing import Any
+from typing import Any, cast

 from pydantic import BaseModel

@@ -190,9 +190,11 @@ async def _judge_with_llm(
    )

    try:
+        from crewai.tools import BaseTool as _BaseTool
+
        response = judge_llm.call(
            messages=[{"role": "user", "content": prompt}],
-            tools=[_JUDGE_TOOL],
+            tools=cast("list[dict[str, _BaseTool]]", [_JUDGE_TOOL]),
            available_functions={"submit_evaluation": lambda **kw: kw},
        )
        result = _parse_judge_result(response)
@@ -208,9 +210,11 @@ async def _judge_with_llm(
            f"Input: {input_text}\n\n"
            f"Response: {actual}\n\n"
            f"Evaluation criteria: {criteria}\n\n"
-            "Respond with ONLY a JSON object: {\"score\": <float>, \"passed\": <bool>}"
+            'Respond with ONLY a JSON object: {"score": <float>, "passed": <bool>}'
+        )
+        response = judge_llm.call(
+            messages=[{"role": "user", "content": fallback_prompt}]
        )
-        response = judge_llm.call(messages=[{"role": "user", "content": fallback_prompt}])
        result = _parse_judge_result(response)
        if result is not None:
            return result
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -532,7 +532,7 @@ def memory(
    help="Show agent execution details (tool calls, LLM responses, errors).",
 )
 def test(
-    n_iterations: int,
+    n_iterations: int | None,
    model: str | None,
    trained_agents_file: str | None,
    threshold: float | None,
@@ -612,7 +612,9 @@ def test(
        click.echo(
            f"Testing the crew for {legacy_iterations} iterations with model {crew_model}"
        )
-        evaluate_crew(legacy_iterations, crew_model, trained_agents_file=trained_agents_file)
+        evaluate_crew(
+            legacy_iterations, crew_model, trained_agents_file=trained_agents_file
+        )


 def _read_config(*keys: str) -> Any:
@@ -802,7 +804,10 @@ class _BenchmarkLiveProgress:
        parts: list[Any] = []
        if self._n_iterations > 1:
            parts.append(
-                Text(f"  Iteration {self._current_iteration + 1}/{self._n_iterations}", style="cyan")
+                Text(
+                    f"  Iteration {self._current_iteration + 1}/{self._n_iterations}",
+                    style="cyan",
+                )
            )

        table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1), expand=False)
@@ -914,7 +919,11 @@ def _test_new_agents(
    model_list = [model] if model else None

    # Progress display — prefix model key with agent name
-    progress = None if verbose else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
+    progress = (
+        None
+        if verbose
+        else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
+    )

    def _make_progress_cb(agent_name: str) -> Any:
        def _cb(event: dict[str, Any]) -> None:
@@ -973,11 +982,11 @@ def _test_new_agents(
    iter_marks: list[str] = []

    for iteration in range(n_iterations):
-        if not verbose:
-            if progress is None:
-                raise RuntimeError("progress must not be None in non-verbose mode")
-            progress.start(iteration=iteration)
        try:
+            if not verbose:
+                if progress is None:
+                    raise RuntimeError("progress must not be None in non-verbose mode")
+                progress.start(iteration=iteration)
            with ArtifactsSandbox():
                if verbose:
                    with VerboseBenchmarkOutput():
@@ -1029,15 +1038,17 @@ def _test_new_agents(
        avg_time = sum(r.response_time_ms for r in results) / 1000 / n_iter
        avg_cost = sum(r.cost or 0.0 for r in results) / n_iter

-        rows.append({
-            "label": f"{agent_name}/{model_key}",
-            "passed": passed_count == total,
-            "ratio": f"{pass_per_iter}/{cases_per_iter}",
-            "score": avg_score,
-            "time": f"{avg_time:.1f}s",
-            "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
-            "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
-        })
+        rows.append(
+            {
+                "label": f"{agent_name}/{model_key}",
+                "passed": passed_count == total,
+                "ratio": f"{pass_per_iter}/{cases_per_iter}",
+                "score": avg_score,
+                "time": f"{avg_time:.1f}s",
+                "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
+                "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
+            }
+        )

    w_label = max((len(r["label"]) for r in rows), default=0)
    w_ratio = max((len(r["ratio"]) for r in rows), default=0)
@@ -1907,6 +1918,7 @@ def benchmark(
    from crewai_cli.benchmark import (
        load_benchmark_cases,
        print_comparison_chart,
+        print_results_chart,
        run_benchmark,
    )

@@ -1980,7 +1992,10 @@ def benchmark(
            progress.stop()
        _loop.close()

-    if len(results_by_model) > 1:
+    if len(results_by_model) == 1:
+        _single_results = next(iter(results_by_model.values()))
+        print_results_chart(_single_results, console=_con)
+    elif len(results_by_model) > 1:
        _con.print()
        print_comparison_chart(results_by_model, console=_con)