From f723e69410dc67a010461a76f90b60bb44119778 Mon Sep 17 00:00:00 2001 From: alex-clawd Date: Wed, 13 May 2026 21:37:40 -0700 Subject: [PATCH] fix: address bugbot review comments and CI failures Co-Authored-By: Claude Sonnet 4.6 --- lib/cli/src/crewai_cli/agent_tui.py | 12 ++----- lib/cli/src/crewai_cli/benchmark.py | 12 ++++--- lib/cli/src/crewai_cli/cli.py | 51 +++++++++++++++++++---------- 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/lib/cli/src/crewai_cli/agent_tui.py b/lib/cli/src/crewai_cli/agent_tui.py index e2b234c94..e14684f29 100644 --- a/lib/cli/src/crewai_cli/agent_tui.py +++ b/lib/cli/src/crewai_cli/agent_tui.py @@ -304,9 +304,7 @@ class ThinkingIndicator(Static): for step in self._steps: lines.append(step) status_esc = _safe_render(self._current_status) - current = ( - f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}" - ) + current = f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}" if self._tokens: current += f" {self._tokens}" lines.append(current) @@ -1507,9 +1505,7 @@ class AgentTUI(App[None]): while True: try: timeout = 180.0 if first_chunk else 120.0 - chunk = await asyncio.wait_for( - anext(stream), timeout=timeout # type: ignore[arg-type] - ) + chunk = await asyncio.wait_for(anext(stream), timeout=timeout) first_chunk = False except StopAsyncIteration: break @@ -1552,9 +1548,7 @@ class AgentTUI(App[None]): if getattr(response, "input_tokens", 0) or getattr( response, "output_tokens", 0 ): - meta_parts.append( - f"~{response.output_tokens or 0:,} tokens" - ) + meta_parts.append(f"~{response.output_tokens or 0:,} tokens") if getattr(response, "response_time_ms", 0): meta_parts.append(f"{response.response_time_ms / 1000:.1f}s") metadata = " · ".join(meta_parts) diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py index 5288cd62b..e8432b1bd 100644 --- a/lib/cli/src/crewai_cli/benchmark.py +++ b/lib/cli/src/crewai_cli/benchmark.py @@ -8,7 +8,7 @@ import json from pathlib import Path import re import time -from typing import Any +from typing import Any, cast from pydantic import BaseModel @@ -190,9 +190,11 @@ async def _judge_with_llm( ) try: + from crewai.tools import BaseTool as _BaseTool + response = judge_llm.call( messages=[{"role": "user", "content": prompt}], - tools=[_JUDGE_TOOL], + tools=cast("list[dict[str, _BaseTool]]", [_JUDGE_TOOL]), available_functions={"submit_evaluation": lambda **kw: kw}, ) result = _parse_judge_result(response) @@ -208,9 +210,11 @@ async def _judge_with_llm( f"Input: {input_text}\n\n" f"Response: {actual}\n\n" f"Evaluation criteria: {criteria}\n\n" - "Respond with ONLY a JSON object: {\"score\": , \"passed\": }" + 'Respond with ONLY a JSON object: {"score": , "passed": }' + ) + response = judge_llm.call( + messages=[{"role": "user", "content": fallback_prompt}] ) - response = judge_llm.call(messages=[{"role": "user", "content": fallback_prompt}]) result = _parse_judge_result(response) if result is not None: return result diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py index f638a29f1..f77694518 100644 --- a/lib/cli/src/crewai_cli/cli.py +++ b/lib/cli/src/crewai_cli/cli.py @@ -532,7 +532,7 @@ def memory( help="Show agent execution details (tool calls, LLM responses, errors).", ) def test( - n_iterations: int, + n_iterations: int | None, model: str | None, trained_agents_file: str | None, threshold: float | None, @@ -612,7 +612,9 @@ def test( click.echo( f"Testing the crew for {legacy_iterations} iterations with model {crew_model}" ) - evaluate_crew(legacy_iterations, crew_model, trained_agents_file=trained_agents_file) + evaluate_crew( + legacy_iterations, crew_model, trained_agents_file=trained_agents_file + ) def _read_config(*keys: str) -> Any: @@ -802,7 +804,10 @@ class _BenchmarkLiveProgress: parts: list[Any] = [] if self._n_iterations > 1: parts.append( - Text(f" Iteration {self._current_iteration + 1}/{self._n_iterations}", style="cyan") + Text( + f" Iteration {self._current_iteration + 1}/{self._n_iterations}", + style="cyan", + ) ) table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1), expand=False) @@ -914,7 +919,11 @@ def _test_new_agents( model_list = [model] if model else None # Progress display — prefix model key with agent name - progress = None if verbose else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations) + progress = ( + None + if verbose + else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations) + ) def _make_progress_cb(agent_name: str) -> Any: def _cb(event: dict[str, Any]) -> None: @@ -973,11 +982,11 @@ def _test_new_agents( iter_marks: list[str] = [] for iteration in range(n_iterations): - if not verbose: - if progress is None: - raise RuntimeError("progress must not be None in non-verbose mode") - progress.start(iteration=iteration) try: + if not verbose: + if progress is None: + raise RuntimeError("progress must not be None in non-verbose mode") + progress.start(iteration=iteration) with ArtifactsSandbox(): if verbose: with VerboseBenchmarkOutput(): @@ -1029,15 +1038,17 @@ def _test_new_agents( avg_time = sum(r.response_time_ms for r in results) / 1000 / n_iter avg_cost = sum(r.cost or 0.0 for r in results) / n_iter - rows.append({ - "label": f"{agent_name}/{model_key}", - "passed": passed_count == total, - "ratio": f"{pass_per_iter}/{cases_per_iter}", - "score": avg_score, - "time": f"{avg_time:.1f}s", - "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}", - "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "", - }) + rows.append( + { + "label": f"{agent_name}/{model_key}", + "passed": passed_count == total, + "ratio": f"{pass_per_iter}/{cases_per_iter}", + "score": avg_score, + "time": f"{avg_time:.1f}s", + "tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}", + "cost": _fmt_cost(avg_cost) if avg_cost > 0 else "", + } + ) w_label = max((len(r["label"]) for r in rows), default=0) w_ratio = max((len(r["ratio"]) for r in rows), default=0) @@ -1907,6 +1918,7 @@ def benchmark( from crewai_cli.benchmark import ( load_benchmark_cases, print_comparison_chart, + print_results_chart, run_benchmark, ) @@ -1980,7 +1992,10 @@ def benchmark( progress.stop() _loop.close() - if len(results_by_model) > 1: + if len(results_by_model) == 1: + _single_results = next(iter(results_by_model.values())) + print_results_chart(_single_results, console=_con) + elif len(results_by_model) > 1: _con.print() print_comparison_chart(results_by_model, console=_con)