mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-07-02 05:38:12 +00:00
fix: address bugbot review comments and CI failures
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -304,9 +304,7 @@ class ThinkingIndicator(Static):
|
||||
for step in self._steps:
|
||||
lines.append(step)
|
||||
status_esc = _safe_render(self._current_status)
|
||||
current = (
|
||||
f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
|
||||
)
|
||||
current = f"[{_CORAL}]{ch}[/] [{_DIM}]{self._agent_name}[/] {status_esc}"
|
||||
if self._tokens:
|
||||
current += f" {self._tokens}"
|
||||
lines.append(current)
|
||||
@@ -1507,9 +1505,7 @@ class AgentTUI(App[None]):
|
||||
while True:
|
||||
try:
|
||||
timeout = 180.0 if first_chunk else 120.0
|
||||
chunk = await asyncio.wait_for(
|
||||
anext(stream), timeout=timeout # type: ignore[arg-type]
|
||||
)
|
||||
chunk = await asyncio.wait_for(anext(stream), timeout=timeout)
|
||||
first_chunk = False
|
||||
except StopAsyncIteration:
|
||||
break
|
||||
@@ -1552,9 +1548,7 @@ class AgentTUI(App[None]):
|
||||
if getattr(response, "input_tokens", 0) or getattr(
|
||||
response, "output_tokens", 0
|
||||
):
|
||||
meta_parts.append(
|
||||
f"~{response.output_tokens or 0:,} tokens"
|
||||
)
|
||||
meta_parts.append(f"~{response.output_tokens or 0:,} tokens")
|
||||
if getattr(response, "response_time_ms", 0):
|
||||
meta_parts.append(f"{response.response_time_ms / 1000:.1f}s")
|
||||
metadata = " · ".join(meta_parts)
|
||||
|
||||
@@ -8,7 +8,7 @@ import json
|
||||
from pathlib import Path
|
||||
import re
|
||||
import time
|
||||
from typing import Any
|
||||
from typing import Any, cast
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
@@ -190,9 +190,11 @@ async def _judge_with_llm(
|
||||
)
|
||||
|
||||
try:
|
||||
from crewai.tools import BaseTool as _BaseTool
|
||||
|
||||
response = judge_llm.call(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
tools=[_JUDGE_TOOL],
|
||||
tools=cast("list[dict[str, _BaseTool]]", [_JUDGE_TOOL]),
|
||||
available_functions={"submit_evaluation": lambda **kw: kw},
|
||||
)
|
||||
result = _parse_judge_result(response)
|
||||
@@ -208,9 +210,11 @@ async def _judge_with_llm(
|
||||
f"Input: {input_text}\n\n"
|
||||
f"Response: {actual}\n\n"
|
||||
f"Evaluation criteria: {criteria}\n\n"
|
||||
"Respond with ONLY a JSON object: {\"score\": <float>, \"passed\": <bool>}"
|
||||
'Respond with ONLY a JSON object: {"score": <float>, "passed": <bool>}'
|
||||
)
|
||||
response = judge_llm.call(
|
||||
messages=[{"role": "user", "content": fallback_prompt}]
|
||||
)
|
||||
response = judge_llm.call(messages=[{"role": "user", "content": fallback_prompt}])
|
||||
result = _parse_judge_result(response)
|
||||
if result is not None:
|
||||
return result
|
||||
|
||||
@@ -532,7 +532,7 @@ def memory(
|
||||
help="Show agent execution details (tool calls, LLM responses, errors).",
|
||||
)
|
||||
def test(
|
||||
n_iterations: int,
|
||||
n_iterations: int | None,
|
||||
model: str | None,
|
||||
trained_agents_file: str | None,
|
||||
threshold: float | None,
|
||||
@@ -612,7 +612,9 @@ def test(
|
||||
click.echo(
|
||||
f"Testing the crew for {legacy_iterations} iterations with model {crew_model}"
|
||||
)
|
||||
evaluate_crew(legacy_iterations, crew_model, trained_agents_file=trained_agents_file)
|
||||
evaluate_crew(
|
||||
legacy_iterations, crew_model, trained_agents_file=trained_agents_file
|
||||
)
|
||||
|
||||
|
||||
def _read_config(*keys: str) -> Any:
|
||||
@@ -802,7 +804,10 @@ class _BenchmarkLiveProgress:
|
||||
parts: list[Any] = []
|
||||
if self._n_iterations > 1:
|
||||
parts.append(
|
||||
Text(f" Iteration {self._current_iteration + 1}/{self._n_iterations}", style="cyan")
|
||||
Text(
|
||||
f" Iteration {self._current_iteration + 1}/{self._n_iterations}",
|
||||
style="cyan",
|
||||
)
|
||||
)
|
||||
|
||||
table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1), expand=False)
|
||||
@@ -914,7 +919,11 @@ def _test_new_agents(
|
||||
model_list = [model] if model else None
|
||||
|
||||
# Progress display — prefix model key with agent name
|
||||
progress = None if verbose else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
|
||||
progress = (
|
||||
None
|
||||
if verbose
|
||||
else _BenchmarkLiveProgress(console=_con, n_iterations=n_iterations)
|
||||
)
|
||||
|
||||
def _make_progress_cb(agent_name: str) -> Any:
|
||||
def _cb(event: dict[str, Any]) -> None:
|
||||
@@ -973,11 +982,11 @@ def _test_new_agents(
|
||||
iter_marks: list[str] = []
|
||||
|
||||
for iteration in range(n_iterations):
|
||||
if not verbose:
|
||||
if progress is None:
|
||||
raise RuntimeError("progress must not be None in non-verbose mode")
|
||||
progress.start(iteration=iteration)
|
||||
try:
|
||||
if not verbose:
|
||||
if progress is None:
|
||||
raise RuntimeError("progress must not be None in non-verbose mode")
|
||||
progress.start(iteration=iteration)
|
||||
with ArtifactsSandbox():
|
||||
if verbose:
|
||||
with VerboseBenchmarkOutput():
|
||||
@@ -1029,15 +1038,17 @@ def _test_new_agents(
|
||||
avg_time = sum(r.response_time_ms for r in results) / 1000 / n_iter
|
||||
avg_cost = sum(r.cost or 0.0 for r in results) / n_iter
|
||||
|
||||
rows.append({
|
||||
"label": f"{agent_name}/{model_key}",
|
||||
"passed": passed_count == total,
|
||||
"ratio": f"{pass_per_iter}/{cases_per_iter}",
|
||||
"score": avg_score,
|
||||
"time": f"{avg_time:.1f}s",
|
||||
"tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
|
||||
"cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
|
||||
})
|
||||
rows.append(
|
||||
{
|
||||
"label": f"{agent_name}/{model_key}",
|
||||
"passed": passed_count == total,
|
||||
"ratio": f"{pass_per_iter}/{cases_per_iter}",
|
||||
"score": avg_score,
|
||||
"time": f"{avg_time:.1f}s",
|
||||
"tokens": f"↑{_fmt_tokens(int(sum(r.input_tokens for r in results) / n_iter))} ↓{_fmt_tokens(int(sum(r.output_tokens for r in results) / n_iter))}",
|
||||
"cost": _fmt_cost(avg_cost) if avg_cost > 0 else "",
|
||||
}
|
||||
)
|
||||
|
||||
w_label = max((len(r["label"]) for r in rows), default=0)
|
||||
w_ratio = max((len(r["ratio"]) for r in rows), default=0)
|
||||
@@ -1907,6 +1918,7 @@ def benchmark(
|
||||
from crewai_cli.benchmark import (
|
||||
load_benchmark_cases,
|
||||
print_comparison_chart,
|
||||
print_results_chart,
|
||||
run_benchmark,
|
||||
)
|
||||
|
||||
@@ -1980,7 +1992,10 @@ def benchmark(
|
||||
progress.stop()
|
||||
_loop.close()
|
||||
|
||||
if len(results_by_model) > 1:
|
||||
if len(results_by_model) == 1:
|
||||
_single_results = next(iter(results_by_model.values()))
|
||||
print_results_chart(_single_results, console=_con)
|
||||
elif len(results_by_model) > 1:
|
||||
_con.print()
|
||||
print_comparison_chart(results_by_model, console=_con)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user