From 2d82896d71f6da0b4007fe352aa1f162a2132cd2 Mon Sep 17 00:00:00 2001
From: Joao Moura <joaomdmoura@gmail.com>
Date: Thu, 14 May 2026 16:48:17 -0400
Subject: [PATCH] =?UTF-8?q?fix:=20address=20PR=20review=20comments=20?=
 =?UTF-8?q?=E2=80=94=20lint,=20threshold,=20dedup,=20agents=5Fdir?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove redundant local `import asyncio` in executor.py that caused
  ruff F823 (local variable referenced before assignment)
- Clear progress state before creating the Live display so its initial
  render does not show stale state (fixes flash)
- Use the threshold-based `passed` value in _save_run_results so
  persisted results match CLI output
- Pass agents_dir to load_agent_from_definition in _train_new_agents
  so coworker references resolve correctly
- Deduplicate verbose/non-verbose benchmark execution blocks into a
  single context-manager expression

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/cli/src/crewai_cli/cli.py               | 61 ++++++++-------------
 lib/crewai/src/crewai/new_agent/executor.py |  2 -
 2 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index f77694518..7ea58b6c2 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -235,7 +235,9 @@ def _train_new_agents(agent_files: list[Any], n_iterations: int) -> None:
         try:
             from crewai.new_agent.definition_parser import load_agent_from_definition
 
-            agent = load_agent_from_definition(str(agent_path))
+            agent = load_agent_from_definition(
+                str(agent_path), agents_dir=str(agent_path.parent)
+            )
         except Exception as e:
             click.secho(f"  Error loading agent {agent_name}: {e}", fg="red")
             continue
@@ -681,12 +683,15 @@ def _save_run_results(
 
         cases: list[dict[str, Any]] = []
         for r in result_list:
+            effective_passed = (
+                r.score >= threshold if threshold is not None else r.passed
+            )
             case: dict[str, Any] = {
                 "case": r.case_index + 1,
                 "input": r.input,
                 "output": r.actual,
                 "score": r.score,
-                "passed": r.passed,
+                "passed": effective_passed,
                 "time_ms": r.response_time_ms,
                 "input_tokens": r.input_tokens,
                 "output_tokens": r.output_tokens,
@@ -730,13 +735,13 @@ class _BenchmarkLiveProgress:
         from rich.live import Live
 
         self._current_iteration = iteration
+        self._state.clear()
         self._live = Live(
             self._render(),
             console=self._console,
             refresh_per_second=10,
             transient=True,
         )
-        self._state.clear()
         self._live.start()
 
     def stop(self) -> None:
@@ -987,13 +992,9 @@ def _test_new_agents(
                 if progress is None:
                     raise RuntimeError("progress must not be None in non-verbose mode")
                 progress.start(iteration=iteration)
-            with ArtifactsSandbox():
-                if verbose:
-                    with VerboseBenchmarkOutput():
-                        all_results = _loop.run_until_complete(_run_all())
-                else:
-                    with SuppressBenchmarkOutput():
-                        all_results = _loop.run_until_complete(_run_all())
+            output_ctx = VerboseBenchmarkOutput() if verbose else SuppressBenchmarkOutput()
+            with ArtifactsSandbox(), output_ctx:
+                all_results = _loop.run_until_complete(_run_all())
         finally:
             if not verbose:
                 if progress is None:
@@ -1957,33 +1958,19 @@ def benchmark(
     try:
         if progress:
             progress.start()
-        with ArtifactsSandbox():
-            if verbose:
-                with VerboseBenchmarkOutput():
-                    results_by_model = _loop.run_until_complete(
-                        run_benchmark(
-                            agent_def=agent_path,
-                            cases=cases,
-                            models=model_list,
-                            judge_model=judge_model,
-                            on_progress=progress.on_progress if progress else None,
-                            verbose=verbose,
-                            case_timeout=effective_timeout,
-                        )
-                    )
-            else:
-                with SuppressBenchmarkOutput():
-                    results_by_model = _loop.run_until_complete(
-                        run_benchmark(
-                            agent_def=agent_path,
-                            cases=cases,
-                            models=model_list,
-                            judge_model=judge_model,
-                            on_progress=progress.on_progress if progress else None,
-                            verbose=verbose,
-                            case_timeout=effective_timeout,
-                        )
-                    )
+        output_ctx = VerboseBenchmarkOutput() if verbose else SuppressBenchmarkOutput()
+        with ArtifactsSandbox(), output_ctx:
+            results_by_model = _loop.run_until_complete(
+                run_benchmark(
+                    agent_def=agent_path,
+                    cases=cases,
+                    models=model_list,
+                    judge_model=judge_model,
+                    on_progress=progress.on_progress if progress else None,
+                    verbose=verbose,
+                    case_timeout=effective_timeout,
+                )
+            )
     except Exception as e:
         click.secho(f"Error running benchmark: {e}", fg="red")
         raise SystemExit(1) from e
diff --git a/lib/crewai/src/crewai/new_agent/executor.py b/lib/crewai/src/crewai/new_agent/executor.py
index 70f3fe972..640d32e1b 100644
--- a/lib/crewai/src/crewai/new_agent/executor.py
+++ b/lib/crewai/src/crewai/new_agent/executor.py
@@ -2109,8 +2109,6 @@ class ConversationalAgentExecutor(BaseModel):
                             if self.conversation_history
                             else "",
                         )
-                        import asyncio
-
                         loop = asyncio.get_event_loop()
                         if loop.is_running():
                             asyncio.ensure_future(self.provider.send_message(hint_msg))