fix: address PR review comments — lint, threshold, dedup, agents_dir

- Remove redundant local `import asyncio` in executor.py that caused ruff F823 (local variable referenced before assignment) - Clear progress state before creating Live display (fixes flash) - Use threshold-based passed in _save_run_results so persisted results match CLI output - Pass agents_dir to load_agent_from_definition in _train_new_agents so coworker references resolve correctly - Deduplicate verbose/non-verbose benchmark execution blocks into single context-manager expression Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-07-01 13:18:10 +00:00 · 2026-05-14 16:48:17 -04:00
parent 9abf9146f3
commit 2d82896d71
2 changed files with 24 additions and 39 deletions
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -235,7 +235,9 @@ def _train_new_agents(agent_files: list[Any], n_iterations: int) -> None:
        try:
            from crewai.new_agent.definition_parser import load_agent_from_definition

-            agent = load_agent_from_definition(str(agent_path))
+            agent = load_agent_from_definition(
+                str(agent_path), agents_dir=str(agent_path.parent)
+            )
        except Exception as e:
            click.secho(f"  Error loading agent {agent_name}: {e}", fg="red")
            continue
@@ -681,12 +683,15 @@ def _save_run_results(

        cases: list[dict[str, Any]] = []
        for r in result_list:
+            effective_passed = (
+                r.score >= threshold if threshold is not None else r.passed
+            )
            case: dict[str, Any] = {
                "case": r.case_index + 1,
                "input": r.input,
                "output": r.actual,
                "score": r.score,
-                "passed": r.passed,
+                "passed": effective_passed,
                "time_ms": r.response_time_ms,
                "input_tokens": r.input_tokens,
                "output_tokens": r.output_tokens,
@@ -730,13 +735,13 @@ class _BenchmarkLiveProgress:
        from rich.live import Live

        self._current_iteration = iteration
+        self._state.clear()
        self._live = Live(
            self._render(),
            console=self._console,
            refresh_per_second=10,
            transient=True,
        )
-        self._state.clear()
        self._live.start()

    def stop(self) -> None:
@@ -987,13 +992,9 @@ def _test_new_agents(
                if progress is None:
                    raise RuntimeError("progress must not be None in non-verbose mode")
                progress.start(iteration=iteration)
-            with ArtifactsSandbox():
-                if verbose:
-                    with VerboseBenchmarkOutput():
-                        all_results = _loop.run_until_complete(_run_all())
-                else:
-                    with SuppressBenchmarkOutput():
-                        all_results = _loop.run_until_complete(_run_all())
+            output_ctx = VerboseBenchmarkOutput() if verbose else SuppressBenchmarkOutput()
+            with ArtifactsSandbox(), output_ctx:
+                all_results = _loop.run_until_complete(_run_all())
        finally:
            if not verbose:
                if progress is None:
@@ -1957,33 +1958,19 @@ def benchmark(
    try:
        if progress:
            progress.start()
-        with ArtifactsSandbox():
-            if verbose:
-                with VerboseBenchmarkOutput():
-                    results_by_model = _loop.run_until_complete(
-                        run_benchmark(
-                            agent_def=agent_path,
-                            cases=cases,
-                            models=model_list,
-                            judge_model=judge_model,
-                            on_progress=progress.on_progress if progress else None,
-                            verbose=verbose,
-                            case_timeout=effective_timeout,
-                        )
-                    )
-            else:
-                with SuppressBenchmarkOutput():
-                    results_by_model = _loop.run_until_complete(
-                        run_benchmark(
-                            agent_def=agent_path,
-                            cases=cases,
-                            models=model_list,
-                            judge_model=judge_model,
-                            on_progress=progress.on_progress if progress else None,
-                            verbose=verbose,
-                            case_timeout=effective_timeout,
-                        )
-                    )
+        output_ctx = VerboseBenchmarkOutput() if verbose else SuppressBenchmarkOutput()
+        with ArtifactsSandbox(), output_ctx:
+            results_by_model = _loop.run_until_complete(
+                run_benchmark(
+                    agent_def=agent_path,
+                    cases=cases,
+                    models=model_list,
+                    judge_model=judge_model,
+                    on_progress=progress.on_progress if progress else None,
+                    verbose=verbose,
+                    case_timeout=effective_timeout,
+                )
+            )
    except Exception as e:
        click.secho(f"Error running benchmark: {e}", fg="red")
        raise SystemExit(1) from e