diff --git a/lib/cli/src/crewai_cli/agent_tui.py b/lib/cli/src/crewai_cli/agent_tui.py index d8e4a31b2..6923d616c 100644 --- a/lib/cli/src/crewai_cli/agent_tui.py +++ b/lib/cli/src/crewai_cli/agent_tui.py @@ -40,6 +40,7 @@ from textual.widgets import ( ) from crewai_cli.create_agent import _strip_jsonc +from crewai_cli.utils import load_env_vars try: @@ -1703,18 +1704,9 @@ class AgentTUI(App[None]): def _load_dotenv(base: Path) -> None: """Load .env file into os.environ if it exists.""" - env_path = base / ".env" - if not env_path.exists(): - return try: - for line in env_path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if not line or line.startswith("#"): - continue - key, _, value = line.partition("=") - key = key.strip() - value = value.strip() - if key and value and key not in os.environ: + for key, value in load_env_vars(base).items(): + if key not in os.environ: os.environ[key] = value except Exception: pass diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py index 9d0ca9127..c16cd733b 100644 --- a/lib/cli/src/crewai_cli/benchmark.py +++ b/lib/cli/src/crewai_cli/benchmark.py @@ -136,7 +136,8 @@ async def _judge_with_llm( from crewai.utilities.llm_utils import create_llm judge_llm = create_llm(judge_model) - assert judge_llm is not None + if judge_llm is None: + raise RuntimeError(f"Failed to create LLM from judge model: {judge_model!r}") prompt = ( "You are an evaluation judge. Score the following response on a scale of 0.0 to 1.0.\n\n" @@ -445,7 +446,7 @@ async def run_benchmark( class SuppressBenchmarkOutput: """Context manager that silences console formatter and noisy logging during benchmarks.""" - def __enter__(self) -> "SuppressBenchmarkOutput": + def __enter__(self) -> SuppressBenchmarkOutput: import logging self._saved_formatter = None @@ -491,7 +492,7 @@ class SuppressBenchmarkOutput: class VerboseBenchmarkOutput: """Context manager that subscribes to NewAgent events and prints them for debugging.""" - def __enter__(self) -> "VerboseBenchmarkOutput": + def __enter__(self) -> VerboseBenchmarkOutput: import logging import sys @@ -620,7 +621,7 @@ class ArtifactsSandbox: self._base = Path(base) self._prev_cwd: str | None = None - def __enter__(self) -> "ArtifactsSandbox": + def __enter__(self) -> ArtifactsSandbox: import os self._base.mkdir(parents=True, exist_ok=True) diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py index b494dae8d..47cd3b448 100644 --- a/lib/cli/src/crewai_cli/cli.py +++ b/lib/cli/src/crewai_cli/cli.py @@ -37,7 +37,11 @@ from crewai_cli.user_data import ( is_tracing_enabled, update_user_data, ) -from crewai_cli.utils import build_env_with_all_tool_credentials, read_toml +from crewai_cli.utils import ( + build_env_with_all_tool_credentials, + load_env_vars, + read_toml, +) def _get_cli_version() -> str: @@ -59,19 +63,12 @@ def crewai() -> None: """Top-level command group for crewai.""" from pathlib import Path - env_path = Path.cwd() / ".env" - if env_path.exists(): - try: - for line in env_path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if not line or line.startswith("#"): - continue - key, _, value = line.partition("=") - key, value = key.strip(), value.strip() - if key and value and key not in os.environ: - os.environ[key] = value - except Exception: - pass + try: + for key, value in load_env_vars(Path.cwd()).items(): + if key not in os.environ: + os.environ[key] = value + except Exception: + pass @crewai.command( @@ -847,7 +844,7 @@ def _test_new_agents( case_count = sum(len(j["cases"]) for j in jobs) click.echo() click.secho( - f"Testing {len(jobs)} agent(s), {case_count} cases (threshold={threshold})", + f"Testing {len(jobs)} agent(s), {case_count} cases, {n_iterations} iteration(s) (threshold={threshold})", fg="cyan", bold=True, ) @@ -858,51 +855,59 @@ def _test_new_agents( VerboseBenchmarkOutput, ) - if not verbose: - assert progress is not None - progress.start() - try: - with ArtifactsSandbox(): - if verbose: - with VerboseBenchmarkOutput(): - all_results = asyncio.run(_run_all()) - else: - with SuppressBenchmarkOutput(): - all_results = asyncio.run(_run_all()) - finally: - if not verbose: - assert progress is not None - progress.stop() - - # Evaluate results all_passed = True agents_tested = 0 - for job, result in zip(jobs, all_results): - if isinstance(result, Exception): - click.secho( - f" Error running tests for {job['agent_name']}: {result}", fg="red" - ) - all_passed = False - continue - agents_tested += 1 - for results in result.values(): - failed = [r for r in results if r.score < job["threshold"]] - if failed: + for iteration in range(n_iterations): + if n_iterations > 1: + click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan") + + if not verbose: + if progress is None: + raise RuntimeError("progress must not be None in non-verbose mode") + progress.start() + try: + with ArtifactsSandbox(): + if verbose: + with VerboseBenchmarkOutput(): + all_results = asyncio.run(_run_all()) + else: + with SuppressBenchmarkOutput(): + all_results = asyncio.run(_run_all()) + finally: + if not verbose: + if progress is None: + raise RuntimeError("progress must not be None in non-verbose mode") + progress.stop() + + # Evaluate results for this iteration + for job, result in zip(jobs, all_results): + if isinstance(result, Exception): + click.secho( + f" Error running tests for {job['agent_name']}: {result}", fg="red" + ) all_passed = False - _con.print( - f" [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} " - f"cases below threshold ({job['threshold']})[/red bold]" - ) - for r in failed: - inp = r.input[:60] + ("…" if len(r.input) > 60 else "") + continue + + agents_tested += 1 + for results in result.values(): + failed = [r for r in results if r.score < job["threshold"]] + if failed: + all_passed = False _con.print( - f" [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim] [red]{r.score:.2f}[/red]" + f" [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} " + f"cases below threshold ({job['threshold']})[/red bold]" ) - else: - _con.print( - f" [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]" - ) + for r in failed: + inp = r.input[:60] + ("…" if len(r.input) > 60 else "") + _con.print( + f" [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim] [red]{r.score:.2f}[/red]" + ) + else: + _con.print( + f" [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]" + ) + if agents_tested == 0: click.secho("No agents completed successfully.", fg="yellow") raise SystemExit(1) diff --git a/lib/crewai/tests/utilities/test_llm_utils.py b/lib/crewai/tests/utilities/test_llm_utils.py index 5b4aaeef9..dc83cfb82 100644 --- a/lib/crewai/tests/utilities/test_llm_utils.py +++ b/lib/crewai/tests/utilities/test_llm_utils.py @@ -77,16 +77,16 @@ def test_create_llm_from_env_with_unaccepted_attributes() -> None: with patch.dict( os.environ, { - "OPENAI_MODEL_NAME": "gpt-3.5-turbo", "OPENAI_API_KEY": "fake-key", "AWS_ACCESS_KEY_ID": "fake-access-key", "AWS_SECRET_ACCESS_KEY": "fake-secret-key", "AWS_DEFAULT_REGION": "us-west-2", }, + clear=True, ): llm = create_llm(llm_value=None) assert isinstance(llm, BaseLLM) - assert llm.model == "gpt-3.5-turbo" + assert llm.model == DEFAULT_LLM_MODEL assert not hasattr(llm, "AWS_ACCESS_KEY_ID") assert not hasattr(llm, "AWS_SECRET_ACCESS_KEY") assert not hasattr(llm, "AWS_DEFAULT_REGION")