fix: resolve lint, test, and review issues

- Replace S101 assert guards with explicit if/raise RuntimeError in
  benchmark.py and cli.py (3 locations)
- Fix test_create_llm_from_env_with_unaccepted_attributes to patch
  os.environ with clear=True and assert against DEFAULT_LLM_MODEL, so
  the assertion no longer depends on a hardcoded model name
- Add n_iterations loop to _test_new_agents (n_iterations was
  previously unused; the loop now mirrors the _train_new_agents
  iteration pattern)
- Consolidate dotenv loading in cli.py and agent_tui.py to use the
  existing load_env_vars() from utils.py instead of duplicating logic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
alex-clawd
2026-05-13 07:38:39 -07:00
parent 68fb64f383
commit 74bf197ccb
4 changed files with 69 additions and 71 deletions

View File

@@ -40,6 +40,7 @@ from textual.widgets import (
)
from crewai_cli.create_agent import _strip_jsonc
from crewai_cli.utils import load_env_vars
try:
@@ -1703,18 +1704,9 @@ class AgentTUI(App[None]):
def _load_dotenv(base: Path) -> None:
"""Load .env file into os.environ if it exists."""
env_path = base / ".env"
if not env_path.exists():
return
try:
for line in env_path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line or line.startswith("#"):
continue
key, _, value = line.partition("=")
key = key.strip()
value = value.strip()
if key and value and key not in os.environ:
for key, value in load_env_vars(base).items():
if key not in os.environ:
os.environ[key] = value
except Exception:
pass

View File

@@ -136,7 +136,8 @@ async def _judge_with_llm(
from crewai.utilities.llm_utils import create_llm
judge_llm = create_llm(judge_model)
assert judge_llm is not None
if judge_llm is None:
raise RuntimeError(f"Failed to create LLM from judge model: {judge_model!r}")
prompt = (
"You are an evaluation judge. Score the following response on a scale of 0.0 to 1.0.\n\n"
@@ -445,7 +446,7 @@ async def run_benchmark(
class SuppressBenchmarkOutput:
"""Context manager that silences console formatter and noisy logging during benchmarks."""
def __enter__(self) -> "SuppressBenchmarkOutput":
def __enter__(self) -> SuppressBenchmarkOutput:
import logging
self._saved_formatter = None
@@ -491,7 +492,7 @@ class SuppressBenchmarkOutput:
class VerboseBenchmarkOutput:
"""Context manager that subscribes to NewAgent events and prints them for debugging."""
def __enter__(self) -> "VerboseBenchmarkOutput":
def __enter__(self) -> VerboseBenchmarkOutput:
import logging
import sys
@@ -620,7 +621,7 @@ class ArtifactsSandbox:
self._base = Path(base)
self._prev_cwd: str | None = None
def __enter__(self) -> "ArtifactsSandbox":
def __enter__(self) -> ArtifactsSandbox:
import os
self._base.mkdir(parents=True, exist_ok=True)

View File

@@ -37,7 +37,11 @@ from crewai_cli.user_data import (
is_tracing_enabled,
update_user_data,
)
from crewai_cli.utils import build_env_with_all_tool_credentials, read_toml
from crewai_cli.utils import (
build_env_with_all_tool_credentials,
load_env_vars,
read_toml,
)
def _get_cli_version() -> str:
@@ -59,19 +63,12 @@ def crewai() -> None:
"""Top-level command group for crewai."""
from pathlib import Path
env_path = Path.cwd() / ".env"
if env_path.exists():
try:
for line in env_path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line or line.startswith("#"):
continue
key, _, value = line.partition("=")
key, value = key.strip(), value.strip()
if key and value and key not in os.environ:
os.environ[key] = value
except Exception:
pass
try:
for key, value in load_env_vars(Path.cwd()).items():
if key not in os.environ:
os.environ[key] = value
except Exception:
pass
@crewai.command(
@@ -847,7 +844,7 @@ def _test_new_agents(
case_count = sum(len(j["cases"]) for j in jobs)
click.echo()
click.secho(
f"Testing {len(jobs)} agent(s), {case_count} cases (threshold={threshold})",
f"Testing {len(jobs)} agent(s), {case_count} cases, {n_iterations} iteration(s) (threshold={threshold})",
fg="cyan",
bold=True,
)
@@ -858,51 +855,59 @@ def _test_new_agents(
VerboseBenchmarkOutput,
)
if not verbose:
assert progress is not None
progress.start()
try:
with ArtifactsSandbox():
if verbose:
with VerboseBenchmarkOutput():
all_results = asyncio.run(_run_all())
else:
with SuppressBenchmarkOutput():
all_results = asyncio.run(_run_all())
finally:
if not verbose:
assert progress is not None
progress.stop()
# Evaluate results
all_passed = True
agents_tested = 0
for job, result in zip(jobs, all_results):
if isinstance(result, Exception):
click.secho(
f" Error running tests for {job['agent_name']}: {result}", fg="red"
)
all_passed = False
continue
agents_tested += 1
for results in result.values():
failed = [r for r in results if r.score < job["threshold"]]
if failed:
for iteration in range(n_iterations):
if n_iterations > 1:
click.secho(f"\n Iteration {iteration + 1}/{n_iterations}", fg="cyan")
if not verbose:
if progress is None:
raise RuntimeError("progress must not be None in non-verbose mode")
progress.start()
try:
with ArtifactsSandbox():
if verbose:
with VerboseBenchmarkOutput():
all_results = asyncio.run(_run_all())
else:
with SuppressBenchmarkOutput():
all_results = asyncio.run(_run_all())
finally:
if not verbose:
if progress is None:
raise RuntimeError("progress must not be None in non-verbose mode")
progress.stop()
# Evaluate results for this iteration
for job, result in zip(jobs, all_results):
if isinstance(result, Exception):
click.secho(
f" Error running tests for {job['agent_name']}: {result}", fg="red"
)
all_passed = False
_con.print(
f" [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} "
f"cases below threshold ({job['threshold']})[/red bold]"
)
for r in failed:
inp = r.input[:60] + ("" if len(r.input) > 60 else "")
continue
agents_tested += 1
for results in result.values():
failed = [r for r in results if r.score < job["threshold"]]
if failed:
all_passed = False
_con.print(
f" [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim] [red]{r.score:.2f}[/red]"
f" [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} "
f"cases below threshold ({job['threshold']})[/red bold]"
)
else:
_con.print(
f" [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]"
)
for r in failed:
inp = r.input[:60] + ("" if len(r.input) > 60 else "")
_con.print(
f" [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim] [red]{r.score:.2f}[/red]"
)
else:
_con.print(
f" [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]"
)
if agents_tested == 0:
click.secho("No agents completed successfully.", fg="yellow")
raise SystemExit(1)

View File

@@ -77,16 +77,16 @@ def test_create_llm_from_env_with_unaccepted_attributes() -> None:
with patch.dict(
os.environ,
{
"OPENAI_MODEL_NAME": "gpt-3.5-turbo",
"OPENAI_API_KEY": "fake-key",
"AWS_ACCESS_KEY_ID": "fake-access-key",
"AWS_SECRET_ACCESS_KEY": "fake-secret-key",
"AWS_DEFAULT_REGION": "us-west-2",
},
clear=True,
):
llm = create_llm(llm_value=None)
assert isinstance(llm, BaseLLM)
assert llm.model == "gpt-3.5-turbo"
assert llm.model == DEFAULT_LLM_MODEL
assert not hasattr(llm, "AWS_ACCESS_KEY_ID")
assert not hasattr(llm, "AWS_SECRET_ACCESS_KEY")
assert not hasattr(llm, "AWS_DEFAULT_REGION")