diff --git a/lib/cli/src/crewai_cli/agent_tui.py b/lib/cli/src/crewai_cli/agent_tui.py
index d8e4a31b2..6923d616c 100644
--- a/lib/cli/src/crewai_cli/agent_tui.py
+++ b/lib/cli/src/crewai_cli/agent_tui.py
@@ -40,6 +40,7 @@ from textual.widgets import (
 )
 
 from crewai_cli.create_agent import _strip_jsonc
+from crewai_cli.utils import load_env_vars
 
 
 try:
@@ -1703,18 +1704,9 @@ class AgentTUI(App[None]):
 
 def _load_dotenv(base: Path) -> None:
     """Load .env file into os.environ if it exists."""
-    env_path = base / ".env"
-    if not env_path.exists():
-        return
     try:
-        for line in env_path.read_text(encoding="utf-8").splitlines():
-            line = line.strip()
-            if not line or line.startswith("#"):
-                continue
-            key, _, value = line.partition("=")
-            key = key.strip()
-            value = value.strip()
-            if key and value and key not in os.environ:
+        for key, value in load_env_vars(base).items():
+            if key not in os.environ:
                 os.environ[key] = value
     except Exception:
         pass
diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py
index 9d0ca9127..c16cd733b 100644
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -136,7 +136,8 @@ async def _judge_with_llm(
     from crewai.utilities.llm_utils import create_llm
 
     judge_llm = create_llm(judge_model)
-    assert judge_llm is not None
+    if judge_llm is None:
+        raise RuntimeError(f"Failed to create LLM from judge model: {judge_model!r}")
 
     prompt = (
         "You are an evaluation judge. Score the following response on a scale of 0.0 to 1.0.\n\n"
@@ -445,7 +446,7 @@ async def run_benchmark(
 class SuppressBenchmarkOutput:
     """Context manager that silences console formatter and noisy logging during benchmarks."""
 
-    def __enter__(self) -> "SuppressBenchmarkOutput":
+    def __enter__(self) -> SuppressBenchmarkOutput:
         import logging
 
         self._saved_formatter = None
@@ -491,7 +492,7 @@ class SuppressBenchmarkOutput:
 class VerboseBenchmarkOutput:
     """Context manager that subscribes to NewAgent events and prints them for debugging."""
 
-    def __enter__(self) -> "VerboseBenchmarkOutput":
+    def __enter__(self) -> VerboseBenchmarkOutput:
         import logging
         import sys
 
@@ -620,7 +621,7 @@ class ArtifactsSandbox:
         self._base = Path(base)
         self._prev_cwd: str | None = None
 
-    def __enter__(self) -> "ArtifactsSandbox":
+    def __enter__(self) -> ArtifactsSandbox:
         import os
 
         self._base.mkdir(parents=True, exist_ok=True)
diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index b494dae8d..47cd3b448 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -37,7 +37,11 @@ from crewai_cli.user_data import (
     is_tracing_enabled,
     update_user_data,
 )
-from crewai_cli.utils import build_env_with_all_tool_credentials, read_toml
+from crewai_cli.utils import (
+    build_env_with_all_tool_credentials,
+    load_env_vars,
+    read_toml,
+)
 
 
 def _get_cli_version() -> str:
@@ -59,19 +63,12 @@ def crewai() -> None:
     """Top-level command group for crewai."""
     from pathlib import Path
 
-    env_path = Path.cwd() / ".env"
-    if env_path.exists():
-        try:
-            for line in env_path.read_text(encoding="utf-8").splitlines():
-                line = line.strip()
-                if not line or line.startswith("#"):
-                    continue
-                key, _, value = line.partition("=")
-                key, value = key.strip(), value.strip()
-                if key and value and key not in os.environ:
-                    os.environ[key] = value
-        except Exception:
-            pass
+    try:
+        for key, value in load_env_vars(Path.cwd()).items():
+            if key not in os.environ:
+                os.environ[key] = value
+    except Exception:
+        pass
 
 
 @crewai.command(
@@ -847,7 +844,7 @@ def _test_new_agents(
     case_count = sum(len(j["cases"]) for j in jobs)
     click.echo()
     click.secho(
-        f"Testing {len(jobs)} agent(s), {case_count} cases (threshold={threshold})",
+        f"Testing {len(jobs)} agent(s), {case_count} cases, {n_iterations} iteration(s) (threshold={threshold})",
         fg="cyan",
         bold=True,
     )
@@ -858,51 +855,59 @@ def _test_new_agents(
         VerboseBenchmarkOutput,
     )
 
-    if not verbose:
-        assert progress is not None
-        progress.start()
-    try:
-        with ArtifactsSandbox():
-            if verbose:
-                with VerboseBenchmarkOutput():
-                    all_results = asyncio.run(_run_all())
-            else:
-                with SuppressBenchmarkOutput():
-                    all_results = asyncio.run(_run_all())
-    finally:
-        if not verbose:
-            assert progress is not None
-            progress.stop()
-
-    # Evaluate results
     all_passed = True
     agents_tested = 0
-    for job, result in zip(jobs, all_results):
-        if isinstance(result, Exception):
-            click.secho(
-                f"  Error running tests for {job['agent_name']}: {result}", fg="red"
-            )
-            all_passed = False
-            continue
 
-        agents_tested += 1
-        for results in result.values():
-            failed = [r for r in results if r.score < job["threshold"]]
-            if failed:
+    for iteration in range(n_iterations):
+        if n_iterations > 1:
+            click.secho(f"\n  Iteration {iteration + 1}/{n_iterations}", fg="cyan")
+
+        if not verbose:
+            if progress is None:
+                raise RuntimeError("progress must not be None in non-verbose mode")
+            progress.start()
+        try:
+            with ArtifactsSandbox():
+                if verbose:
+                    with VerboseBenchmarkOutput():
+                        all_results = asyncio.run(_run_all())
+                else:
+                    with SuppressBenchmarkOutput():
+                        all_results = asyncio.run(_run_all())
+        finally:
+            if not verbose and progress is not None:
+                # None was already rejected before start(); do not raise from
+                # `finally`, which would replace any in-flight exception.
+                progress.stop()
+
+        # Evaluate results for this iteration
+        for job, result in zip(jobs, all_results):
+            if isinstance(result, Exception):
+                click.secho(
+                    f"  Error running tests for {job['agent_name']}: {result}", fg="red"
+                )
                 all_passed = False
-                _con.print(
-                    f"  [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} "
-                    f"cases below threshold ({job['threshold']})[/red bold]"
-                )
-                for r in failed:
-                    inp = r.input[:60] + ("…" if len(r.input) > 60 else "")
+                continue
+
+            agents_tested += 1
+            for results in result.values():
+                failed = [r for r in results if r.score < job["threshold"]]
+                if failed:
+                    all_passed = False
                     _con.print(
-                        f"    [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim]  [red]{r.score:.2f}[/red]"
+                        f"  [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} "
+                        f"cases below threshold ({job['threshold']})[/red bold]"
                     )
-            else:
-                _con.print(
-                    f"  [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]"
-                )
+                    for r in failed:
+                        inp = r.input[:60] + ("…" if len(r.input) > 60 else "")
+                        _con.print(
+                            f"    [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim]  [red]{r.score:.2f}[/red]"
+                        )
+                else:
+                    _con.print(
+                        f"  [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]"
+                    )
+
     if agents_tested == 0:
         click.secho("No agents completed successfully.", fg="yellow")
         raise SystemExit(1)
diff --git a/lib/crewai/tests/utilities/test_llm_utils.py b/lib/crewai/tests/utilities/test_llm_utils.py
index 5b4aaeef9..dc83cfb82 100644
--- a/lib/crewai/tests/utilities/test_llm_utils.py
+++ b/lib/crewai/tests/utilities/test_llm_utils.py
@@ -77,16 +77,16 @@ def test_create_llm_from_env_with_unaccepted_attributes() -> None:
     with patch.dict(
         os.environ,
         {
-            "OPENAI_MODEL_NAME": "gpt-3.5-turbo",
             "OPENAI_API_KEY": "fake-key",
             "AWS_ACCESS_KEY_ID": "fake-access-key",
             "AWS_SECRET_ACCESS_KEY": "fake-secret-key",
             "AWS_DEFAULT_REGION": "us-west-2",
         },
+        clear=True,
     ):
         llm = create_llm(llm_value=None)
         assert isinstance(llm, BaseLLM)
-        assert llm.model == "gpt-3.5-turbo"
+        assert llm.model == DEFAULT_LLM_MODEL
         assert not hasattr(llm, "AWS_ACCESS_KEY_ID")
         assert not hasattr(llm, "AWS_SECRET_ACCESS_KEY")
         assert not hasattr(llm, "AWS_DEFAULT_REGION")