diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py
index 0128d9a49..c8ee1e62c 100644
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -192,6 +192,7 @@ async def _run_model_benchmark(
     judge_model: str,
     emit: Callable[[dict[str, Any]], None],
     agents_dir: Path | None = None,
+    verbose: bool = False,
 ) -> list[BenchmarkResult]:
     """Run all benchmark cases for a single model, parallelising up to _MAX_CASES_PARALLEL."""
     total = len(cases)
@@ -209,12 +210,8 @@ async def _run_model_benchmark(
                 bench_defn["llm"] = model
             bench_defn["settings"]["memory"] = False
             bench_defn["settings"]["self_improving"] = False
-            bench_defn["settings"]["planning"] = False
-            bench_defn["verbose"] = False
-            bench_defn["max_iter"] = min(bench_defn.get("max_iter", 25), 5)
-            bench_defn["max_execution_time"] = min(bench_defn.get("max_execution_time", 60), 60)
+            bench_defn["verbose"] = verbose
             bench_defn.pop("coworkers", None)
-            bench_defn.pop("tools", None)
 
             try:
                 agent = _load_agent(bench_defn, agents_dir=agents_dir)
@@ -304,6 +301,7 @@ async def run_benchmark(
     models: list[str] | None = None,
     judge_model: str = "openai/gpt-4o-mini",
     on_progress: Callable[[dict[str, Any]], None] | None = None,
+    verbose: bool = False,
 ) -> dict[str, list[BenchmarkResult]]:
     """Run benchmark cases against an agent definition across models in parallel.
 
@@ -313,6 +311,7 @@ async def run_benchmark(
         models: Optional list of model identifiers to compare. If None, uses agent's default.
         judge_model: Model to use for LLM judge evaluation.
         on_progress: Optional callback receiving progress dicts with a "type" key.
+        verbose: When True, enable agent verbose output for debugging.
 
     Returns:
         Dict mapping model name to list of BenchmarkResult.
@@ -333,7 +332,7 @@ async def run_benchmark(
             on_progress(event)
 
     tasks = [
-        _run_model_benchmark(defn, model, cases, judge_model, _emit, agents_dir=agents_dir)
+        _run_model_benchmark(defn, model, cases, judge_model, _emit, agents_dir=agents_dir, verbose=verbose)
         for model in models
     ]
     all_results = await asyncio.gather(*tasks)
@@ -379,6 +378,122 @@ class SuppressBenchmarkOutput:
                 pass
 
 
+class VerboseBenchmarkOutput:
+    """Context manager that prints NewAgent events to stderr for debugging.
+
+    On entry it sets the trace listener's formatter to None (best effort),
+    sets the root and crewai loggers to WARNING, and subscribes colourised
+    one-line printers to the NewAgent LLM, tool, status and context events.
+    On exit the handlers are unsubscribed and the saved state is restored.
+    """
+
+    def __enter__(self) -> "VerboseBenchmarkOutput":
+        import logging
+        import sys
+        from crewai.events.event_bus import crewai_event_bus
+        from crewai.new_agent.events import (
+            NewAgentLLMCallStartedEvent,
+            NewAgentLLMCallCompletedEvent,
+            NewAgentLLMCallFailedEvent,
+            NewAgentToolUsageStartedEvent,
+            NewAgentToolUsageCompletedEvent,
+            NewAgentToolUsageFailedEvent,
+            NewAgentStatusUpdateEvent,
+            NewAgentContextSummarizedEvent,
+        )
+
+        # Initialise every attribute up front so __exit__ is always safe to call.
+        self._bus = crewai_event_bus
+        self._handlers = []
+        self._loggers = []
+        self._listener = None
+        self._saved_formatter = None
+
+        # Suppress Rich formatter panels — we print our own structured output.
+        # Deliberately best effort: verbose output must still work without tracing.
+        try:
+            from crewai.events.listeners.tracing.trace_listener import TraceCollectionListener
+            listener = TraceCollectionListener._instance
+            if listener is not None:
+                saved = listener.formatter
+                listener.formatter = None
+                self._listener, self._saved_formatter = listener, saved
+        except Exception:
+            pass
+
+        # Quiet loggers to WARNING — keep warnings visible, suppress debug/info spam
+        for name in (None, "crewai.new_agent.event_listener", "crewai.new_agent.executor", "crewai"):
+            lg = logging.getLogger(name)
+            self._loggers.append((lg, lg.level))
+            lg.setLevel(logging.WARNING)
+
+        stream = sys.stderr
+
+        def _say(color: str, text: str) -> None:
+            """Write one ANSI-coloured line to the captured stderr and flush it."""
+            stream.write(f"\033[{color}m{text}\033[0m\n")
+            stream.flush()
+
+        def _on_llm_start(_src, ev: NewAgentLLMCallStartedEvent):
+            _say("36", f"[llm] calling {ev.model}…")
+
+        def _on_llm_done(_src, ev: NewAgentLLMCallCompletedEvent):
+            _say("36", f"[llm] {ev.model}  {ev.input_tokens}→{ev.output_tokens} tokens  {ev.response_time_ms}ms")
+
+        def _on_llm_fail(_src, ev: NewAgentLLMCallFailedEvent):
+            # str() first: slicing would raise if the error is not a string.
+            _say("31", f"[llm] FAILED: {str(ev.error)[:200]}")
+
+        def _on_tool_start(_src, ev: NewAgentToolUsageStartedEvent):
+            _say("33", f"[tool] using {ev.tool_name}…")
+
+        def _on_tool_done(_src, ev: NewAgentToolUsageCompletedEvent):
+            _say("33", f"[tool] {ev.tool_name} done")
+
+        def _on_tool_fail(_src, ev: NewAgentToolUsageFailedEvent):
+            _say("31", f"[tool] {ev.tool_name} FAILED: {str(ev.error)[:200]}")
+
+        def _on_status(_src, ev: NewAgentStatusUpdateEvent):
+            if ev.detail:
+                _say("2", f"[status] {ev.state}: {ev.detail}")
+
+        def _on_summarized(_src, ev: NewAgentContextSummarizedEvent):
+            _say("35", "[context] summarized — context was too large")
+
+        pairs = [
+            (NewAgentLLMCallStartedEvent, _on_llm_start),
+            (NewAgentLLMCallCompletedEvent, _on_llm_done),
+            (NewAgentLLMCallFailedEvent, _on_llm_fail),
+            (NewAgentToolUsageStartedEvent, _on_tool_start),
+            (NewAgentToolUsageCompletedEvent, _on_tool_done),
+            (NewAgentToolUsageFailedEvent, _on_tool_fail),
+            (NewAgentStatusUpdateEvent, _on_status),
+            (NewAgentContextSummarizedEvent, _on_summarized),
+        ]
+        try:
+            for event_type, handler in pairs:
+                self._bus.on(event_type)(handler)
+                self._handlers.append((event_type, handler))
+        except Exception:
+            # __exit__ never runs when __enter__ raises, so roll back here.
+            self.__exit__(None, None, None)
+            raise
+        return self
+
+    def __exit__(self, *exc) -> None:
+        """Unsubscribe the printers and restore logger levels and the formatter."""
+        for event_type, handler in self._handlers:
+            try:
+                self._bus.off(event_type, handler)
+            except Exception:
+                pass  # best effort: keep restoring the remaining state
+        for lg, level in self._loggers:
+            lg.setLevel(level)
+        # Restore onto the exact listener object that __enter__ modified.
+        if self._listener is not None:
+            self._listener.formatter = self._saved_formatter
+
+
 class ArtifactsSandbox:
     """Context manager that chdirs into tests/artifacts/ for the benchmark run.
 
diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index e65a98143..2b86064e8 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -511,12 +511,18 @@ def memory(
     help="LLM model for evaluation judging (NewAgent only). "
     "Defaults to test.judge_model in config.json (openai/gpt-4o-mini if not set).",
 )
+@click.option(
+    "-v", "--verbose",
+    is_flag=True,
+    help="Show agent execution details (tool calls, LLM responses, errors).",
+)
 def test(
     n_iterations: int,
     model: str | None,
     trained_agents_file: str | None,
     threshold: float | None,
     judge_model: str | None,
+    verbose: bool,
 ) -> None:
     """Test the crew or agents and evaluate the results.
 
@@ -541,6 +547,8 @@ def test(
                 uv_args.extend(["-m", model])
             if trained_agents_file:
                 uv_args.extend(["-f", trained_agents_file])
+            if verbose:
+                uv_args.append("-v")
             _relaunch_via_uv(uv_args)
 
         config_threshold = _read_config("test", "threshold")
@@ -548,7 +556,7 @@ def test(
             config_threshold = _read_config("test_threshold")
         effective_threshold = threshold if threshold is not None else (float(config_threshold) if config_threshold is not None else 0.7)
 
-        _test_new_agents(agent_files, n_iterations, model, effective_threshold, effective_judge)
+        _test_new_agents(agent_files, n_iterations, model, effective_threshold, effective_judge, verbose=verbose)
     else:
         crew_model = model or "gpt-4o-mini"
         click.echo(f"Testing the crew for {n_iterations} iterations with model {crew_model}")
@@ -706,6 +714,7 @@ def _test_new_agents(
     model: str | None,
     threshold: float,
     judge_model: str,
+    verbose: bool = False,
 ) -> None:
     """Run NewAgent test cases with pass/fail threshold (all agents in parallel)."""
     import asyncio
@@ -754,7 +763,7 @@ def _test_new_agents(
     model_list = [model] if model else None
 
     # Progress display — prefix model key with agent name
-    progress = _BenchmarkLiveProgress(console=_con)
+    progress = None if verbose else _BenchmarkLiveProgress(console=_con)
 
     def _make_progress_cb(agent_name: str):
         def _cb(event: dict) -> None:
@@ -773,7 +782,8 @@ def _test_new_agents(
                     cases=job["cases"],
                     models=model_list,
                     judge_model=judge_model,
-                    on_progress=_make_progress_cb(job["agent_name"]),
+                    on_progress=None if verbose else _make_progress_cb(job["agent_name"]),
+                    verbose=verbose,
                 )
             )
         return await asyncio.gather(*tasks, return_exceptions=True)
@@ -785,14 +795,21 @@ def _test_new_agents(
         fg="cyan", bold=True,
     )
 
-    from crewai_cli.benchmark import ArtifactsSandbox, SuppressBenchmarkOutput
+    from crewai_cli.benchmark import ArtifactsSandbox, SuppressBenchmarkOutput, VerboseBenchmarkOutput
 
-    progress.start()
+    if not verbose:
+        progress.start()
     try:
-        with ArtifactsSandbox(), SuppressBenchmarkOutput():
-            all_results = asyncio.run(_run_all())
+        with ArtifactsSandbox():
+            if verbose:
+                with VerboseBenchmarkOutput():
+                    all_results = asyncio.run(_run_all())
+            else:
+                with SuppressBenchmarkOutput():
+                    all_results = asyncio.run(_run_all())
     finally:
-        progress.stop()
+        if not verbose:
+            progress.stop()
 
     # Evaluate results
     all_passed = True
@@ -1565,11 +1582,17 @@ def checkpoint_prune(
     help="Model for LLM judge evaluation. "
     "Defaults to test.judge_model in config.json (openai/gpt-4o-mini if not set).",
 )
+@click.option(
+    "-v", "--verbose",
+    is_flag=True,
+    help="Show agent execution details (tool calls, LLM responses, errors).",
+)
 def benchmark(
     agent_path: str,
     cases_path: str,
     models: tuple[str, ...],
     judge_model: str | None,
+    verbose: bool,
 ) -> None:
     """Run agent against test cases and report results."""
     import asyncio
@@ -1582,6 +1605,8 @@ def benchmark(
         uv_args = ["benchmark", agent_path, cases_path, "--judge-model", judge_model]
         for m in models:
             uv_args.extend(["-m", m])
+        if verbose:
+            uv_args.append("-v")
         _relaunch_via_uv(uv_args)
 
     from rich.console import Console as _RichConsole
@@ -1613,26 +1638,42 @@ def benchmark(
     click.echo(f"Judge model: {judge_model}")
     click.echo()
 
-    from crewai_cli.benchmark import ArtifactsSandbox, SuppressBenchmarkOutput
+    from crewai_cli.benchmark import ArtifactsSandbox, SuppressBenchmarkOutput, VerboseBenchmarkOutput
 
-    progress = _BenchmarkLiveProgress(console=_con)
-    progress.start()
+    progress = None if verbose else _BenchmarkLiveProgress(console=_con)
+    if progress:
+        progress.start()
     try:
-        with ArtifactsSandbox(), SuppressBenchmarkOutput():
-            results_by_model = asyncio.run(
-                run_benchmark(
-                    agent_def=agent_path,
-                    cases=cases,
-                    models=model_list,
-                    judge_model=judge_model,
-                    on_progress=progress.on_progress,
-                )
-            )
+        with ArtifactsSandbox():
+            if verbose:
+                with VerboseBenchmarkOutput():
+                    results_by_model = asyncio.run(
+                        run_benchmark(
+                            agent_def=agent_path,
+                            cases=cases,
+                            models=model_list,
+                            judge_model=judge_model,
+                            verbose=verbose,
+                        )
+                    )
+            else:
+                with SuppressBenchmarkOutput():
+                    results_by_model = asyncio.run(
+                        run_benchmark(
+                            agent_def=agent_path,
+                            cases=cases,
+                            models=model_list,
+                            judge_model=judge_model,
+                            on_progress=progress.on_progress if progress else None,
+                            verbose=verbose,
+                        )
+                    )
     except Exception as e:
         click.secho(f"Error running benchmark: {e}", fg="red")
         raise SystemExit(1) from e
     finally:
-        progress.stop()
+        if progress:
+            progress.stop()
 
     if len(results_by_model) > 1:
         _con.print()