diff --git a/lib/cli/src/crewai_cli/agent_tui.py b/lib/cli/src/crewai_cli/agent_tui.py
index 87db1fbcd..905680d54 100644
--- a/lib/cli/src/crewai_cli/agent_tui.py
+++ b/lib/cli/src/crewai_cli/agent_tui.py
@@ -21,12 +21,17 @@ from typing import Any
 from textual.app import App, ComposeResult
 from textual.binding import Binding
 from textual.containers import Horizontal, Vertical, VerticalScroll
+from textual.screen import ModalScreen
 from textual.widgets import (
     Button,
+    Checkbox,
     Footer,
     Header,
     Input,
+    Label,
     OptionList,
+    RadioButton,
+    RadioSet,
     Static,
     TabbedContent,
     TabPane,
@@ -66,7 +71,7 @@ _BG_PANEL = "#222222"
 _BG_MSG_USER = "#2a2a2a"
 _BG_MSG_AGENT = "#252525"
 _DIM = "#777777"
-_COMMON_ROOM = "__common__"
+_ROOM_PREFIX = "__room__"
 _SPINNER = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"
 
 
@@ -202,6 +207,79 @@ class ThinkingIndicator(Static):
         self.update("\n".join(lines))
 
 
+class CreateRoomScreen(ModalScreen[dict[str, Any] | None]):
+    """Modal form for creating a new room."""
+
+    CSS = f"""
+    CreateRoomScreen {{
+        align: center middle;
+    }}
+    #room-form {{
+        width: 56;
+        max-height: 80%;
+        background: {_BG_PANEL};
+        border: tall {_TEAL};
+        padding: 1 2;
+    }}
+    #room-form Label {{
+        margin: 1 0 0 0;
+        color: {_DIM};
+    }}
+    #room-form Input {{
+        margin: 0 0 1 0;
+    }}
+    #room-form .form-section {{
+        height: auto;
+        margin: 0 0 1 0;
+    }}
+    #room-form RadioSet {{
+        margin: 0;
+    }}
+    #room-form Button {{
+        margin: 1 1 0 0;
+    }}
+    """
+
+    def __init__(self, agent_names: list[str], **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self._agent_names = agent_names
+
+    def compose(self) -> ComposeResult:
+        with Vertical(id="room-form"):
+            yield Label(f"[bold {_CORAL}]Create Room[/]")
+            yield Label("Name")
+            yield Input(placeholder="e.g. engineering", id="room-name-input")
+            yield Label("Agents")
+            with Vertical(classes="form-section"):
+                for name in self._agent_names:
+                    yield Checkbox(name, value=True, id=f"cb-{name}")
+            yield Label("Engagement")
+            with RadioSet(id="engagement-radio"):
+                yield RadioButton("Organic — agents auto-respond", value=True, id="radio-organic")
+                yield RadioButton("Tagged — @mention required", id="radio-tagged")
+            with Horizontal():
+                yield Button("Create", variant="primary", id="btn-create-room")
+                yield Button("Cancel", id="btn-cancel-room")
+
+    def on_button_pressed(self, event: Button.Pressed) -> None:
+        if event.button.id == "btn-cancel-room":
+            self.dismiss(None)
+            return
+        if event.button.id == "btn-create-room":
+            name_input = self.query_one("#room-name-input", Input)
+            name = name_input.value.strip().lower().replace(" ", "-")
+            if not name:
+                name_input.focus()
+                return
+            agents = [
+                n for n in self._agent_names
+                if self.query_one(f"#cb-{n}", Checkbox).value
+            ]
+            radio = self.query_one("#engagement-radio", RadioSet)
+            engagement = "organic" if radio.pressed_index == 0 else "tagged"
+            self.dismiss({"name": name, "agents": agents, "engagement": engagement})
+
+
 # ── Main TUI ──────────────────────────────────────────────────
 
 
@@ -273,6 +351,38 @@ class AgentTUI(App[None]):
         color: {_TEAL};
         background: {_BG};
     }}
+    .sidebar-label {{
+        padding: 1 1 0 1;
+        color: {_DIM};
+        text-style: bold;
+        height: auto;
+    }}
+    #room-list {{
+        height: auto;
+        max-height: 40%;
+        padding: 0 1;
+        background: {_BG_PANEL};
+    }}
+    #room-list > .option-list--option-highlighted {{
+        background: {_TEAL};
+        color: white;
+    }}
+    #room-list > .option-list--option {{
+        padding: 0 1;
+    }}
+    #btn-new-room {{
+        margin: 0 1 1 1;
+        width: 100%;
+        background: {_BG};
+        color: {_TEAL};
+        border: tall #333333;
+        min-height: 1;
+        height: 3;
+    }}
+    #btn-new-room:hover {{
+        background: {_TEAL};
+        color: white;
+    }}
     #agent-list {{
         height: 1fr;
         padding: 0 1;
@@ -374,16 +484,18 @@ class AgentTUI(App[None]):
         super().__init__(**kwargs)
         self._agents_dir = agents_dir
         self._config = config or {}
+        self._config_path = Path.cwd() / "config.json"
         self._agent_defs: list[dict[str, Any]] = []
         self._agent_names: list[str] = []
         self._agent_instances: dict[str, Any] = {}
-        self._current_room: str = _COMMON_ROOM
+        # Rooms: {room_key: {"name": display_name, "agents": [...], "engagement": "organic"|"tagged"}}
+        self._rooms: dict[str, dict[str, Any]] = {}
+        self._current_room: str = ""
         # (sender, content, metadata) tuples keyed by room
         self._chat_histories: dict[str, list[tuple[str, str, str]]] = {}
         self._processing = False
         self._last_active_agent: str | None = None
         self._last_agent_error: str = ""
-        self._engagement_mode: str = "organic"
         self._scheduler: Any = None
 
     def compose(self) -> ComposeResult:
@@ -391,7 +503,11 @@ class AgentTUI(App[None]):
         with Horizontal(id="main-layout"):
             with Vertical(id="sidebar"):
                 with TabbedContent(id="sidebar-tabs"):
-                    with TabPane("Agents", id="tab-agents"):
+                    with TabPane("Chat", id="tab-agents"):
+                        yield Static("ROOMS", classes="sidebar-label")
+                        yield OptionList(id="room-list")
+                        yield Button("+ New Room", id="btn-new-room", variant="default")
+                        yield Static("AGENTS", classes="sidebar-label")
                         yield OptionList(id="agent-list")
                     with TabPane("Memory", id="tab-memory"):
                         yield Static("Click below to open the memory browser.", id="memory-scope-label")
@@ -407,15 +523,49 @@ class AgentTUI(App[None]):
                     )
         yield Footer()
 
+    def _room_key(self, name: str) -> str:
+        return f"{_ROOM_PREFIX}{name}"
+
+    def _is_room(self, key: str) -> bool:
+        return key.startswith(_ROOM_PREFIX)
+
+    def _room_engagement(self, room_key: str) -> str:
+        if room_key in self._rooms:
+            return self._rooms[room_key].get("engagement", "organic")
+        return "organic"
+
+    def _room_agents(self, room_key: str) -> list[str]:
+        if room_key in self._rooms:
+            return self._rooms[room_key].get("agents", self._agent_names[:])
+        return self._agent_names[:]
+
     def on_mount(self) -> None:
         self._agent_defs = _load_agents(self._agents_dir)
         self._agent_names = [
             d.get("name", d.get("role", "unnamed")) for d in self._agent_defs
         ]
 
-        rooms = self._config.get("rooms", {})
-        common_room = rooms.get("common", {})
-        self._engagement_mode = common_room.get("engagement", "organic")
+        # Load rooms from config
+        rooms_cfg = self._config.get("rooms", {})
+        for room_name, room_data in rooms_cfg.items():
+            key = self._room_key(room_name)
+            cfg_agents = room_data.get("agents", [])
+            self._rooms[key] = {
+                "name": room_name,
+                "agents": cfg_agents if cfg_agents else self._agent_names[:],
+                "engagement": room_data.get("engagement", "organic"),
+            }
+
+        # Ensure at least "common" room exists
+        common_key = self._room_key("common")
+        if common_key not in self._rooms:
+            self._rooms[common_key] = {
+                "name": "common",
+                "agents": self._agent_names[:],
+                "engagement": "organic",
+            }
+
+        self._current_room = common_key
 
         # Subscribe to status update events from the executor
         self._status_listener = None
@@ -439,22 +589,26 @@ class AgentTUI(App[None]):
             except Exception:
                 pass
 
-        chat_input = self.query_one("#chat-input", Input)
-        if self._engagement_mode == "organic":
-            chat_input.placeholder = "Type a message — agents will respond automatically"
-
         if AgentSuggester is not None and self._agent_names:
             self.query_one("#chat-input", Input).suggester = AgentSuggester(
                 self._agent_names
             )
 
-        agent_list = self.query_one("#agent-list", OptionList)
-
         if not self._agent_defs:
             self._mount_sys("No agents found. Run: crewai create agent <name>")
             return
 
-        agent_list.add_option("◆ Common Room")
+        # Populate rooms list
+        room_list = self.query_one("#room-list", OptionList)
+        for key in self._rooms:
+            display = self._rooms[key]["name"].replace("-", " ").title()
+            engagement = self._rooms[key]["engagement"]
+            n_agents = len(self._rooms[key]["agents"])
+            room_list.add_option(f"◆ {display}  [{_DIM}]{engagement} · {n_agents}[/]")
+        room_list.highlighted = 0
+
+        # Populate agents list (DM entries)
+        agent_list = self.query_one("#agent-list", OptionList)
         for defn in self._agent_defs:
             name = defn.get("name", "unnamed")
             role = defn.get("role", "")
@@ -464,8 +618,8 @@ class AgentTUI(App[None]):
                 label += f" · {trunc}"
             agent_list.add_option(label)
 
-        agent_list.highlighted = 0
-
+        self._update_subtitle()
+        self._update_placeholder()
         self._load_history_from_disk()
         self._render_chat()
         self.query_one("#chat-input", Input).focus()
@@ -478,6 +632,25 @@ class AgentTUI(App[None]):
         except Exception:
             pass
 
+    def _update_subtitle(self) -> None:
+        if self._is_room(self._current_room):
+            info = self._rooms.get(self._current_room, {})
+            display = info.get("name", "room").replace("-", " ").title()
+            self.sub_title = display
+        else:
+            self.sub_title = f"Chat with {self._current_room}"
+
+    def _update_placeholder(self) -> None:
+        chat_input = self.query_one("#chat-input", Input)
+        if self._is_room(self._current_room):
+            engagement = self._room_engagement(self._current_room)
+            if engagement == "organic":
+                chat_input.placeholder = "Type a message — agents will respond automatically"
+            else:
+                chat_input.placeholder = "Use @agent_name to direct your message"
+        else:
+            chat_input.placeholder = f"Message {self._current_room}"
+
     def _on_scheduled_task_due(self, task: Any) -> str:
         """Callback fired by the scheduler when a task comes due."""
         agent_name = getattr(task, "agent_name", "")
@@ -511,17 +684,31 @@ class AgentTUI(App[None]):
     def on_option_list_option_highlighted(
         self, event: OptionList.OptionHighlighted
     ) -> None:
-        if event.option_list.id != "agent-list":
-            return
-        idx = event.option_index
-        if idx == 0:
-            self._current_room = _COMMON_ROOM
-            self.sub_title = "Common Room"
-        elif 1 <= idx <= len(self._agent_names):
-            name = self._agent_names[idx - 1]
-            self._current_room = name
-            self.sub_title = f"Chat with {name}"
-        self._render_chat()
+        if event.option_list.id == "room-list":
+            room_keys = list(self._rooms.keys())
+            idx = event.option_index
+            if 0 <= idx < len(room_keys):
+                self._current_room = room_keys[idx]
+                # Deselect agent list
+                try:
+                    self.query_one("#agent-list", OptionList).highlighted = None
+                except Exception:
+                    pass
+                self._update_subtitle()
+                self._update_placeholder()
+                self._render_chat()
+        elif event.option_list.id == "agent-list":
+            idx = event.option_index
+            if 0 <= idx < len(self._agent_names):
+                self._current_room = self._agent_names[idx]
+                # Deselect room list
+                try:
+                    self.query_one("#room-list", OptionList).highlighted = None
+                except Exception:
+                    pass
+                self._update_subtitle()
+                self._update_placeholder()
+                self._render_chat()
 
     # ── Message routing ──
 
@@ -548,15 +735,21 @@ class AgentTUI(App[None]):
         self._append_msg(room, "You", text)
         self._mount_bubble("You", text)
 
-        if not targets and self._current_room == _COMMON_ROOM:
-            # Route to agent with pending suggestion before organic scoring
+        if not targets and self._is_room(self._current_room):
+            room_agent_names = self._room_agents(self._current_room)
+            room_agent_defs = [
+                d for d in self._agent_defs
+                if d.get("name", d.get("role", "unnamed")) in room_agent_names
+            ]
+            engagement = self._room_engagement(self._current_room)
+
             pending_agent = self._find_agent_with_pending_suggestion()
-            if pending_agent:
+            if pending_agent and pending_agent in room_agent_names:
                 targets = [pending_agent]
-            elif self._engagement_mode == "organic":
-                scored = await self._score_relevance_llm(clean_text, self._agent_defs)
+            elif engagement == "organic":
+                scored = await self._score_relevance_llm(clean_text, room_agent_defs)
                 if scored is None:
-                    scored = self._score_relevance(clean_text, self._agent_defs)
+                    scored = self._score_relevance(clean_text, room_agent_defs)
                 if scored:
                     top_score = scored[0][1]
                     best = [scored[0][0]]
@@ -569,13 +762,14 @@ class AgentTUI(App[None]):
                         d.get("name", d.get("role", "unnamed")) for d in best
                     ]
                 else:
-                    targets = [self._last_active_agent or self._agent_names[0]]
-            elif len(self._agent_names) == 1:
-                targets = [self._agent_names[0]]
+                    targets = [self._last_active_agent or room_agent_names[0]]
+            elif len(room_agent_names) == 1:
+                targets = [room_agent_names[0]]
             else:
+                first = room_agent_names[0] if room_agent_names else "agent"
                 self._mount_sys(
                     "Tip: use @agent_name to direct your message, "
-                    f"e.g. @{self._agent_names[0]}"
+                    f"e.g. @{first}"
                 )
                 return
 
@@ -748,7 +942,7 @@ class AgentTUI(App[None]):
     def _handle_skills_command(self) -> None:
         """List active skills for the current agent."""
         agent = None
-        if self._current_room != _COMMON_ROOM:
+        if not self._is_room(self._current_room):
             agent = self._get_or_create_agent(self._current_room)
         elif self._last_active_agent:
             agent = self._get_or_create_agent(self._last_active_agent)
@@ -833,7 +1027,7 @@ class AgentTUI(App[None]):
 
     def _get_focused_agent(self) -> Any:
         """Return the currently focused agent instance, or None."""
-        if self._current_room != _COMMON_ROOM:
+        if not self._is_room(self._current_room):
             return self._get_or_create_agent(self._current_room)
         if self._last_active_agent:
             return self._get_or_create_agent(self._last_active_agent)
@@ -966,9 +1160,9 @@ class AgentTUI(App[None]):
         """Parse all @mentions in the message.
 
         Returns ``([agent_names], clean_text)``.
-        In the Common Room, at least one @mention is required — untagged
-        messages return ``([], text)`` so the caller can prompt.
-        In a DM room, messages always route to that room's agent.
+        In a room, at least one @mention is required for tagged mode —
+        untagged messages return ``([], text)`` so the caller can handle routing.
+        In a DM (agent name), messages always route to that agent.
         """
         found: list[str] = []
         clean = text
@@ -979,7 +1173,7 @@ class AgentTUI(App[None]):
                 clean = pattern.sub("", clean).strip()
         if found:
             return found, clean
-        if self._current_room != _COMMON_ROOM:
+        if not self._is_room(self._current_room):
             return [self._current_room], text
         return [], text
 
@@ -992,7 +1186,7 @@ class AgentTUI(App[None]):
         """Process a message directed at multiple agents in parallel."""
         # Build room context once (shared snapshot before any replies)
         room_context: str | None = None
-        if room == _COMMON_ROOM:
+        if self._is_room(room):
             ctx = self._build_room_context(room)
             if ctx:
                 room_context = (
@@ -1109,10 +1303,8 @@ class AgentTUI(App[None]):
                     self._mount_sys(msg)
                 return
 
-            # In the Common Room, prepend conversation context so the
-            # tagged agent can see what was discussed before.
             message_text = text
-            if room == _COMMON_ROOM:
+            if self._is_room(room):
                 ctx = self._build_room_context(room)
                 if ctx:
                     message_text = (
@@ -1270,17 +1462,21 @@ class AgentTUI(App[None]):
         history = self._chat_histories.get(self._current_room, [])
 
         if not history:
-            if self._current_room == _COMMON_ROOM:
-                names = ", ".join(self._agent_names[:5])
-                if self._engagement_mode == "organic":
+            if self._is_room(self._current_room):
+                room_info = self._rooms.get(self._current_room, {})
+                display = room_info.get("name", "room").replace("-", " ").title()
+                room_agents = self._room_agents(self._current_room)
+                names = ", ".join(room_agents[:5])
+                engagement = self._room_engagement(self._current_room)
+                if engagement == "organic":
                     self._mount_sys(
-                        f"Welcome to the Common Room. "
+                        f"Welcome to {display}. "
                         f"Just type — relevant agents will respond. "
                         f"Use @agent_name to direct a message. Available: {names}"
                     )
                 else:
                     self._mount_sys(
-                        f"Welcome to the Common Room. "
+                        f"Welcome to {display}. "
                         f"Use @agent_name to chat. Available: {names}"
                     )
             else:
@@ -1322,6 +1518,9 @@ class AgentTUI(App[None]):
             return
         for path in hdir.glob("*.json"):
             room = path.stem
+            # Migrate old __common__ history to new __room__common key
+            if room == "__common__":
+                room = self._room_key("common")
             try:
                 data = json.loads(path.read_text(encoding="utf-8"))
                 self._chat_histories[room] = [
@@ -1354,7 +1553,7 @@ class AgentTUI(App[None]):
 
     def _get_focused_agent_name(self) -> str | None:
         """Return the agent name for the current room (DM only)."""
-        if self._current_room == _COMMON_ROOM:
+        if self._is_room(self._current_room):
             return self._last_active_agent
         if self._current_room in self._agent_names:
             return self._current_room
@@ -1363,12 +1562,18 @@ class AgentTUI(App[None]):
     def on_tabbed_content_tab_activated(self, event: TabbedContent.TabActivated) -> None:
         pass
 
-    # ── Sidebar: Provenance button ──
+    # ── Sidebar buttons ──
 
     def on_button_pressed(self, event: Button.Pressed) -> None:
         if event.button.id == "btn-memory":
             self._launch_memory_browser()
             return
+        if event.button.id == "btn-new-room":
+            self.push_screen(
+                CreateRoomScreen(self._agent_names),
+                callback=self._on_room_created,
+            )
+            return
         if event.button.id != "btn-provenance":
             return
         agent_name = self._get_focused_agent_name()
@@ -1407,6 +1612,58 @@ class AgentTUI(App[None]):
             lines.append(line)
         self._mount_sys("\n".join(lines))
 
+    # ── Room creation ──
+
+    def _on_room_created(self, result: dict[str, Any] | None) -> None:
+        if result is None:
+            return
+        name = result["name"]
+        key = self._room_key(name)
+        if key in self._rooms:
+            self._mount_sys(f"Room '{name}' already exists.")
+            return
+
+        self._rooms[key] = {
+            "name": name,
+            "agents": result["agents"],
+            "engagement": result["engagement"],
+        }
+
+        # Add to sidebar
+        room_list = self.query_one("#room-list", OptionList)
+        display = name.replace("-", " ").title()
+        engagement = result["engagement"]
+        n_agents = len(result["agents"])
+        room_list.add_option(f"◆ {display}  [{_DIM}]{engagement} · {n_agents}[/]")
+
+        # Save to config.json
+        self._save_room_to_config(name, result["agents"], result["engagement"])
+
+        # Switch to the new room
+        room_keys = list(self._rooms.keys())
+        idx = room_keys.index(key)
+        room_list.highlighted = idx
+        self._current_room = key
+        self._update_subtitle()
+        self._update_placeholder()
+        self._render_chat()
+        self._mount_sys(f"Room '{display}' created with {n_agents} agent(s).")
+
+    def _save_room_to_config(self, name: str, agents: list[str], engagement: str) -> None:
+        try:
+            if self._config_path.exists():
+                raw = self._config_path.read_text(encoding="utf-8")
+                config = json.loads(_strip_jsonc(raw))
+            else:
+                config = {}
+            rooms = config.setdefault("rooms", {})
+            rooms[name] = {"agents": agents, "engagement": engagement}
+            self._config_path.write_text(
+                json.dumps(config, indent=2) + "\n", encoding="utf-8"
+            )
+        except Exception:
+            pass
+
     # ── Actions ──
 
     def action_quit(self) -> None:
diff --git a/lib/cli/src/crewai_cli/benchmark.py b/lib/cli/src/crewai_cli/benchmark.py
index b8f6f9a7c..d88b34d0f 100644
--- a/lib/cli/src/crewai_cli/benchmark.py
+++ b/lib/cli/src/crewai_cli/benchmark.py
@@ -174,10 +174,133 @@ def _parse_definition(source: Any) -> dict[str, Any]:
     return parse_agent_definition(source)
 
 
-def _load_agent(source: Any) -> Any:
+def _load_agent(source: Any, agents_dir: Path | None = None) -> Any:
     """Load a NewAgent from a definition — delegates to crewai's loader."""
     from crewai.new_agent.definition_parser import load_agent_from_definition
-    return load_agent_from_definition(source)
+    return load_agent_from_definition(source, agents_dir=agents_dir)
+
+
+_MAX_CASES_PARALLEL = 4
+_CASE_TIMEOUT_SECONDS = 90
+
+
+async def _run_model_benchmark(
+    defn: dict[str, Any],
+    model: str,
+    cases: list[BenchmarkCase] | LoadedCases,
+    judge_model: str,
+    emit: Callable[[dict[str, Any]], None],
+    agents_dir: Path | None = None,
+) -> list[BenchmarkResult]:
+    """Run all benchmark cases for a single model, parallelising up to _MAX_CASES_PARALLEL."""
+    total = len(cases)
+    emit({"type": "model_start", "model": model, "total_cases": total})
+
+    sem = asyncio.Semaphore(_MAX_CASES_PARALLEL)
+    done_count = 0
+
+    async def _run_case(i: int, case: BenchmarkCase) -> BenchmarkResult:
+        nonlocal done_count
+        async with sem:
+            emit({"type": "case_start", "model": model, "case_index": i, "total_cases": total, "input": case.input})
+
+            bench_defn = dict(defn)
+            bench_defn["settings"] = dict(defn.get("settings", {}))
+            if model != "default":
+                bench_defn["llm"] = model
+            bench_defn["settings"]["memory"] = False
+            bench_defn["settings"]["self_improving"] = False
+            bench_defn["settings"]["planning"] = False
+            bench_defn["verbose"] = False
+            bench_defn["max_iter"] = min(bench_defn.get("max_iter", 25), 5)
+            bench_defn["max_execution_time"] = min(bench_defn.get("max_execution_time", 60), 60)
+            bench_defn.pop("coworkers", None)
+            bench_defn.pop("tools", None)
+
+            try:
+                agent = _load_agent(bench_defn, agents_dir=agents_dir)
+            except Exception as e:
+                done_count += 1
+                emit({"type": "case_done", "model": model, "case_index": done_count, "total_cases": total, "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
+                return BenchmarkResult(
+                    case_index=i, input=case.input, expected=case.expected,
+                    actual=f"[Agent creation error: {e}]", model=model, passed=False, score=0.0,
+                )
+
+            start_ms = _current_time_ms()
+            try:
+                response = await asyncio.wait_for(
+                    agent.amessage(case.input),
+                    timeout=_CASE_TIMEOUT_SECONDS,
+                )
+                elapsed_ms = _current_time_ms() - start_ms
+                actual = response.content
+                input_tokens = response.input_tokens or 0
+                output_tokens = response.output_tokens or 0
+                cost = response.cost
+            except asyncio.TimeoutError:
+                elapsed_ms = _current_time_ms() - start_ms
+                done_count += 1
+                emit({"type": "case_done", "model": model, "case_index": done_count, "total_cases": total, "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": "timeout"})
+                return BenchmarkResult(
+                    case_index=i, input=case.input, expected=case.expected,
+                    actual=f"[Timeout after {_CASE_TIMEOUT_SECONDS}s]", model=model, passed=False, score=0.0,
+                    response_time_ms=elapsed_ms,
+                )
+            except Exception as e:
+                elapsed_ms = _current_time_ms() - start_ms
+                done_count += 1
+                emit({"type": "case_done", "model": model, "case_index": done_count, "total_cases": total, "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
+                return BenchmarkResult(
+                    case_index=i, input=case.input, expected=case.expected,
+                    actual=f"[Error: {e}]", model=model, passed=False, score=0.0,
+                    response_time_ms=elapsed_ms,
+                )
+
+            passed, score = False, 0.0
+            if case.expected is not None:
+                passed, score = _check_expected(case.expected, actual)
+            if case.criteria is not None:
+                emit({"type": "judging", "model": model, "case_index": done_count + 1, "total_cases": total})
+                try:
+                    criteria_passed, criteria_score = await asyncio.wait_for(
+                        _judge_with_llm(case.criteria, case.input, actual, judge_model),
+                        timeout=30,
+                    )
+                except asyncio.TimeoutError:
+                    criteria_passed, criteria_score = False, 0.0
+                if case.expected is not None:
+                    passed = passed and criteria_passed
+                    score = (score + criteria_score) / 2.0
+                else:
+                    passed, score = criteria_passed, criteria_score
+
+            done_count += 1
+            emit({"type": "case_done", "model": model, "case_index": done_count, "total_cases": total, "passed": passed, "score": score, "time_ms": elapsed_ms})
+            return BenchmarkResult(
+                case_index=i, input=case.input, expected=case.expected, actual=actual,
+                model=model, passed=passed, score=score,
+                input_tokens=input_tokens, output_tokens=output_tokens,
+                response_time_ms=elapsed_ms, cost=cost,
+            )
+
+    model_results = await asyncio.gather(*[_run_case(i, case) for i, case in enumerate(cases)])
+
+    total_passed = sum(1 for r in model_results if r.passed)
+    avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
+    total_time = max(r.response_time_ms for r in model_results) / 1000 if model_results else 0.0
+    total_in = sum(r.input_tokens for r in model_results)
+    total_out = sum(r.output_tokens for r in model_results)
+    total_cost = sum(r.cost for r in model_results if r.cost is not None)
+    emit({
+        "type": "model_done", "model": model,
+        "passed": total_passed, "total": len(model_results),
+        "avg_score": avg_score, "total_time": total_time,
+        "input_tokens": total_in, "output_tokens": total_out,
+        "total_cost": total_cost if total_cost > 0 else None,
+    })
+
+    return model_results
 
 
 async def run_benchmark(
@@ -187,7 +310,7 @@ async def run_benchmark(
     judge_model: str = "openai/gpt-4o-mini",
     on_progress: Callable[[dict[str, Any]], None] | None = None,
 ) -> dict[str, list[BenchmarkResult]]:
-    """Run benchmark cases against an agent definition, optionally across multiple models.
+    """Run benchmark cases against an agent definition across models in parallel.
 
     Args:
         agent_def: Agent definition dict, JSON string, or file path.
@@ -199,6 +322,12 @@ async def run_benchmark(
     Returns:
         Dict mapping model name to list of BenchmarkResult.
     """
+    agents_dir: Path | None = None
+    if isinstance(agent_def, (str, Path)):
+        p = Path(agent_def)
+        if p.is_file():
+            agents_dir = p.parent
+
     defn = _parse_definition(agent_def)
 
     if models is None or len(models) == 0:
@@ -208,102 +337,74 @@ async def run_benchmark(
         if on_progress:
             on_progress(event)
 
-    results_by_model: dict[str, list[BenchmarkResult]] = {}
+    tasks = [
+        _run_model_benchmark(defn, model, cases, judge_model, _emit, agents_dir=agents_dir)
+        for model in models
+    ]
+    all_results = await asyncio.gather(*tasks)
 
-    for mi, model in enumerate(models):
-        model_results: list[BenchmarkResult] = []
-        _emit({"type": "model_start", "model": model, "model_index": mi, "total_models": len(models), "total_cases": len(cases)})
+    return dict(zip(models, all_results))
 
-        for i, case in enumerate(cases):
-            _emit({"type": "case_start", "model": model, "case_index": i, "total_cases": len(cases), "input": case.input})
 
-            bench_defn = dict(defn)
-            if model != "default":
-                bench_defn["llm"] = model
-            bench_defn.setdefault("settings", {})
-            bench_defn["settings"]["memory_read_only"] = True
+class suppress_benchmark_output:
+    """Context manager that silences console formatter and noisy logging during benchmarks."""
 
+    def __enter__(self):
+        import logging
+        self._saved_formatter = None
+        try:
+            from crewai.events.listeners.tracing.trace_listener import TraceCollectionListener
+            listener = TraceCollectionListener._instance
+            if listener:
+                self._saved_formatter = listener.formatter
+                listener.formatter = None
+        except Exception:
+            pass
+        self._loggers = []
+        for name in (None, "crewai.new_agent.event_listener", "crewai.new_agent.executor", "crewai"):
+            lg = logging.getLogger(name)
+            self._loggers.append((lg, lg.level))
+            lg.setLevel(logging.CRITICAL)
+        return self
+
+    def __exit__(self, *exc):
+        for lg, level in self._loggers:
+            lg.setLevel(level)
+        if self._saved_formatter is not None:
             try:
-                agent = _load_agent(bench_defn)
-            except Exception as e:
-                result = BenchmarkResult(
-                    case_index=i,
-                    input=case.input,
-                    expected=case.expected,
-                    actual=f"[Agent creation error: {e}]",
-                    model=model,
-                    passed=False,
-                    score=0.0,
-                )
-                model_results.append(result)
-                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": 0, "error": str(e)})
-                continue
+                from crewai.events.listeners.tracing.trace_listener import TraceCollectionListener
+                listener = TraceCollectionListener._instance
+                if listener:
+                    listener.formatter = self._saved_formatter
+            except Exception:
+                pass
 
-            start_ms = _current_time_ms()
-            try:
-                response = await agent.amessage(case.input)
-                elapsed_ms = _current_time_ms() - start_ms
 
-                actual = response.content
-                input_tokens = response.input_tokens or 0
-                output_tokens = response.output_tokens or 0
-                cost = response.cost
+class artifacts_sandbox:
+    """Context manager that chdirs into tests/artifacts/ for the benchmark run.
 
-            except Exception as e:
-                elapsed_ms = _current_time_ms() - start_ms
-                result = BenchmarkResult(
-                    case_index=i,
-                    input=case.input,
-                    expected=case.expected,
-                    actual=f"[Error: {e}]",
-                    model=model,
-                    passed=False,
-                    score=0.0,
-                    response_time_ms=elapsed_ms,
-                )
-                model_results.append(result)
-                _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": False, "score": 0.0, "time_ms": elapsed_ms, "error": str(e)})
-                continue
+    Any files created by agent tools land in the artifacts directory instead of
+    polluting the project root.  A .gitignore is written if one doesn't exist.
+    """
 
-            passed = False
-            score = 0.0
+    def __init__(self, base: str | Path = "tests/artifacts"):
+        self._base = Path(base)
+        self._prev_cwd: str | None = None
 
-            if case.expected is not None:
-                passed, score = _check_expected(case.expected, actual)
-            if case.criteria is not None:
-                _emit({"type": "judging", "model": model, "case_index": i, "total_cases": len(cases)})
-                criteria_passed, criteria_score = await _judge_with_llm(
-                    case.criteria, case.input, actual, judge_model
-                )
-                if case.expected is not None:
-                    passed = passed and criteria_passed
-                    score = (score + criteria_score) / 2.0
-                else:
-                    passed = criteria_passed
-                    score = criteria_score
+    def __enter__(self):
+        import os
+        self._base.mkdir(parents=True, exist_ok=True)
+        gitignore = self._base / ".gitignore"
+        if not gitignore.exists():
+            gitignore.write_text("*\n")
+        self._prev_cwd = os.getcwd()
+        os.chdir(self._base)
+        return self
 
-            result = BenchmarkResult(
-                case_index=i,
-                input=case.input,
-                expected=case.expected,
-                actual=actual,
-                model=model,
-                passed=passed,
-                score=score,
-                input_tokens=input_tokens,
-                output_tokens=output_tokens,
-                response_time_ms=elapsed_ms,
-                cost=cost,
-            )
-            model_results.append(result)
-            _emit({"type": "case_done", "model": model, "case_index": i, "total_cases": len(cases), "passed": passed, "score": score, "time_ms": elapsed_ms})
-
-        results_by_model[model] = model_results
-        total_passed = sum(1 for r in model_results if r.passed)
-        avg_score = sum(r.score for r in model_results) / len(model_results) if model_results else 0.0
-        _emit({"type": "model_done", "model": model, "passed": total_passed, "total": len(model_results), "avg_score": avg_score})
-
-    return results_by_model
+    def __exit__(self, *exc):
+        import os
+        if self._prev_cwd:
+            os.chdir(self._prev_cwd)
 
 
 def _current_time_ms() -> int:
@@ -389,7 +490,7 @@ def format_comparison_table(results_by_model: dict[str, list[BenchmarkResult]])
         avg_score = sum(r.score for r in results) / n if n > 0 else 0.0
         total_in = sum(r.input_tokens for r in results)
         total_out = sum(r.output_tokens for r in results)
-        total_time = sum(r.response_time_ms for r in results)
+        total_time = max((r.response_time_ms for r in results), default=0)
 
         model_trunc = model[:28] if len(model) > 28 else model
         line = (
@@ -477,12 +578,8 @@ def print_results_chart(
     model = results[0].model
     has_cost = any(r.cost is not None for r in results)
 
-    inner_w = max(console.width - 4, 60)
-    bar_w = 12
-    overhead = 2 + 2 + 2 + 2 + bar_w + 1 + 4 + 2 + 4 + 2 + 6
-    if has_cost:
-        overhead += 2 + 7
-    input_w = max(15, inner_w - overhead)
+    bar_w = 10
+    input_w = 35
 
     rows: list[str] = []
     for r in results:
@@ -506,21 +603,21 @@ def print_results_chart(
 
     color = _score_color(avg)
     summary_parts = [
-        f"[{color}]{passed}/{n} passed[/]",
-        f"avg [{color}]{avg:.2f}[/]",
-        f"[dim]{total_time:.1f}s[/]",
-        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/]",
+        f"[{color}]{passed}/{n} passed[/{color}]",
+        f"avg [{color}]{avg:.2f}[/{color}]",
+        f"[dim]{total_time:.1f}s[/dim]",
+        f"[dim]↑{_fmt_tokens(total_in)} ↓{_fmt_tokens(total_out)}[/dim]",
     ]
     if total_cost > 0:
-        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/]")
+        summary_parts.append(f"[dim]{_fmt_cost(total_cost)}[/dim]")
 
     body = "\n".join(rows) + "\n\n  " + "  ·  ".join(summary_parts)
     panel = Panel(
         body,
-        title=f"[bold cyan]{model}[/]",
+        title=f"[bold cyan]{model}[/bold cyan]",
         title_align="left",
-        border_style="cyan",
-        padding=(1, 0),
+        border_style="dim",
+        padding=(0, 1),
         expand=False,
     )
     console.print(panel)
@@ -550,7 +647,7 @@ def print_comparison_chart(
         n = len(results)
         passed = sum(1 for r in results if r.passed)
         avg = sum(r.score for r in results) / n if n else 0.0
-        total_time = sum(r.response_time_ms for r in results) / 1000
+        total_time = max((r.response_time_ms for r in results), default=0) / 1000
         total_tokens = sum(r.input_tokens + r.output_tokens for r in results)
         models_data.append({
             "model": model, "passed": passed, "n": n,
diff --git a/lib/cli/src/crewai_cli/cli.py b/lib/cli/src/crewai_cli/cli.py
index 7213acdff..bfc735dfa 100644
--- a/lib/cli/src/crewai_cli/cli.py
+++ b/lib/cli/src/crewai_cli/cli.py
@@ -502,20 +502,21 @@ def memory(
     type=float,
     default=None,
     help="Minimum score to pass a test case (NewAgent only, 0.0-1.0). "
-    "Defaults to test_threshold in config.json (0.7 if not set).",
+    "Defaults to test.threshold in config.json (0.7 if not set).",
 )
 @click.option(
     "--judge-model",
     type=str,
-    default="openai/gpt-4o-mini",
-    help="LLM model for evaluation judging (NewAgent only).",
+    default=None,
+    help="LLM model for evaluation judging (NewAgent only). "
+    "Defaults to test.judge_model in config.json (openai/gpt-4o-mini if not set).",
 )
 def test(
     n_iterations: int,
     model: str | None,
     trained_agents_file: str | None,
     threshold: float | None,
-    judge_model: str,
+    judge_model: str | None,
 ) -> None:
     """Test the crew or agents and evaluate the results.
 
@@ -530,26 +531,33 @@ def test(
     agent_files = sorted(agents_dir.glob("*.json")) + sorted(agents_dir.glob("*.jsonc")) if agents_dir.is_dir() else []
 
     if agent_files:
+        effective_judge = judge_model or _read_config("test", "judge_model") or "openai/gpt-4o-mini"
+
         if _needs_uv_relaunch():
-            uv_args = ["test", "-n", str(n_iterations), "--threshold", str(threshold), "--judge-model", judge_model]
+            uv_args = ["test", "-n", str(n_iterations), "--judge-model", effective_judge]
+            if threshold is not None:
+                uv_args.extend(["--threshold", str(threshold)])
             if model:
                 uv_args.extend(["-m", model])
             if trained_agents_file:
                 uv_args.extend(["-f", trained_agents_file])
             _relaunch_via_uv(uv_args)
 
-        project_threshold = _read_config_threshold()
-        effective_threshold = threshold or project_threshold or 0.7
+        config_threshold = _read_config("test", "threshold") or _read_config("test_threshold")
+        effective_threshold = threshold or (float(config_threshold) if config_threshold is not None else None) or 0.7
 
-        _test_new_agents(agent_files, n_iterations, model, effective_threshold, judge_model)
+        _test_new_agents(agent_files, n_iterations, model, effective_threshold, effective_judge)
     else:
         crew_model = model or "gpt-4o-mini"
         click.echo(f"Testing the crew for {n_iterations} iterations with model {crew_model}")
         evaluate_crew(n_iterations, crew_model, trained_agents_file=trained_agents_file)
 
 
-def _read_config_threshold() -> float | None:
-    """Read test_threshold from config.json if it exists."""
+def _read_config(*keys: str) -> Any:
+    """Read a nested value from config.json (JSONC-safe).
+
+    Example: _read_config("test", "threshold") reads config["test"]["threshold"].
+    """
     import json
     from pathlib import Path
 
@@ -562,74 +570,132 @@ def _read_config_threshold() -> float | None:
         clean = re.sub(r"(?<!:)//.*?$", "", raw, flags=re.MULTILINE)
         clean = re.sub(r"/\*.*?\*/", "", clean, flags=re.DOTALL)
         data = json.loads(clean)
-        val = data.get("test_threshold")
-        return float(val) if val is not None else None
+        for k in keys:
+            if not isinstance(data, dict):
+                return None
+            data = data.get(k)
+            if data is None:
+                return None
+        return data
     except Exception:
         return None
 
 
-def _make_benchmark_progress():
-    """Create a progress callback with Rich spinner animation."""
-    import time
+class _BenchmarkLiveProgress:
+    """Live parallel progress display for benchmark runs."""
 
-    from rich.console import Console
-    from rich.spinner import Spinner
-    from rich.live import Live
+    def __init__(self, console=None):
+        from rich.console import Console
+        self._console = console or Console()
+        self._state: dict[str, dict] = {}
+        self._live = None
 
-    console = Console()
-    state: dict = {"live": None}
+    def start(self):
+        from rich.live import Live
+        self._live = Live(
+            self._render(),
+            console=self._console,
+            refresh_per_second=10,
+            transient=False,
+        )
+        self._live.start()
 
-    def _stop_live():
-        if state["live"]:
-            state["live"].stop()
-            state["live"] = None
+    def stop(self):
+        if self._live:
+            self._live.update(self._render())
+            self._live.stop()
+            self._live = None
 
-    def progress(event: dict) -> None:
+    def on_progress(self, event: dict) -> None:
         t = event["type"]
+        model = event.get("model", "")
 
         if t == "model_start":
-            _stop_live()
-            label = event["model"]
-            if event["total_models"] > 1:
-                label = f"\\[{event['model_index'] + 1}/{event['total_models']}] {label}"
-            console.print(f"\n[bold cyan]▶ {label}[/]  [dim]({event['total_cases']} cases)[/]")
-
+            self._state[model] = {
+                "done": 0, "total": event["total_cases"],
+                "status": "starting", "passed": 0,
+                "avg": 0.0, "time": 0.0,
+                "in_tokens": 0, "out_tokens": 0, "cost": None,
+            }
         elif t == "case_start":
-            _stop_live()
-            idx = event["case_index"] + 1
-            total = event["total_cases"]
-            snippet = event["input"][:60] + ("…" if len(event["input"]) > 60 else "")
-            console.print(f"  [dim]\\[{idx}/{total}][/] {snippet}")
-            state["live"] = Live(
-                Spinner("dots", text="  running…", style="cyan"),
-                console=console,
-                transient=True,
-            )
-            state["live"].start()
-
+            self._state[model]["status"] = "running"
         elif t == "judging":
-            if state["live"]:
-                state["live"].update(
-                    Spinner("dots", text="  judging…", style="cyan")
-                )
-
+            self._state[model]["status"] = "judging"
         elif t == "case_done":
-            _stop_live()
-            elapsed_s = event["time_ms"] / 1000
-            if event.get("error"):
-                console.print(f"    [red]✗ ERROR[/red]  ({elapsed_s:.1f}s)")
-            elif event["passed"]:
-                console.print(f"    [green]✓ PASS[/green]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
-            else:
-                console.print(f"    [red]✗ FAIL[/red]  score={event['score']:.2f}  ({elapsed_s:.1f}s)")
-
+            s = self._state[model]
+            s["done"] = max(s["done"], event["case_index"])
+            if event.get("passed"):
+                s["passed"] += 1
+            s["status"] = "running"
         elif t == "model_done":
-            _stop_live()
-            p, tot, avg = event["passed"], event["total"], event["avg_score"]
-            color = "green" if p == tot else ("yellow" if p > 0 else "red")
-            console.print(f"  [{color}]── {p}/{tot} passed · avg score {avg:.2f}[/{color}]")
+            s = self._state[model]
+            s["status"] = "done"
+            s["passed"] = event.get("passed", s["passed"])
+            s["done"] = event.get("total", s["done"])
+            s["avg"] = event["avg_score"]
+            s["time"] = event.get("total_time", 0.0)
+            s["in_tokens"] = event.get("input_tokens", 0)
+            s["out_tokens"] = event.get("output_tokens", 0)
+            s["cost"] = event.get("total_cost")
 
-    return progress
+        if self._live:
+            self._live.update(self._render())
+
+    def _render(self):
+        from rich.table import Table
+        from rich.spinner import Spinner
+        from rich.text import Text
+        from rich import box
+
+        from crewai_cli.benchmark import _score_color, _fmt_tokens, _fmt_cost
+
+        has_cost = any(
+            info.get("cost") is not None
+            for info in self._state.values()
+            if info["status"] == "done"
+        )
+        n_cols = 7 if has_cost else 6
+
+        table = Table(box=box.SIMPLE, show_header=False, padding=(0, 1), expand=False)
+        table.add_column("", width=1)                              # icon
+        table.add_column("", no_wrap=True)                         # model
+        table.add_column("", no_wrap=True, justify="right")        # passed or bar
+        table.add_column("", no_wrap=True, justify="right")        # score
+        table.add_column("", no_wrap=True, justify="right")        # time
+        table.add_column("", no_wrap=True, justify="right")        # tokens
+        if has_cost:
+            table.add_column("", no_wrap=True, justify="right")    # cost
+
+        for model, info in self._state.items():
+            if info["status"] == "done":
+                icon = Text("✓", style="green")
+                color = _score_color(info["avg"])
+                cols = [
+                    icon,
+                    model,
+                    Text.from_markup(f"[{color}]{info['passed']}/{info['total']}[/{color}]"),
+                    Text.from_markup(f"[{color}]{info['avg']:.2f}[/{color}]"),
+                    Text(f"{info['time']:.1f}s", style="dim"),
+                    Text(f"↑{_fmt_tokens(info['in_tokens'])} ↓{_fmt_tokens(info['out_tokens'])}", style="dim"),
+                ]
+                if has_cost:
+                    if info["cost"] is not None:
+                        cols.append(Text(_fmt_cost(info["cost"]), style="dim"))
+                    else:
+                        cols.append(Text(""))
+            else:
+                bar_w = 10
+                pct = info["done"] / info["total"] if info["total"] > 0 else 0
+                filled = round(pct * bar_w)
+                icon = Spinner("dots", style="cyan")
+                progress = Text.from_markup(
+                    f"[cyan]{'█' * filled}{'░' * (bar_w - filled)}[/cyan] {info['done']}/{info['total']}"
+                )
+                cols = [icon, model, progress] + [Text("")] * (n_cols - 3)
+
+            table.add_row(*cols)
+
+        return table
 
 
 def _test_new_agents(
@@ -639,7 +705,7 @@ def _test_new_agents(
     threshold: float,
     judge_model: str,
 ) -> None:
-    """Run NewAgent test cases with pass/fail threshold."""
+    """Run NewAgent test cases with pass/fail threshold (all agents in parallel)."""
     import asyncio
     from pathlib import Path
 
@@ -647,7 +713,6 @@ def _test_new_agents(
 
     from crewai_cli.benchmark import (
         load_benchmark_cases,
-        print_results_chart,
         run_benchmark,
     )
 
@@ -655,9 +720,9 @@ def _test_new_agents(
     tests_dir = Path("tests")
     if not tests_dir.is_dir() and Path("benchmarks").is_dir():
         tests_dir = Path("benchmarks")
-    all_passed = True
-    agents_tested = 0
 
+    # Collect valid agents + cases
+    jobs: list[dict] = []
     for agent_path in agent_files:
         agent_name = agent_path.stem
         cases_path = tests_dir / f"{agent_name}_cases.json"
@@ -670,52 +735,91 @@ def _test_new_agents(
             loaded = load_benchmark_cases(cases_path)
         except (FileNotFoundError, ValueError) as e:
             click.secho(f"  Error loading cases for {agent_name}: {e}", fg="red")
-            all_passed = False
             continue
 
         file_threshold = loaded.threshold if loaded.threshold is not None else threshold
+        jobs.append({
+            "agent_name": agent_name,
+            "agent_path": str(agent_path.resolve()),
+            "cases": loaded.cases,
+            "threshold": file_threshold,
+        })
 
-        model_list = [model] if model else None
+    if not jobs:
+        click.secho("No agents with matching benchmark cases found.", fg="yellow")
+        raise SystemExit(1)
 
-        click.echo()
-        click.secho(f"Testing {agent_name} ({len(loaded)} cases, threshold={file_threshold})", fg="cyan", bold=True)
+    model_list = [model] if model else None
 
-        try:
-            results_by_model = asyncio.run(
+    # Progress display — prefix model key with agent name
+    progress = _BenchmarkLiveProgress(console=_con)
+
+    def _make_progress_cb(agent_name: str):
+        def _cb(event: dict) -> None:
+            prefixed = dict(event)
+            if "model" in prefixed:
+                prefixed["model"] = f"{agent_name}/{prefixed['model']}"
+            progress.on_progress(prefixed)
+        return _cb
+
+    async def _run_all():
+        tasks = []
+        for job in jobs:
+            tasks.append(
                 run_benchmark(
-                    agent_def=str(agent_path),
-                    cases=loaded.cases,
+                    agent_def=job["agent_path"],
+                    cases=job["cases"],
                     models=model_list,
                     judge_model=judge_model,
-                    on_progress=_make_benchmark_progress(),
+                    on_progress=_make_progress_cb(job["agent_name"]),
                 )
             )
-        except Exception as e:
-            click.secho(f"  Error running tests for {agent_name}: {e}", fg="red")
+        return await asyncio.gather(*tasks, return_exceptions=True)
+
+    agent_count = sum(1 for j in jobs for _ in (model_list or [None]))
+    case_count = sum(len(j["cases"]) for j in jobs)
+    click.echo()
+    click.secho(
+        f"Testing {len(jobs)} agent(s), {case_count} cases (threshold={threshold})",
+        fg="cyan", bold=True,
+    )
+
+    from crewai_cli.benchmark import artifacts_sandbox, suppress_benchmark_output
+
+    progress.start()
+    try:
+        with artifacts_sandbox(), suppress_benchmark_output():
+            all_results = asyncio.run(_run_all())
+    finally:
+        progress.stop()
+
+    # Evaluate results
+    all_passed = True
+    agents_tested = 0
+    for job, result in zip(jobs, all_results):
+        if isinstance(result, Exception):
+            click.secho(f"  Error running tests for {job['agent_name']}: {result}", fg="red")
             all_passed = False
             continue
 
         agents_tested += 1
-
-        for model_name, results in results_by_model.items():
-            _con.print()
-            print_results_chart(results, console=_con)
-
-            failed = [r for r in results if r.score < file_threshold]
+        for model_name, results in result.items():
+            failed = [r for r in results if r.score < job["threshold"]]
             if failed:
                 all_passed = False
                 _con.print(
-                    f"\n  [red bold]FAILED: {len(failed)}/{len(results)} "
-                    f"cases below threshold ({file_threshold})[/red bold]"
+                    f"  [red bold]{job['agent_name']}: FAILED {len(failed)}/{len(results)} "
+                    f"cases below threshold ({job['threshold']})[/red bold]"
                 )
+                for r in failed:
+                    inp = r.input[:60] + ("…" if len(r.input) > 60 else "")
+                    _con.print(f"    [red]#{r.case_index + 1}[/red] [dim]{inp}[/dim]  [red]{r.score:.2f}[/red]")
             else:
                 _con.print(
-                    f"\n  [green bold]PASSED: all {len(results)} cases >= {file_threshold}[/green bold]"
+                    f"  [green bold]{job['agent_name']}: PASSED all {len(results)} cases >= {job['threshold']}[/green bold]"
                 )
-
-    click.echo()
     if agents_tested == 0:
-        click.secho("No agents with matching benchmark cases found.", fg="yellow")
+        click.secho("No agents completed successfully.", fg="yellow")
         raise SystemExit(1)
     elif all_passed:
         click.secho(f"All tests passed ({agents_tested} agent(s)).", fg="green", bold=True)
@@ -1456,20 +1560,23 @@ def checkpoint_prune(
 )
 @click.option(
     "--judge-model",
-    default="openai/gpt-4o-mini",
-    help="Model for LLM judge evaluation",
+    default=None,
+    help="Model for LLM judge evaluation. "
+    "Defaults to test.judge_model in config.json (openai/gpt-4o-mini if not set).",
 )
 def benchmark(
     agent_path: str,
     cases_path: str,
     models: tuple[str, ...],
-    judge_model: str,
+    judge_model: str | None,
 ) -> None:
     """Run agent against test cases and report results."""
     import asyncio
 
     from crewai_cli.run_crew import _needs_uv_relaunch, _relaunch_via_uv
 
+    judge_model = judge_model or _read_config("test", "judge_model") or "openai/gpt-4o-mini"
+
     if _needs_uv_relaunch():
         uv_args = ["benchmark", agent_path, cases_path, "--judge-model", judge_model]
         for m in models:
@@ -1481,12 +1588,15 @@ def benchmark(
     from crewai_cli.benchmark import (
         load_benchmark_cases,
         print_comparison_chart,
-        print_results_chart,
         run_benchmark,
     )
 
     _con = _RichConsole()
 
+    from pathlib import Path as _P
+    agent_path = str(_P(agent_path).resolve())
+    cases_path = str(_P(cases_path).resolve())
+
     try:
         cases = load_benchmark_cases(cases_path)
     except (FileNotFoundError, ValueError) as e:
@@ -1502,23 +1612,26 @@ def benchmark(
     click.echo(f"Judge model: {judge_model}")
     click.echo()
 
+    from crewai_cli.benchmark import artifacts_sandbox, suppress_benchmark_output
+
+    progress = _BenchmarkLiveProgress(console=_con)
+    progress.start()
     try:
-        results_by_model = asyncio.run(
-            run_benchmark(
-                agent_def=agent_path,
-                cases=cases,
-                models=model_list,
-                judge_model=judge_model,
-                on_progress=_make_benchmark_progress(),
+        with artifacts_sandbox(), suppress_benchmark_output():
+            results_by_model = asyncio.run(
+                run_benchmark(
+                    agent_def=agent_path,
+                    cases=cases,
+                    models=model_list,
+                    judge_model=judge_model,
+                    on_progress=progress.on_progress,
+                )
             )
-        )
     except Exception as e:
         click.secho(f"Error running benchmark: {e}", fg="red")
         raise SystemExit(1) from e
-
-    for model, results in results_by_model.items():
-        _con.print()
-        print_results_chart(results, console=_con)
+    finally:
+        progress.stop()
 
     if len(results_by_model) > 1:
         _con.print()
diff --git a/lib/cli/src/crewai_cli/create_agent.py b/lib/cli/src/crewai_cli/create_agent.py
index 1fa776fde..6f2b6dfe3 100644
--- a/lib/cli/src/crewai_cli/create_agent.py
+++ b/lib/cli/src/crewai_cli/create_agent.py
@@ -122,9 +122,15 @@ PROJECT_CONFIG_TEMPLATE = """\
 {
   // Project configuration for crewai agents
 
-  // Minimum score (0.0–1.0) for a test case to pass.
-  // Override per test file with: {"threshold": 0.9, "cases": [...]}
-  "test_threshold": 0.7,
+  // Test / benchmark settings
+  "test": {
+    // Minimum score (0.0–1.0) for a test case to pass.
+    // Override per test file with: {"threshold": 0.9, "cases": [...]}
+    "threshold": 0.7,
+
+    // LLM used to judge test responses (provider/model format)
+    "judge_model": "openai/gpt-4o-mini"
+  },
 
   // Rooms define how agents collaborate in the TUI
   "rooms": {
@@ -133,10 +139,10 @@ PROJECT_CONFIG_TEMPLATE = """\
       "agents": [],
 
       // Engagement mode:
-      //   "dm" — chat with one agent at a time (default)
+      //   "organic" — all agents see messages, respond if relevant (default)
+      //   "dm" — chat with one agent at a time
       //   "tagged" — @mention to direct messages
-      //   "organic" — all agents see messages, respond if relevant
-      "engagement": "dm"
+      "engagement": "organic"
     }
   }
 }
@@ -182,6 +188,7 @@ _GITIGNORE_TEMPLATE = """\
 __pycache__/
 .DS_Store
 .crewai/
+tests/artifacts/
 """
 
 
@@ -807,7 +814,7 @@ def _add_agent_to_config(base: Path, agent_name: str) -> None:
         config = json.loads(clean)
 
         rooms = config.get("rooms", {})
-        common = rooms.get("common", {"agents": [], "engagement": "dm"})
+        common = rooms.get("common", {"agents": [], "engagement": "organic"})
         agents = common.get("agents", [])
         if agent_name not in agents:
             agents.append(agent_name)
diff --git a/lib/crewai/src/crewai/new_agent/definition_parser.py b/lib/crewai/src/crewai/new_agent/definition_parser.py
index 69c74830d..84617d9c8 100644
--- a/lib/crewai/src/crewai/new_agent/definition_parser.py
+++ b/lib/crewai/src/crewai/new_agent/definition_parser.py
@@ -98,8 +98,8 @@ def load_agent_from_definition(
 
     agent_name = defn.get("name", "")
     if agent_name and agent_name in _loading_chain:
-        logger.warning(
-            "Circular coworker reference for '%s' — skipping to prevent infinite recursion",
+        logger.debug(
+            "Skipping coworker back-reference '%s' (already in loading chain)",
             agent_name,
         )
         return None
@@ -249,8 +249,8 @@ def _resolve_coworkers(
         elif "ref" in cw:
             ref_name = cw["ref"]
             if _loading_chain and ref_name in _loading_chain:
-                logger.warning(
-                    "Circular coworker ref '%s' — skipping to prevent infinite recursion",
+                logger.debug(
+                    "Skipping coworker back-reference '%s' (already in loading chain)",
                     ref_name,
                 )
                 continue
@@ -258,7 +258,9 @@ def _resolve_coworkers(
                 for ext in (".json", ".jsonc"):
                     ref_path = agents_dir / f"{ref_name}{ext}"
                     if ref_path.exists():
-                        result = load_agent_from_definition(ref_path, agents_dir, _loading_chain)
+                        result = load_agent_from_definition(
+                            ref_path, agents_dir, set(_loading_chain) if _loading_chain else None
+                        )
                         if result is not None:
                             coworkers.append(result)
                         break
diff --git a/lib/crewai/src/crewai/new_agent/executor.py b/lib/crewai/src/crewai/new_agent/executor.py
index b0413d254..47c46b4ef 100644
--- a/lib/crewai/src/crewai/new_agent/executor.py
+++ b/lib/crewai/src/crewai/new_agent/executor.py
@@ -483,12 +483,22 @@ class ConversationalAgentExecutor(BaseModel):
     def _estimate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float | None:
         """Approximate cost in USD based on common model pricing per 1M tokens."""
         costs = {
-            "gpt-4o": (2.50, 10.00),
+            "gpt-4.1-nano": (0.10, 0.40),
+            "gpt-4.1-mini": (0.40, 1.60),
+            "gpt-4.1": (2.00, 8.00),
             "gpt-4o-mini": (0.15, 0.60),
+            "gpt-4o": (2.50, 10.00),
             "gpt-4": (30.00, 60.00),
+            "gpt-5-mini": (0.25, 2.00),
+            "gpt-5.5": (5.00, 30.00),
+            "gpt-5.4": (2.50, 15.00),
+            "gpt-5": (1.25, 10.00),
+            "o4-mini": (1.10, 4.40),
+            "o3-mini": (1.10, 4.40),
+            "o3": (2.00, 8.00),
+            "claude-opus": (5.00, 25.00),
             "claude-sonnet": (3.00, 15.00),
-            "claude-haiku": (0.25, 1.25),
-            "claude-opus": (15.00, 75.00),
+            "claude-haiku": (1.00, 5.00),
         }
         for key, (inp_cost, out_cost) in costs.items():
             if key in model.lower():
@@ -719,9 +729,9 @@ class ConversationalAgentExecutor(BaseModel):
 
         ctx_size = llm.get_context_window_size()
         total_chars = sum(len(str(m.get("content", ""))) for m in llm_messages)
-        est_tokens = total_chars // 4
+        est_tokens = total_chars // 3
 
-        if est_tokens < int(ctx_size * 0.75):
+        if est_tokens < int(ctx_size * 0.60):
             return
 
         try:
@@ -1486,6 +1496,8 @@ class ConversationalAgentExecutor(BaseModel):
                     response_text = tool_result
                     break
 
+                self._proactive_summarize_messages(llm_messages, callbacks)
+
                 # GAP-21: Call step_callback at each iteration boundary
                 if self.agent.step_callback:
                     self.agent.step_callback(iterations, self._tools_used_this_turn, response_text)
diff --git a/lib/crewai/src/crewai/utilities/exceptions/context_window_exceeding_exception.py b/lib/crewai/src/crewai/utilities/exceptions/context_window_exceeding_exception.py
index 9e44ce6f4..2cc6d63e1 100644
--- a/lib/crewai/src/crewai/utilities/exceptions/context_window_exceeding_exception.py
+++ b/lib/crewai/src/crewai/utilities/exceptions/context_window_exceeding_exception.py
@@ -9,6 +9,7 @@ CONTEXT_LIMIT_ERRORS: Final[list[str]] = [
     "context window full",
     "too many tokens",
     "input is too long",
+    "prompt is too long",
     "exceeds token limit",
 ]