Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-09 08:08:32 +00:00
Brandon/cre 252 add agent to crewai test (#1308)
* Update config typecheck to accept agents
* Clean up prints
* Adding agents to crew evaluator output table
* Properly generating table now
* Update tests
Committed by: GitHub
Parent: cdaf2d41c7
Commit: 26d9af8367
@@ -584,7 +584,10 @@ class Crew(BaseModel):
             self.manager_agent.allow_delegation = True
             manager = self.manager_agent
             if manager.tools is not None and len(manager.tools) > 0:
-                raise Exception("Manager agent should not have tools")
+                self._logger.log(
+                    "warning", "Manager agent should not have tools", color="orange"
+                )
+                manager.tools = []
             manager.tools = self.manager_agent.get_delegation_tools(self.agents)
         else:
             manager = Agent(
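In short, a hierarchical crew whose manager agent arrives with tools no longer fails hard: the tools are discarded with a warning and replaced by delegation tools built from the crew's agents. A minimal behavioral sketch, not library code; the helper name and its parameters are invented for illustration:

    # Sketch of the new branch, assuming objects with the attributes used in the diff.
    def prepare_manager(manager, crew_agents, logger):
        manager.allow_delegation = True
        if manager.tools is not None and len(manager.tools) > 0:
            # Previously this raised Exception("Manager agent should not have tools").
            logger.log("warning", "Manager agent should not have tools", color="orange")
            manager.tools = []
        manager.tools = manager.get_delegation_tools(crew_agents)
        return manager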
@@ -103,7 +103,8 @@ def crew(func):
         for task_name in sorted_task_names:
             task_instance = tasks[task_name]()
             instantiated_tasks.append(task_instance)
-            if hasattr(task_instance, "agent"):
+            agent_instance = getattr(task_instance, "agent", None)
+            if agent_instance is not None:
                 agent_instance = task_instance.agent
                 if agent_instance.role not in agent_roles:
                     instantiated_agents.append(agent_instance)
@@ -6,7 +6,7 @@ import uuid
 from concurrent.futures import Future
 from copy import copy
 from hashlib import md5
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
 from opentelemetry.trace import Span
 from pydantic import (
@@ -108,6 +108,7 @@ class Task(BaseModel):
         description="A converter class used to export structured output",
         default=None,
     )
+    processed_by_agents: Set[str] = Field(default_factory=set)
 
     _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
     _execution_span: Optional[Span] = PrivateAttr(default=None)
@@ -241,6 +242,8 @@ class Task(BaseModel):
         self.prompt_context = context
         tools = tools or self.tools or []
 
+        self.processed_by_agents.add(agent.role)
+
         result = agent.execute_task(
             task=self,
             context=context,
@@ -273,9 +276,7 @@ class Task(BaseModel):
             content = (
                 json_output
                 if json_output
-                else pydantic_output.model_dump_json()
-                if pydantic_output
-                else result
+                else pydantic_output.model_dump_json() if pydantic_output else result
             )
             self._save_file(content)
 
@@ -310,8 +311,10 @@ class Task(BaseModel):
         """Increment the tools errors counter."""
         self.tools_errors += 1
 
-    def increment_delegations(self) -> None:
+    def increment_delegations(self, agent_name: Optional[str]) -> None:
         """Increment the delegations counter."""
+        if agent_name:
+            self.processed_by_agents.add(agent_name)
         self.delegations += 1
 
     def copy(self, agents: List["BaseAgent"]) -> "Task":
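Taken together, the Task changes mean a task now records every agent role that worked on it: the executing agent is added during execution, and any coworker that receives a delegation is recorded through the new increment_delegations signature. A minimal sketch of that bookkeeping, using a stand-in class rather than crewai.Task:

    from typing import Optional, Set

    class TaskBookkeepingSketch:
        """Stand-in for the two Task fields touched by this commit."""

        def __init__(self) -> None:
            self.processed_by_agents: Set[str] = set()
            self.delegations: int = 0

        def increment_delegations(self, agent_name: Optional[str]) -> None:
            # Record the coworker that received the delegation, when known.
            if agent_name:
                self.processed_by_agents.add(agent_name)
            self.delegations += 1

    task = TaskBookkeepingSketch()
    task.processed_by_agents.add("AI LLMs Senior Researcher")  # executing agent
    task.increment_delegations("AI LLMs Reporting Analyst")    # delegated coworker
    task.increment_delegations(None)                           # coworker unknown: count only
    assert task.delegations == 2
    assert task.processed_by_agents == {"AI LLMs Senior Researcher", "AI LLMs Reporting Analyst"}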
@@ -8,6 +8,7 @@ from langchain_core.tools import BaseTool
 from langchain_openai import ChatOpenAI
 
 from crewai.agents.tools_handler import ToolsHandler
+from crewai.task import Task
 from crewai.telemetry import Telemetry
 from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
 from crewai.utilities import I18N, Converter, ConverterError, Printer
@@ -51,7 +52,7 @@ class ToolUsage:
         original_tools: List[Any],
         tools_description: str,
         tools_names: str,
-        task: Any,
+        task: Task,
         function_calling_llm: Any,
         agent: Any,
         action: Any,
@@ -154,7 +155,10 @@ class ToolUsage:
             "Delegate work to coworker",
             "Ask question to coworker",
         ]:
-            self.task.increment_delegations()
+            coworker = (
+                calling.arguments.get("coworker") if calling.arguments else None
+            )
+            self.task.increment_delegations(coworker)
 
         if calling.arguments:
             try:
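For context, the tools matched above are the delegation tools, whose call arguments may or may not carry a coworker name; the exact argument shape below is an assumption, but the guarded lookup behaves like this:

    # calling.arguments may be None or a dict; only a present "coworker" key is recorded.
    calling_arguments = {"task": "Summarize findings", "coworker": "AI LLMs Reporting Analyst"}
    coworker = calling_arguments.get("coworker") if calling_arguments else None
    assert coworker == "AI LLMs Reporting Analyst"

    calling_arguments = None
    coworker = calling_arguments.get("coworker") if calling_arguments else None
    assert coworker is None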
@@ -241,7 +245,7 @@ class ToolUsage:
             result = self._remember_format(result=result)  # type: ignore # "_remember_format" of "ToolUsage" does not return a value (it only ever returns None)
         return result
 
-    def _should_remember_format(self) -> None:
+    def _should_remember_format(self) -> bool:
         return self.task.used_tools % self._remember_format_after_usages == 0
 
     def _remember_format(self, result: str) -> None:
@@ -353,10 +357,10 @@ class ToolUsage:
                 return ToolUsageErrorException(  # type: ignore # Incompatible return value type (got "ToolUsageErrorException", expected "ToolCalling | InstructorToolCalling")
                     f'{self._i18n.errors("tool_arguments_error")}'
                 )
-            calling = ToolCalling(  # type: ignore # Unexpected keyword argument "log" for "ToolCalling"
+            calling = ToolCalling(
                 tool_name=tool.name,
                 arguments=arguments,
-                log=tool_string,
+                log=tool_string,  # type: ignore
             )
         except Exception as e:
             self._run_attempts += 1
@@ -404,19 +408,19 @@ class ToolUsage:
                         '"' + value.replace('"', '\\"') + '"'
                     )  # Re-encapsulate with double quotes
                 elif value.isdigit():  # Check if value is a digit, hence integer
-                    formatted_value = value
+                    value = value
                 elif value.lower() in [
                     "true",
                     "false",
                     "null",
                 ]:  # Check for boolean and null values
-                    formatted_value = value.lower()
+                    value = value.lower()
                 else:
                     # Assume the value is a string and needs quotes
-                    formatted_value = '"' + value.replace('"', '\\"') + '"'
+                    value = '"' + value.replace('"', '\\"') + '"'
 
                 # Rebuild the entry with proper quoting
-                formatted_entry = f'"{key}": {formatted_value}'
+                formatted_entry = f'"{key}": {value}'
                 formatted_entries.append(formatted_entry)
 
             # Reconstruct the JSON string
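The renamed variable does not change the repair rules this block applies to each value of a malformed tool-input string: digits pass through, JSON literals are lowercased, and everything else is re-quoted. A standalone sketch of just those rules (the surrounding key/entry parsing is simplified away):

    def quote_value(value: str) -> str:
        value = value.strip()
        if value.isdigit():  # integers pass through unquoted
            return value
        if value.lower() in ["true", "false", "null"]:  # JSON literals, lowercased
            return value.lower()
        # anything else is treated as a string and re-encapsulated in double quotes
        return '"' + value.replace('"', '\\"') + '"'

    assert quote_value("42") == "42"
    assert quote_value("True") == "true"
    assert quote_value("New York") == '"New York"'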
@@ -23,17 +23,16 @@ def process_config(
     # Copy values from config (originally from YAML) to the model's attributes.
     # Only copy if the attribute isn't already set, preserving any explicitly defined values.
     for key, value in config.items():
-        if key not in model_class.model_fields:
+        if key not in model_class.model_fields or values.get(key) is not None:
             continue
-        if values.get(key) is not None:
-            continue
-        if isinstance(value, (str, int, float, bool, list)):
-            values[key] = value
-        elif isinstance(value, dict):
+
+        if isinstance(value, dict):
             if isinstance(values.get(key), dict):
                 values[key].update(value)
             else:
                 values[key] = value
+        else:
+            values[key] = value
 
     # Remove the config from values to avoid duplicate processing
     values.pop("config", None)
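The practical effect, and the "Update config typecheck to accept agents" bullet in the commit message, is that the final else branch now copies a config value of any type, including an agent object, where the old isinstance check only accepted primitives, lists, and dicts. A rough, self-contained approximation of the updated logic; the model_fields set and field names below are invented for the example:

    def merge_config(config: dict, values: dict, model_fields: set) -> dict:
        for key, value in config.items():
            # Skip unknown fields and anything the caller already set explicitly.
            if key not in model_fields or values.get(key) is not None:
                continue
            if isinstance(value, dict):
                if isinstance(values.get(key), dict):
                    values[key].update(value)
                else:
                    values[key] = value
            else:
                values[key] = value  # now also covers arbitrary objects such as agents
        values.pop("config", None)
        return values

    class AgentStub:
        """Stands in for a crewai Agent; any non-primitive object works here."""

    values = {"role": "Researcher"}
    config = {"role": "Ignored", "goal": "Find sources", "agent": AgentStub()}
    merged = merge_config(config, values, model_fields={"role", "goal", "agent"})
    assert merged["role"] == "Researcher"    # explicit value wins over the config
    assert merged["goal"] == "Find sources"  # copied from the config
    assert "agent" in merged                 # previously dropped by the isinstance check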
@@ -1,14 +1,14 @@
 from collections import defaultdict
 
-from langchain_openai import ChatOpenAI
-from pydantic import BaseModel, Field
-from rich.console import Console
-from rich.table import Table
-
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+from rich.box import HEAVY_EDGE
+from rich.console import Console
+from rich.table import Table
 
 
 class TaskEvaluationPydanticOutput(BaseModel):
@@ -77,50 +77,72 @@ class CrewEvaluator:
     def print_crew_evaluation_result(self) -> None:
         """
         Prints the evaluation result of the crew in a table.
-        A Crew with 2 tasks using the command crewai test -n 2
+        A Crew with 2 tasks using the command crewai test -n 3
         will output the following table:
 
-                     Task Scores
+                    Tasks Scores
                 (1-10 Higher is better)
-        ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
-        ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
-        ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
-        │ Task 1     │ 10.0  │ 9.0   │ 9.5        │
-        │ Task 2     │ 9.0   │ 9.0   │ 9.0        │
-        │ Crew       │ 9.5   │ 9.0   │ 9.2        │
-        └────────────┴───────┴───────┴────────────┘
+        ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ Tasks/Crew/Agents  ┃ Run 1 ┃ Run 2 ┃ Run 3 ┃ Avg. Total ┃ Agents                      ┃
+        ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+        │ Task 1             │ 9.0   │ 10.0  │ 9.0   │ 9.3        │ - AI LLMs Senior Researcher │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst │
+        │                    │       │       │       │            │                             │
+        │ Task 2             │ 9.0   │ 9.0   │ 9.0   │ 9.0        │ - AI LLMs Senior Researcher │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst │
+        │                    │       │       │       │            │                             │
+        │ Crew               │ 9.0   │ 9.5   │ 9.0   │ 9.2        │                             │
+        │ Execution Time (s) │ 42    │ 79    │ 52    │ 57         │                             │
+        └────────────────────┴───────┴───────┴───────┴────────────┴─────────────────────────────┘
         """
         task_averages = [
             sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
         ]
         crew_average = sum(task_averages) / len(task_averages)
 
         # Create a table
-        table = Table(title="Tasks Scores \n (1-10 Higher is better)")
+        table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
 
         # Add columns for the table
-        table.add_column("Tasks/Crew")
+        table.add_column("Tasks/Crew/Agents", style="cyan")
         for run in range(1, len(self.tasks_scores) + 1):
-            table.add_column(f"Run {run}")
-        table.add_column("Avg. Total")
+            table.add_column(f"Run {run}", justify="center")
+        table.add_column("Avg. Total", justify="center")
+        table.add_column("Agents", style="green")
 
         # Add rows for each task
-        for task_index in range(len(task_averages)):
+        for task_index, task in enumerate(self.crew.tasks):
             task_scores = [
                 self.tasks_scores[run][task_index]
                 for run in range(1, len(self.tasks_scores) + 1)
             ]
             avg_score = task_averages[task_index]
+            agents = list(task.processed_by_agents)
+
+            # Add the task row with the first agent
             table.add_row(
-                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
+                f"Task {task_index + 1}",
+                *[f"{score:.1f}" for score in task_scores],
+                f"{avg_score:.1f}",
+                f"- {agents[0]}" if agents else "",
             )
 
-        # Add a row for the crew average
+            # Add rows for additional agents
+            for agent in agents[1:]:
+                table.add_row("", "", "", "", "", f"- {agent}")
+
+            # Add a blank separator row if it's not the last task
+            if task_index < len(self.crew.tasks) - 1:
+                table.add_row("", "", "", "", "", "")
+
+        # Add Crew and Execution Time rows
         crew_scores = [
             sum(self.tasks_scores[run]) / len(self.tasks_scores[run])
             for run in range(1, len(self.tasks_scores) + 1)
         ]
-        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
+        table.add_row(
+            "Crew",
+            *[f"{score:.2f}" for score in crew_scores],
+            f"{crew_average:.1f}",
+            "",
+        )
 
         run_exec_times = [
             int(sum(tasks_exec_times))
@@ -128,11 +150,9 @@ class CrewEvaluator:
         ]
         execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
         table.add_row(
-            "Execution Time (s)",
-            *map(str, run_exec_times),
-            f"{execution_time_avg}",
+            "Execution Time (s)", *map(str, run_exec_times), f"{execution_time_avg}", ""
         )
-        # Display the table in the terminal
+
         console = Console()
         console.print(table)
 
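The averaging itself is unchanged by this commit; only the cell formatting and the Agents column are new. Reproducing the numbers from the docstring example (2 tasks, 3 runs) shows where each cell comes from; the dict below mirrors the run -> per-task shape of self.tasks_scores:

    tasks_scores = {1: [9.0, 9.0], 2: [10.0, 9.0], 3: [9.0, 9.0]}  # run -> [task 1, task 2]

    task_averages = [sum(s) / len(s) for s in zip(*tasks_scores.values())]
    crew_scores = [sum(tasks_scores[r]) / len(tasks_scores[r]) for r in sorted(tasks_scores)]
    crew_average = sum(task_averages) / len(task_averages)

    assert [f"{a:.1f}" for a in task_averages] == ["9.3", "9.0"]          # Avg. Total column
    assert [f"{s:.2f}" for s in crew_scores] == ["9.00", "9.50", "9.00"]  # per-run crew scores
    assert f"{crew_average:.1f}" == "9.2"                                 # Crew row average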
@@ -1,7 +1,6 @@
 from unittest import mock
 
 import pytest
-
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.task import Task
@@ -80,6 +79,7 @@ class TestCrewEvaluator:
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
     def test_print_crew_evaluation_result(self, table, console, crew_planner):
+        # Set up task scores and execution times
         crew_planner.tasks_scores = {
             1: [10, 9, 8],
             2: [9, 8, 7],
@@ -89,22 +89,45 @@ class TestCrewEvaluator:
             2: [55, 33, 67],
         }
 
+        # Mock agents and assign them to tasks
+        crew_planner.crew.agents = [
+            mock.Mock(role="Agent 1"),
+            mock.Mock(role="Agent 2"),
+        ]
+        crew_planner.crew.tasks = [
+            mock.Mock(
+                agent=crew_planner.crew.agents[0], processed_by_agents=["Agent 1"]
+            ),
+            mock.Mock(
+                agent=crew_planner.crew.agents[1], processed_by_agents=["Agent 2"]
+            ),
+        ]
+
+        # Run the method
         crew_planner.print_crew_evaluation_result()
 
+        # Verify that the table is created with the appropriate structure and rows
         table.assert_has_calls(
             [
-                mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
-                mock.call().add_column("Tasks/Crew"),
-                mock.call().add_column("Run 1"),
-                mock.call().add_column("Run 2"),
-                mock.call().add_column("Avg. Total"),
-                mock.call().add_row("Task 1", "10", "9", "9.5"),
-                mock.call().add_row("Task 2", "9", "8", "8.5"),
-                mock.call().add_row("Task 3", "8", "7", "7.5"),
-                mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
-                mock.call().add_row("Execution Time (s)", "135", "155", "145"),
+                mock.call(
+                    title="Tasks Scores \n (1-10 Higher is better)", box=mock.ANY
+                ),  # Title and styling
+                mock.call().add_column("Tasks/Crew/Agents", style="cyan"),  # Columns
+                mock.call().add_column("Run 1", justify="center"),
+                mock.call().add_column("Run 2", justify="center"),
+                mock.call().add_column("Avg. Total", justify="center"),
+                mock.call().add_column("Agents", style="green"),
+                # Verify rows for tasks with agents
+                mock.call().add_row("Task 1", "10.0", "9.0", "9.5", "- Agent 1"),
+                mock.call().add_row("", "", "", "", "", ""),  # Blank row between tasks
+                mock.call().add_row("Task 2", "9.0", "8.0", "8.5", "- Agent 2"),
+                # Add crew averages and execution times
+                mock.call().add_row("Crew", "9.00", "8.00", "8.5", ""),
+                mock.call().add_row("Execution Time (s)", "135", "155", "145", ""),
             ]
         )
 
+        # Ensure the console prints the table
         console.assert_has_calls([mock.call(), mock.call().print(table())])
 
     def test_evaluate(self, crew_planner):
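The expected cell strings in these assertions follow directly from the tasks_scores fixture; the run 1 execution times sit outside the displayed hunk, so only their sum (135) and the 145 average appear here. A quick sanity check of the visible numbers:

    tasks_scores = {1: [10, 9, 8], 2: [9, 8, 7]}

    task_averages = [sum(s) / len(s) for s in zip(*tasks_scores.values())]
    crew_scores = [sum(tasks_scores[r]) / len(tasks_scores[r]) for r in (1, 2)]
    crew_average = sum(task_averages) / len(task_averages)

    # Per-task averages; the Task 1 and Task 2 rows asserted above use the first two.
    assert [f"{a:.1f}" for a in task_averages] == ["9.5", "8.5", "7.5"]
    assert [f"{s:.2f}" for s in crew_scores] == ["9.00", "8.00"]  # Crew row, runs 1 and 2
    assert f"{crew_average:.1f}" == "8.5"
    assert 55 + 33 + 67 == 155                                    # run 2 execution time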