Brandon/cre 252 add agent to crewai test (#1308)

* Update config typecheck to accept agents

* Clean up prints

* Adding agents to crew evaluator output table

* Properly generating table now

* Update tests
Author: Brandon Hancock (bhancock_ai)
Date: 2024-09-07 02:53:23 -04:00
Committed by: GitHub
Parent: 01124daf55
Commit: a234eb8baa
7 changed files with 115 additions and 62 deletions

View File

@@ -584,7 +584,10 @@ class Crew(BaseModel):
                 self.manager_agent.allow_delegation = True
             manager = self.manager_agent
             if manager.tools is not None and len(manager.tools) > 0:
-                raise Exception("Manager agent should not have tools")
+                self._logger.log(
+                    "warning", "Manager agent should not have tools", color="orange"
+                )
+                manager.tools = []
             manager.tools = self.manager_agent.get_delegation_tools(self.agents)
         else:
             manager = Agent(
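
Note for reviewers: with this hunk, a manager agent that arrives with its own tools no longer aborts crew setup; the tools are logged as a warning, cleared, and then replaced by delegation tools. A minimal standalone sketch of that flow, using Python's logging module and a hypothetical ManagerStub in place of the real crewai Agent and Logger:

    import logging
    from dataclasses import dataclass, field
    from typing import List

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger("crew")


    @dataclass
    class ManagerStub:
        """Hypothetical stand-in for a pre-configured manager agent."""
        tools: List[str] = field(default_factory=list)


    def prepare_manager(manager: ManagerStub, delegation_tools: List[str]) -> ManagerStub:
        # Mirrors the new flow: warn and clear instead of raising an Exception.
        if manager.tools:
            logger.warning("Manager agent should not have tools")
            manager.tools = []
        manager.tools = delegation_tools
        return manager


    manager = prepare_manager(ManagerStub(tools=["web_search"]), ["Delegate work to coworker"])
    print(manager.tools)  # ['Delegate work to coworker']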

View File

@@ -103,7 +103,8 @@ def crew(func):
         for task_name in sorted_task_names:
             task_instance = tasks[task_name]()
             instantiated_tasks.append(task_instance)
-            if hasattr(task_instance, "agent"):
+            agent_instance = getattr(task_instance, "agent", None)
+            if agent_instance is not None:
                 agent_instance = task_instance.agent
                 if agent_instance.role not in agent_roles:
                     instantiated_agents.append(agent_instance)
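
Note for reviewers: the hasattr check was likely replaced because on a task object whose agent field simply defaults to None the attribute always exists, so the old branch could still reach a None agent; getattr with an explicit None guard covers both the missing and the unset case. A small illustration with a hypothetical TaskStub (not the real crewai Task):

    from typing import Optional


    class TaskStub:
        """Hypothetical Task-like object; `agent` exists but may be None."""

        def __init__(self, agent: Optional[object] = None):
            self.agent = agent


    task_without_agent = TaskStub()

    # Old check: the attribute exists (it is just None), so this is True
    # and later code would fail on `None.role`.
    print(hasattr(task_without_agent, "agent"))  # True

    # New check: treat a missing or None agent the same way and skip it.
    agent_instance = getattr(task_without_agent, "agent", None)
    if agent_instance is not None:
        print(agent_instance.role)
    else:
        print("no agent attached to this task")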

View File

@@ -6,7 +6,7 @@ import uuid
 from concurrent.futures import Future
 from copy import copy
 from hashlib import md5
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union

 from opentelemetry.trace import Span
 from pydantic import (
@@ -108,6 +108,7 @@ class Task(BaseModel):
         description="A converter class used to export structured output",
         default=None,
     )
+    processed_by_agents: Set[str] = Field(default_factory=set)

     _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
     _execution_span: Optional[Span] = PrivateAttr(default=None)
@@ -241,6 +242,8 @@ class Task(BaseModel):
             self.prompt_context = context
         tools = tools or self.tools or []

+        self.processed_by_agents.add(agent.role)
+
         result = agent.execute_task(
             task=self,
             context=context,
@@ -273,9 +276,7 @@ class Task(BaseModel):
             content = (
                 json_output
                 if json_output
-                else pydantic_output.model_dump_json()
-                if pydantic_output
-                else result
+                else pydantic_output.model_dump_json() if pydantic_output else result
             )

             self._save_file(content)
@@ -310,8 +311,10 @@ class Task(BaseModel):
         """Increment the tools errors counter."""
         self.tools_errors += 1

-    def increment_delegations(self) -> None:
+    def increment_delegations(self, agent_name: Optional[str]) -> None:
         """Increment the delegations counter."""
+        if agent_name:
+            self.processed_by_agents.add(agent_name)
         self.delegations += 1

     def copy(self, agents: List["BaseAgent"]) -> "Task":
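
Note for reviewers: the new processed_by_agents set is filled from two places, the executing agent's role during task execution and the coworker name passed into increment_delegations. A compact sketch of that bookkeeping with a hypothetical TaskTrackerSketch class (the real fields live on crewai.task.Task):

    from typing import Optional, Set


    class TaskTrackerSketch:
        """Hypothetical mini-version of the new tracking fields on Task."""

        def __init__(self) -> None:
            self.processed_by_agents: Set[str] = set()
            self.delegations: int = 0

        def record_execution(self, agent_role: str) -> None:
            # Mirrors execution: the executing agent's role is recorded.
            self.processed_by_agents.add(agent_role)

        def increment_delegations(self, agent_name: Optional[str]) -> None:
            # Mirrors the new signature: the coworker is recorded only if known.
            if agent_name:
                self.processed_by_agents.add(agent_name)
            self.delegations += 1


    task = TaskTrackerSketch()
    task.record_execution("AI LLMs Senior Researcher")
    task.increment_delegations("AI LLMs Reporting Analyst")
    task.increment_delegations(None)  # coworker unknown: counted, not recorded
    print(sorted(task.processed_by_agents))
    print(task.delegations)  # 2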

View File

@@ -8,6 +8,7 @@ from langchain_core.tools import BaseTool
 from langchain_openai import ChatOpenAI

 from crewai.agents.tools_handler import ToolsHandler
+from crewai.task import Task
 from crewai.telemetry import Telemetry
 from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
 from crewai.utilities import I18N, Converter, ConverterError, Printer
@@ -51,7 +52,7 @@ class ToolUsage:
         original_tools: List[Any],
         tools_description: str,
         tools_names: str,
-        task: Any,
+        task: Task,
         function_calling_llm: Any,
         agent: Any,
         action: Any,
@@ -154,7 +155,10 @@ class ToolUsage:
                 "Delegate work to coworker",
                 "Ask question to coworker",
             ]:
-                self.task.increment_delegations()
+                coworker = (
+                    calling.arguments.get("coworker") if calling.arguments else None
+                )
+                self.task.increment_delegations(coworker)

             if calling.arguments:
                 try:
@@ -241,7 +245,7 @@ class ToolUsage:
             result = self._remember_format(result=result)  # type: ignore # "_remember_format" of "ToolUsage" does not return a value (it only ever returns None)
         return result

-    def _should_remember_format(self) -> None:
+    def _should_remember_format(self) -> bool:
         return self.task.used_tools % self._remember_format_after_usages == 0

     def _remember_format(self, result: str) -> None:
@@ -353,10 +357,10 @@ class ToolUsage:
                 return ToolUsageErrorException(  # type: ignore # Incompatible return value type (got "ToolUsageErrorException", expected "ToolCalling | InstructorToolCalling")
                     f'{self._i18n.errors("tool_arguments_error")}'
                 )
-            calling = ToolCalling(  # type: ignore # Unexpected keyword argument "log" for "ToolCalling"
+            calling = ToolCalling(
                 tool_name=tool.name,
                 arguments=arguments,
-                log=tool_string,
+                log=tool_string,  # type: ignore
             )
         except Exception as e:
             self._run_attempts += 1
@@ -404,19 +408,19 @@ class ToolUsage:
                             '"' + value.replace('"', '\\"') + '"'
                         )  # Re-encapsulate with double quotes
                     elif value.isdigit():  # Check if value is a digit, hence integer
-                        formatted_value = value
+                        value = value
                     elif value.lower() in [
                         "true",
                         "false",
                         "null",
                     ]:  # Check for boolean and null values
-                        formatted_value = value.lower()
+                        value = value.lower()
                     else:
                         # Assume the value is a string and needs quotes
-                        formatted_value = '"' + value.replace('"', '\\"') + '"'
+                        value = '"' + value.replace('"', '\\"') + '"'

                     # Rebuild the entry with proper quoting
-                    formatted_entry = f'"{key}": {formatted_value}'
+                    formatted_entry = f'"{key}": {value}'
                     formatted_entries.append(formatted_entry)

                 # Reconstruct the JSON string
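
Note for reviewers: the delegation branch now pulls the coworker out of the parsed tool arguments before bumping the counter, guarding against missing arguments. A standalone sketch with a hypothetical CallingStub in place of the real ToolCalling model:

    from dataclasses import dataclass
    from typing import Any, Dict, Optional


    @dataclass
    class CallingStub:
        """Hypothetical stand-in for the parsed ToolCalling object."""
        tool_name: str
        arguments: Optional[Dict[str, Any]] = None


    def coworker_from_call(calling: CallingStub) -> Optional[str]:
        # Mirrors the new guard: arguments may be None, and "coworker" may be absent.
        return calling.arguments.get("coworker") if calling.arguments else None


    print(coworker_from_call(CallingStub("Delegate work to coworker",
                                         {"coworker": "Reporting Analyst"})))
    print(coworker_from_call(CallingStub("Ask question to coworker")))  # None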

View File

@@ -23,17 +23,16 @@ def process_config(
     # Copy values from config (originally from YAML) to the model's attributes.
     # Only copy if the attribute isn't already set, preserving any explicitly defined values.
     for key, value in config.items():
-        if key not in model_class.model_fields:
-            continue
-        if values.get(key) is not None:
-            continue
-        if isinstance(value, (str, int, float, bool, list)):
-            values[key] = value
-        elif isinstance(value, dict):
+        if key not in model_class.model_fields or values.get(key) is not None:
+            continue
+
+        if isinstance(value, dict):
             if isinstance(values.get(key), dict):
                 values[key].update(value)
             else:
                 values[key] = value
+        else:
+            values[key] = value

     # Remove the config from values to avoid duplicate processing
     values.pop("config", None)
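
Note for reviewers: per the commit message ("Update config typecheck to accept agents"), the isinstance whitelist is gone, so config values that are not plain scalars or lists (for example agent objects) are now copied too; dicts still go through the merge branch. A standalone sketch of the consolidated loop, using a plain set of field names instead of a real pydantic model_class:

    from typing import Any, Dict, Set


    def merge_config(
        config: Dict[str, Any], values: Dict[str, Any], known_fields: Set[str]
    ) -> Dict[str, Any]:
        """Standalone sketch of the simplified process_config loop."""
        for key, value in config.items():
            # Skip unknown fields and anything already set explicitly.
            if key not in known_fields or values.get(key) is not None:
                continue

            if isinstance(value, dict):
                # Merge into an existing dict, otherwise assign it.
                if isinstance(values.get(key), dict):
                    values[key].update(value)
                else:
                    values[key] = value
            else:
                # Strings, numbers, lists, agent objects, etc. are copied as-is.
                values[key] = value
        return values


    print(merge_config(
        config={"role": "Researcher", "goal": "Summarize", "llm": {"model": "gpt-4o"}, "unknown": 1},
        values={"role": None, "goal": "Keep me", "llm": None},
        known_fields={"role", "goal", "llm"},
    ))
    # {'role': 'Researcher', 'goal': 'Keep me', 'llm': {'model': 'gpt-4o'}}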

View File

@@ -1,14 +1,14 @@
 from collections import defaultdict

-from langchain_openai import ChatOpenAI
-from pydantic import BaseModel, Field
-from rich.console import Console
-from rich.table import Table
-
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+from rich.box import HEAVY_EDGE
+from rich.console import Console
+from rich.table import Table


 class TaskEvaluationPydanticOutput(BaseModel):
@@ -77,50 +77,72 @@ class CrewEvaluator:
     def print_crew_evaluation_result(self) -> None:
         """
         Prints the evaluation result of the crew in a table.
-        A Crew with 2 tasks using the command crewai test -n 2
+        A Crew with 2 tasks using the command crewai test -n 3
         will output the following table:

-                        Task Scores
+                        Tasks Scores
                     (1-10 Higher is better)
-        ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
-        ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
-        ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
-        │ Task 1     │ 10.0  │ 9.0   │ 9.5        │
-        │ Task 2     │ 9.0   │ 9.0   │ 9.0        │
-        │ Crew       │ 9.5   │ 9.0   │ 9.2        │
-        └────────────┴───────┴───────┴────────────┘
+        ┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ Tasks/Crew/Agents  ┃ Run 1 ┃ Run 2 ┃ Run 3 ┃ Avg. Total ┃ Agents                       ┃
+        ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+        │ Task 1             │ 9.0   │ 10.0  │ 9.0   │ 9.3        │ - AI LLMs Senior Researcher  │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
+        │                    │       │       │       │            │                              │
+        │ Task 2             │ 9.0   │ 9.0   │ 9.0   │ 9.0        │ - AI LLMs Senior Researcher  │
+        │                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
+        │                    │       │       │       │            │                              │
+        │ Crew               │ 9.0   │ 9.5   │ 9.0   │ 9.2        │                              │
+        │ Execution Time (s) │ 42    │ 79    │ 52    │ 57         │                              │
+        └────────────────────┴───────┴───────┴───────┴────────────┴──────────────────────────────┘
         """
         task_averages = [
             sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
         ]
         crew_average = sum(task_averages) / len(task_averages)

-        # Create a table
-        table = Table(title="Tasks Scores \n (1-10 Higher is better)")
+        table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)

-        # Add columns for the table
-        table.add_column("Tasks/Crew")
+        table.add_column("Tasks/Crew/Agents", style="cyan")
         for run in range(1, len(self.tasks_scores) + 1):
-            table.add_column(f"Run {run}")
-        table.add_column("Avg. Total")
+            table.add_column(f"Run {run}", justify="center")
+        table.add_column("Avg. Total", justify="center")
+        table.add_column("Agents", style="green")

-        # Add rows for each task
-        for task_index in range(len(task_averages)):
+        for task_index, task in enumerate(self.crew.tasks):
             task_scores = [
                 self.tasks_scores[run][task_index]
                 for run in range(1, len(self.tasks_scores) + 1)
             ]
             avg_score = task_averages[task_index]
+            agents = list(task.processed_by_agents)

+            # Add the task row with the first agent
             table.add_row(
-                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
+                f"Task {task_index + 1}",
+                *[f"{score:.1f}" for score in task_scores],
+                f"{avg_score:.1f}",
+                f"- {agents[0]}" if agents else "",
             )

-        # Add a row for the crew average
+            # Add rows for additional agents
+            for agent in agents[1:]:
+                table.add_row("", "", "", "", "", f"- {agent}")
+
+            # Add a blank separator row if it's not the last task
+            if task_index < len(self.crew.tasks) - 1:
+                table.add_row("", "", "", "", "", "")
+
+        # Add Crew and Execution Time rows
         crew_scores = [
             sum(self.tasks_scores[run]) / len(self.tasks_scores[run])
             for run in range(1, len(self.tasks_scores) + 1)
         ]
-        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
+        table.add_row(
+            "Crew",
+            *[f"{score:.2f}" for score in crew_scores],
+            f"{crew_average:.1f}",
+            "",
+        )

         run_exec_times = [
             int(sum(tasks_exec_times))
@@ -128,11 +150,9 @@ class CrewEvaluator:
         ]
         execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
         table.add_row(
-            "Execution Time (s)",
-            *map(str, run_exec_times),
-            f"{execution_time_avg}",
+            "Execution Time (s)", *map(str, run_exec_times), f"{execution_time_avg}", ""
         )
-        # Display the table in the terminal
+
         console = Console()
         console.print(table)
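
Note for reviewers: a trimmed-down preview of the new evaluator table built with Rich directly; the scores and agent roles below are made up, and only the layout (HEAVY_EDGE box, centered run columns, green Agents column, extra rows for additional agents) mirrors the code above:

    from rich.box import HEAVY_EDGE
    from rich.console import Console
    from rich.table import Table

    # Made-up scores/agents purely to preview the layout produced by the new code.
    tasks_scores = {1: [9.0, 9.0], 2: [10.0, 9.0], 3: [9.0, 9.0]}
    agents_per_task = [
        ["AI LLMs Senior Researcher", "AI LLMs Reporting Analyst"],
        ["AI LLMs Reporting Analyst"],
    ]

    table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
    table.add_column("Tasks/Crew/Agents", style="cyan")
    for run in range(1, len(tasks_scores) + 1):
        table.add_column(f"Run {run}", justify="center")
    table.add_column("Avg. Total", justify="center")
    table.add_column("Agents", style="green")

    for task_index, agents in enumerate(agents_per_task):
        scores = [tasks_scores[run][task_index] for run in tasks_scores]
        avg = sum(scores) / len(scores)
        table.add_row(
            f"Task {task_index + 1}",
            *[f"{s:.1f}" for s in scores],
            f"{avg:.1f}",
            f"- {agents[0]}" if agents else "",
        )
        # Additional agents get their own rows in the last column.
        for agent in agents[1:]:
            table.add_row("", "", "", "", "", f"- {agent}")

    Console().print(table)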

View File

@@ -1,7 +1,6 @@
 from unittest import mock

 import pytest
-
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.task import Task
@@ -80,6 +79,7 @@ class TestCrewEvaluator:
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
     @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
     def test_print_crew_evaluation_result(self, table, console, crew_planner):
+        # Set up task scores and execution times
         crew_planner.tasks_scores = {
             1: [10, 9, 8],
             2: [9, 8, 7],
@@ -89,22 +89,45 @@ class TestCrewEvaluator:
             2: [55, 33, 67],
         }

+        # Mock agents and assign them to tasks
+        crew_planner.crew.agents = [
+            mock.Mock(role="Agent 1"),
+            mock.Mock(role="Agent 2"),
+        ]
+        crew_planner.crew.tasks = [
+            mock.Mock(
+                agent=crew_planner.crew.agents[0], processed_by_agents=["Agent 1"]
+            ),
+            mock.Mock(
+                agent=crew_planner.crew.agents[1], processed_by_agents=["Agent 2"]
+            ),
+        ]
+
+        # Run the method
         crew_planner.print_crew_evaluation_result()

+        # Verify that the table is created with the appropriate structure and rows
         table.assert_has_calls(
             [
-                mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
-                mock.call().add_column("Tasks/Crew"),
-                mock.call().add_column("Run 1"),
-                mock.call().add_column("Run 2"),
-                mock.call().add_column("Avg. Total"),
-                mock.call().add_row("Task 1", "10", "9", "9.5"),
-                mock.call().add_row("Task 2", "9", "8", "8.5"),
-                mock.call().add_row("Task 3", "8", "7", "7.5"),
-                mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
-                mock.call().add_row("Execution Time (s)", "135", "155", "145"),
+                mock.call(
+                    title="Tasks Scores \n (1-10 Higher is better)", box=mock.ANY
+                ),  # Title and styling
+                mock.call().add_column("Tasks/Crew/Agents", style="cyan"),  # Columns
+                mock.call().add_column("Run 1", justify="center"),
+                mock.call().add_column("Run 2", justify="center"),
+                mock.call().add_column("Avg. Total", justify="center"),
+                mock.call().add_column("Agents", style="green"),
+                # Verify rows for tasks with agents
+                mock.call().add_row("Task 1", "10.0", "9.0", "9.5", "- Agent 1"),
+                mock.call().add_row("", "", "", "", "", ""),  # Blank row between tasks
+                mock.call().add_row("Task 2", "9.0", "8.0", "8.5", "- Agent 2"),
+                # Add crew averages and execution times
+                mock.call().add_row("Crew", "9.00", "8.00", "8.5", ""),
+                mock.call().add_row("Execution Time (s)", "135", "155", "145", ""),
             ]
         )
+        # Ensure the console prints the table
         console.assert_has_calls([mock.call(), mock.call().print(table())])

     def test_evaluate(self, crew_planner):