Brandon/cre 252 add agent to crewai test (#1308)

* Update config typecheck to accept agents

* Clean up prints

* Adding agents to crew evaluator output table

* Properly generating table now

* Update tests
Brandon Hancock (bhancock_ai) authored 2024-09-07 02:53:23 -04:00, committed by GitHub
parent 01124daf55
commit a234eb8baa
7 changed files with 115 additions and 62 deletions


@@ -584,7 +584,10 @@ class Crew(BaseModel):
self.manager_agent.allow_delegation = True
manager = self.manager_agent
if manager.tools is not None and len(manager.tools) > 0:
raise Exception("Manager agent should not have tools")
self._logger.log(
"warning", "Manager agent should not have tools", color="orange"
)
manager.tools = []
manager.tools = self.manager_agent.get_delegation_tools(self.agents)
else:
manager = Agent(
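
For context, a minimal standalone sketch of the behavior this hunk changes: when a custom manager agent is passed in with tools, the crew now logs a warning and clears them before attaching delegation tools, instead of raising. The classes and logger below are simplified stand-ins, not the actual crewAI implementations.

# Simplified stand-ins for Agent and the logger; only the decision logic mirrors the diff.
from dataclasses import dataclass, field
from typing import List


@dataclass
class StubAgent:
    role: str
    tools: List[str] = field(default_factory=list)

    def get_delegation_tools(self, agents: List["StubAgent"]) -> List[str]:
        # Stand-in: real crewAI builds AgentTools from the crew's agents.
        return [f"Delegate work to {a.role}" for a in agents]


def prepare_manager(manager: StubAgent, agents: List[StubAgent]) -> StubAgent:
    if manager.tools is not None and len(manager.tools) > 0:
        # Previously: raise Exception("Manager agent should not have tools")
        print("warning: Manager agent should not have tools")
        manager.tools = []
    manager.tools = manager.get_delegation_tools(agents)
    return manager


manager = prepare_manager(StubAgent("Manager", tools=["web_search"]),
                          [StubAgent("Researcher"), StubAgent("Writer")])
print(manager.tools)  # ['Delegate work to Researcher', 'Delegate work to Writer']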


@@ -103,7 +103,8 @@ def crew(func):
for task_name in sorted_task_names:
task_instance = tasks[task_name]()
instantiated_tasks.append(task_instance)
if hasattr(task_instance, "agent"):
agent_instance = getattr(task_instance, "agent", None)
if agent_instance is not None:
agent_instance = task_instance.agent
if agent_instance.role not in agent_roles:
instantiated_agents.append(agent_instance)
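
The hasattr-to-getattr change matters because a task whose agent attribute exists but is None would pass the old check and then fail on .role. A tiny self-contained illustration (FakeTask is hypothetical):

class FakeTask:
    agent = None  # attribute exists, but no agent assigned


task = FakeTask()
print(hasattr(task, "agent"))                     # True  -> old check falls through to None.role
print(getattr(task, "agent", None) is not None)   # False -> new check skips the task safely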


@@ -6,7 +6,7 @@ import uuid
from concurrent.futures import Future
from copy import copy
from hashlib import md5
from typing import Any, Dict, List, Optional, Tuple, Type, Union
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
from opentelemetry.trace import Span
from pydantic import (
@@ -108,6 +108,7 @@ class Task(BaseModel):
description="A converter class used to export structured output",
default=None,
)
processed_by_agents: Set[str] = Field(default_factory=set)
_telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
_execution_span: Optional[Span] = PrivateAttr(default=None)
@@ -241,6 +242,8 @@ class Task(BaseModel):
self.prompt_context = context
tools = tools or self.tools or []
self.processed_by_agents.add(agent.role)
result = agent.execute_task(
task=self,
context=context,
@@ -273,9 +276,7 @@ class Task(BaseModel):
content = (
json_output
if json_output
else pydantic_output.model_dump_json()
if pydantic_output
else result
else pydantic_output.model_dump_json() if pydantic_output else result
)
self._save_file(content)
@@ -310,8 +311,10 @@ class Task(BaseModel):
"""Increment the tools errors counter."""
self.tools_errors += 1
def increment_delegations(self) -> None:
def increment_delegations(self, agent_name: Optional[str]) -> None:
"""Increment the delegations counter."""
if agent_name:
self.processed_by_agents.add(agent_name)
self.delegations += 1
def copy(self, agents: List["BaseAgent"]) -> "Task":
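
A minimal standalone sketch of the new processed_by_agents bookkeeping; StubTask is a simplified stand-in for crewai.task.Task that keeps only the fields this hunk touches.

from dataclasses import dataclass, field
from typing import Optional, Set


@dataclass
class StubTask:
    processed_by_agents: Set[str] = field(default_factory=set)
    delegations: int = 0

    def record_execution(self, agent_role: str) -> None:
        # Mirrors execute_core adding the executing agent's role to the set.
        self.processed_by_agents.add(agent_role)

    def increment_delegations(self, agent_name: Optional[str]) -> None:
        # Mirrors the new signature: delegated-to coworkers are recorded too.
        if agent_name:
            self.processed_by_agents.add(agent_name)
        self.delegations += 1


task = StubTask()
task.record_execution("AI LLMs Senior Researcher")
task.increment_delegations("AI LLMs Reporting Analyst")
print(task.processed_by_agents)  # both roles now appear in the evaluator's Agents column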


@@ -8,6 +8,7 @@ from langchain_core.tools import BaseTool
from langchain_openai import ChatOpenAI
from crewai.agents.tools_handler import ToolsHandler
from crewai.task import Task
from crewai.telemetry import Telemetry
from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
from crewai.utilities import I18N, Converter, ConverterError, Printer
@@ -51,7 +52,7 @@ class ToolUsage:
original_tools: List[Any],
tools_description: str,
tools_names: str,
task: Any,
task: Task,
function_calling_llm: Any,
agent: Any,
action: Any,
@@ -154,7 +155,10 @@ class ToolUsage:
"Delegate work to coworker",
"Ask question to coworker",
]:
self.task.increment_delegations()
coworker = (
calling.arguments.get("coworker") if calling.arguments else None
)
self.task.increment_delegations(coworker)
if calling.arguments:
try:
@@ -241,7 +245,7 @@ class ToolUsage:
result = self._remember_format(result=result) # type: ignore # "_remember_format" of "ToolUsage" does not return a value (it only ever returns None)
return result
def _should_remember_format(self) -> None:
def _should_remember_format(self) -> bool:
return self.task.used_tools % self._remember_format_after_usages == 0
def _remember_format(self, result: str) -> None:
@@ -353,10 +357,10 @@ class ToolUsage:
return ToolUsageErrorException( # type: ignore # Incompatible return value type (got "ToolUsageErrorException", expected "ToolCalling | InstructorToolCalling")
f'{self._i18n.errors("tool_arguments_error")}'
)
calling = ToolCalling( # type: ignore # Unexpected keyword argument "log" for "ToolCalling"
calling = ToolCalling(
tool_name=tool.name,
arguments=arguments,
log=tool_string,
log=tool_string, # type: ignore
)
except Exception as e:
self._run_attempts += 1
@@ -404,19 +408,19 @@ class ToolUsage:
'"' + value.replace('"', '\\"') + '"'
) # Re-encapsulate with double quotes
elif value.isdigit(): # Check if value is a digit, hence integer
formatted_value = value
value = value
elif value.lower() in [
"true",
"false",
"null",
]: # Check for boolean and null values
formatted_value = value.lower()
value = value.lower()
else:
# Assume the value is a string and needs quotes
formatted_value = '"' + value.replace('"', '\\"') + '"'
value = '"' + value.replace('"', '\\"') + '"'
# Rebuild the entry with proper quoting
formatted_entry = f'"{key}": {formatted_value}'
formatted_entry = f'"{key}": {value}'
formatted_entries.append(formatted_entry)
# Reconstruct the JSON string
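
A standalone sketch of the value re-quoting branches shown above (this commit only renames formatted_value to value; the quoting rules are unchanged). quote_value is a hypothetical helper, not a ToolUsage method, and it covers only the branches visible in the hunk.

def quote_value(value: str) -> str:
    if value.isdigit():                              # integers pass through unquoted
        return value
    if value.lower() in ["true", "false", "null"]:   # JSON literals, lowercased
        return value.lower()
    return '"' + value.replace('"', '\\"') + '"'     # everything else becomes a quoted string


for raw in ["42", "TRUE", 'say "hi"']:
    print(f'"key": {quote_value(raw)}')
# "key": 42
# "key": true
# "key": "say \"hi\""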


@@ -23,17 +23,16 @@ def process_config(
# Copy values from config (originally from YAML) to the model's attributes.
# Only copy if the attribute isn't already set, preserving any explicitly defined values.
for key, value in config.items():
if key not in model_class.model_fields:
if key not in model_class.model_fields or values.get(key) is not None:
continue
if values.get(key) is not None:
continue
if isinstance(value, (str, int, float, bool, list)):
values[key] = value
elif isinstance(value, dict):
if isinstance(value, dict):
if isinstance(values.get(key), dict):
values[key].update(value)
else:
values[key] = value
else:
values[key] = value
# Remove the config from values to avoid duplicate processing
values.pop("config", None)
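
A standalone sketch of the tightened process_config merge rules: skip keys that are not model fields or are already set explicitly, merge nested dicts, and copy everything else. The field names are hypothetical stand-ins for model_class.model_fields.

from typing import Any, Dict

MODEL_FIELDS = {"role", "goal", "metadata"}  # hypothetical stand-in for model_class.model_fields


def merge_config(values: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any]:
    for key, value in config.items():
        if key not in MODEL_FIELDS or values.get(key) is not None:
            continue  # unknown key, or an explicitly set value wins over the YAML config
        if isinstance(value, dict):
            if isinstance(values.get(key), dict):
                values[key].update(value)
            else:
                values[key] = value
        else:
            values[key] = value
    values.pop("config", None)  # avoid processing the config a second time
    return values


print(merge_config({"role": "Writer", "goal": None}, {"goal": "Draft posts", "unknown": 1}))
# {'role': 'Writer', 'goal': 'Draft posts'}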


@@ -1,14 +1,14 @@
from collections import defaultdict
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table
from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry import Telemetry
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table
class TaskEvaluationPydanticOutput(BaseModel):
@@ -77,50 +77,72 @@ class CrewEvaluator:
def print_crew_evaluation_result(self) -> None:
"""
Prints the evaluation result of the crew in a table.
A Crew with 2 tasks using the command crewai test -n 2
A Crew with 2 tasks using the command crewai test -n 3
will output the following table:
Task Scores
Tasks Scores
(1-10 Higher is better)
┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
│ Task 1     │ 10.0  │ 9.0   │ 9.5        │
│ Task 2     │ 9.0   │ 9.0   │ 9.0        │
│ Crew       │ 9.5   │ 9.0   │ 9.2        │
└────────────┴───────┴───────┴────────────┘
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Tasks/Crew/Agents  ┃ Run 1 ┃ Run 2 ┃ Run 3 ┃ Avg. Total ┃ Agents                       ┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ Task 1             │ 9.0   │ 10.0  │ 9.0   │ 9.3        │ - AI LLMs Senior Researcher  │
│                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
│                    │       │       │       │            │                              │
│ Task 2             │ 9.0   │ 9.0   │ 9.0   │ 9.0        │ - AI LLMs Senior Researcher  │
│                    │       │       │       │            │ - AI LLMs Reporting Analyst  │
│                    │       │       │       │            │                              │
│ Crew               │ 9.0   │ 9.5   │ 9.0   │ 9.2        │                              │
│ Execution Time (s) │ 42    │ 79    │ 52    │ 57         │                              │
└────────────────────┴───────┴───────┴───────┴────────────┴──────────────────────────────┘
"""
task_averages = [
sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
]
crew_average = sum(task_averages) / len(task_averages)
# Create a table
table = Table(title="Tasks Scores \n (1-10 Higher is better)")
table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
# Add columns for the table
table.add_column("Tasks/Crew")
table.add_column("Tasks/Crew/Agents", style="cyan")
for run in range(1, len(self.tasks_scores) + 1):
table.add_column(f"Run {run}")
table.add_column("Avg. Total")
table.add_column(f"Run {run}", justify="center")
table.add_column("Avg. Total", justify="center")
table.add_column("Agents", style="green")
# Add rows for each task
for task_index in range(len(task_averages)):
for task_index, task in enumerate(self.crew.tasks):
task_scores = [
self.tasks_scores[run][task_index]
for run in range(1, len(self.tasks_scores) + 1)
]
avg_score = task_averages[task_index]
agents = list(task.processed_by_agents)
# Add the task row with the first agent
table.add_row(
f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
f"Task {task_index + 1}",
*[f"{score:.1f}" for score in task_scores],
f"{avg_score:.1f}",
f"- {agents[0]}" if agents else "",
)
# Add a row for the crew average
# Add rows for additional agents
for agent in agents[1:]:
table.add_row("", "", "", "", "", f"- {agent}")
# Add a blank separator row if it's not the last task
if task_index < len(self.crew.tasks) - 1:
table.add_row("", "", "", "", "", "")
# Add Crew and Execution Time rows
crew_scores = [
sum(self.tasks_scores[run]) / len(self.tasks_scores[run])
for run in range(1, len(self.tasks_scores) + 1)
]
table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
table.add_row(
"Crew",
*[f"{score:.2f}" for score in crew_scores],
f"{crew_average:.1f}",
"",
)
run_exec_times = [
int(sum(tasks_exec_times))
@@ -128,11 +150,9 @@ class CrewEvaluator:
]
execution_time_avg = int(sum(run_exec_times) / len(run_exec_times))
table.add_row(
"Execution Time (s)",
*map(str, run_exec_times),
f"{execution_time_avg}",
"Execution Time (s)", *map(str, run_exec_times), f"{execution_time_avg}", ""
)
# Display the table in the terminal
console = Console()
console.print(table)
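
A standalone sketch of the new evaluator table layout, using made-up scores and agent roles; rich is already a crewAI dependency, so this runs on its own.

from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table

table = Table(title="Tasks Scores \n (1-10 Higher is better)", box=HEAVY_EDGE)
table.add_column("Tasks/Crew/Agents", style="cyan")
for run in (1, 2, 3):
    table.add_column(f"Run {run}", justify="center")
table.add_column("Avg. Total", justify="center")
table.add_column("Agents", style="green")

# Sample data only; real values come from self.tasks_scores and task.processed_by_agents.
table.add_row("Task 1", "9.0", "10.0", "9.0", "9.3", "- AI LLMs Senior Researcher")
table.add_row("", "", "", "", "", "- AI LLMs Reporting Analyst")
table.add_row("", "", "", "", "", "")
table.add_row("Crew", "9.00", "9.50", "9.00", "9.2", "")
table.add_row("Execution Time (s)", "42", "79", "52", "57", "")

Console().print(table)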


@@ -1,7 +1,6 @@
from unittest import mock
import pytest
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.task import Task
@@ -80,6 +79,7 @@ class TestCrewEvaluator:
@mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
@mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
def test_print_crew_evaluation_result(self, table, console, crew_planner):
# Set up task scores and execution times
crew_planner.tasks_scores = {
1: [10, 9, 8],
2: [9, 8, 7],
@@ -89,22 +89,45 @@ class TestCrewEvaluator:
2: [55, 33, 67],
}
# Mock agents and assign them to tasks
crew_planner.crew.agents = [
mock.Mock(role="Agent 1"),
mock.Mock(role="Agent 2"),
]
crew_planner.crew.tasks = [
mock.Mock(
agent=crew_planner.crew.agents[0], processed_by_agents=["Agent 1"]
),
mock.Mock(
agent=crew_planner.crew.agents[1], processed_by_agents=["Agent 2"]
),
]
# Run the method
crew_planner.print_crew_evaluation_result()
# Verify that the table is created with the appropriate structure and rows
table.assert_has_calls(
[
mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
mock.call().add_column("Tasks/Crew"),
mock.call().add_column("Run 1"),
mock.call().add_column("Run 2"),
mock.call().add_column("Avg. Total"),
mock.call().add_row("Task 1", "10", "9", "9.5"),
mock.call().add_row("Task 2", "9", "8", "8.5"),
mock.call().add_row("Task 3", "8", "7", "7.5"),
mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
mock.call().add_row("Execution Time (s)", "135", "155", "145"),
mock.call(
title="Tasks Scores \n (1-10 Higher is better)", box=mock.ANY
), # Title and styling
mock.call().add_column("Tasks/Crew/Agents", style="cyan"), # Columns
mock.call().add_column("Run 1", justify="center"),
mock.call().add_column("Run 2", justify="center"),
mock.call().add_column("Avg. Total", justify="center"),
mock.call().add_column("Agents", style="green"),
# Verify rows for tasks with agents
mock.call().add_row("Task 1", "10.0", "9.0", "9.5", "- Agent 1"),
mock.call().add_row("", "", "", "", "", ""), # Blank row between tasks
mock.call().add_row("Task 2", "9.0", "8.0", "8.5", "- Agent 2"),
# Add crew averages and execution times
mock.call().add_row("Crew", "9.00", "8.00", "8.5", ""),
mock.call().add_row("Execution Time (s)", "135", "155", "145", ""),
]
)
# Ensure the console prints the table
console.assert_has_calls([mock.call(), mock.call().print(table())])
def test_evaluate(self, crew_planner):