Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-10 00:28:31 +00:00)
feat: add crew testing/evaluating feature
src/crewai/crew.py
@@ -37,6 +37,7 @@ from crewai.utilities.constants import (
     TRAINED_AGENTS_DATA_FILE,
     TRAINING_DATA_FILE,
 )
+from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
 from crewai.utilities.evaluators.task_evaluator import TaskEvaluator
 from crewai.utilities.formatter import (
     aggregate_raw_outputs_from_task_outputs,
@@ -970,7 +971,13 @@ class Crew(BaseModel):
         self, n_iterations: int, model: str, inputs: Optional[Dict[str, Any]] = None
     ) -> None:
         """Test the crew with the given inputs."""
-        pass
+        evaluator = CrewEvaluator(self, model)
+
+        for i in range(1, n_iterations + 1):
+            evaluator.set_iteration(i)
+            self.kickoff(inputs=inputs)
+
+        evaluator.print_crew_evaluation_result()
 
     def __repr__(self):
         return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"
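For context, a minimal usage sketch of the new method. Everything below (the agents, the tasks, the topic input, and the "gpt-4o" model name) is illustrative and not part of this commit; it only assumes the existing crewAI Agent, Task, and Crew constructors.

# Illustrative sketch: exercising the new Crew.test() on a hypothetical two-task crew.
from crewai import Agent, Crew, Task

researcher = Agent(
    role="Researcher",
    goal="Collect key facts on a given topic",
    backstory="An analyst who gathers concise, well-sourced notes.",
)
writer = Agent(
    role="Writer",
    goal="Turn research notes into a short summary",
    backstory="A technical writer focused on clarity.",
)

research_task = Task(
    description="Research the topic: {topic}",
    expected_output="A bullet list of key facts",
    agent=researcher,
)
summary_task = Task(
    description="Summarize the research notes about {topic} in one paragraph",
    expected_output="A single-paragraph summary",
    agent=writer,
)

crew = Crew(agents=[researcher, writer], tasks=[research_task, summary_task])

# Runs the crew three times; CrewEvaluator scores every task output with the
# chosen model and then prints the per-task, per-run score table.
crew.test(n_iterations=3, model="gpt-4o", inputs={"topic": "LLM evaluation"})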
src/crewai/utilities/evaluators/crew_evaluator_handler.py (new file, 118 lines added)
@@ -0,0 +1,118 @@
+from collections import defaultdict
+
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+from rich.console import Console
+from rich.table import Table
+
+from crewai.agent import Agent
+from crewai.task import Task
+from crewai.tasks.task_output import TaskOutput
+
+
+class TaskEvaluationPydanticOutput(BaseModel):
+    quality: float = Field(
+        description="A score from 0 to 10 evaluating on completion, quality, and overall performance from the task_description and task_expected_output to the actual Task Output."
+    )
+
+
+class CrewEvaluator:
+    tasks_scores = defaultdict(list)
+    iteration = 0
+
+    def __init__(self, crew, model: str):
+        self.crew = crew
+        self.model = model
+        self._setup_for_evaluating()
+
+    def _setup_for_evaluating(self) -> None:
+        """Sets up the crew for evaluating."""
+        for task in self.crew.tasks:
+            task.callback = self.evaluate
+
+    def set_iteration(self, iteration: int) -> None:
+        self.iteration = iteration
+
+    def _evaluator_agent(self):
+        return Agent(
+            role="Task Execution Evaluator",
+            goal=(
+                "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed."
+            ),
+            backstory="Evaluator agent for crew evaluation",
+            verbose=False,
+            llm=ChatOpenAI(model=self.model),
+        )
+
+    def _evaluation_task(
+        self, evaluator_agent: Agent, task_to_evaluate: Task, task_output: str
+    ) -> Task:
+        return Task(
+            description=(
+                "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed."
+                f"task_description: {task_to_evaluate.description}"
+                f"task_expected_output: {task_to_evaluate.expected_output}"
+                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None}"
+                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None}"
+                f"Task Output: {task_output}"
+            ),
+            expected_output="Evaluation score based on the performance of the agents on the tasks",
+            agent=evaluator_agent,
+            output_pydantic=TaskEvaluationPydanticOutput,
+        )
+
+    def print_crew_evaluation_result(self) -> None:
+        self.tasks_scores
+        results = self.tasks_scores
+
+        task_averages = [sum(scores) / len(scores) for scores in zip(*results.values())]
+        crew_average = sum(task_averages) / len(task_averages)
+
+        # Create a table
+        table = Table(title="Task Scores")
+
+        # Add columns for the table
+        table.add_column("Task")
+        for run in range(1, len(results) + 1):
+            table.add_column(f"Run {run}")
+        table.add_column("Avg. Total")
+
+        # Add rows for each task
+        for task_index in range(len(task_averages)):
+            task_scores = [
+                results[run][task_index] for run in range(1, len(results) + 1)
+            ]
+            avg_score = task_averages[task_index]
+            table.add_row(
+                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
+            )
+
+        # Add a row for the crew average
+        crew_scores = [
+            sum(results[run]) / len(results[run]) for run in range(1, len(results) + 1)
+        ]
+        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")
+
+        # Display the table in the terminal
+        console = Console()
+        console.print(table)
+
+    def evaluate(self, task_output: TaskOutput):
+        current_task = None
+        for task in self.crew.tasks:
+            if task.description == task_output.description:
+                current_task = task
+                break
+
+        if not current_task or not task_output:
+            raise ValueError(
+                "Task to evaluate and task output are required for evaluation"
+            )
+
+        evaluator_agent = self._evaluator_agent()
+        evaluation_task = self._evaluation_task(
+            evaluator_agent, current_task, task_output.raw
+        )
+
+        evaluation_result = evaluation_task.execute_sync()
+        self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
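As a side note on how the printed table comes together, here is a small worked example of the aggregation in print_crew_evaluation_result for a two-task crew tested over three iterations; the scores are made-up numbers, not output from this commit.

# Hypothetical contents of tasks_scores after three test iterations:
# evaluate() appends one quality score per task, in task order, under each run.
results = {
    1: [8.0, 6.0],  # run 1: task 1 scored 8.0, task 2 scored 6.0
    2: [9.0, 7.0],  # run 2
    3: [7.0, 8.0],  # run 3
}

# zip(*results.values()) regroups the per-run lists by task:
# [(8.0, 9.0, 7.0), (6.0, 7.0, 8.0)]
task_averages = [sum(scores) / len(scores) for scores in zip(*results.values())]
print(task_averages)  # [8.0, 7.0] -> the "Avg. Total" column of the task rows

crew_scores = [
    sum(results[run]) / len(results[run]) for run in range(1, len(results) + 1)
]
print(crew_scores)    # [7.0, 8.0, 7.5] -> the per-run cells of the "Crew" row

crew_average = sum(task_averages) / len(task_averages)
print(crew_average)   # 7.5 -> the "Avg. Total" cell of the "Crew" row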