Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-11 00:58:30 +00:00
feat: add crew Testing/evaluating feature
@@ -37,6 +37,7 @@ from crewai.utilities.constants import (
     TRAINED_AGENTS_DATA_FILE,
     TRAINING_DATA_FILE,
 )
+from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
 from crewai.utilities.evaluators.task_evaluator import TaskEvaluator
 from crewai.utilities.formatter import (
     aggregate_raw_outputs_from_task_outputs,
@@ -970,7 +971,13 @@ class Crew(BaseModel):
         self, n_iterations: int, model: str, inputs: Optional[Dict[str, Any]] = None
     ) -> None:
         """Test the crew with the given inputs."""
-        pass
+        evaluator = CrewEvaluator(self, model)
+
+        for i in range(1, n_iterations + 1):
+            evaluator.set_iteration(i)
+            self.kickoff(inputs=inputs)
+
+        evaluator.print_crew_evaluation_result()
 
     def __repr__(self):
         return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"
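A minimal usage sketch of the new Crew.test entry point follows. The researcher agent and summary task are illustrative placeholders rather than part of this commit, and "gpt-4o" is only an example value for the evaluator model that gets passed through to ChatOpenAI.

# Usage sketch (illustrative placeholders; not part of this commit).
from crewai import Agent, Crew, Task

researcher = Agent(
    role="Researcher",
    goal="Summarize recent AI developments",
    backstory="An analyst who tracks AI news.",
)
summary_task = Task(
    description="Summarize the latest AI developments in three bullet points.",
    expected_output="Three concise bullet points.",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[summary_task])

# Runs the crew twice; each task output is scored from 0 to 10 by the evaluator
# agent, then a per-task, per-run score table is printed to the terminal.
crew.test(n_iterations=2, model="gpt-4o")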
src/crewai/utilities/evaluators/crew_evaluator_handler.py (new file, 118 lines)
@@ -0,0 +1,118 @@
from collections import defaultdict

from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table

from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput


class TaskEvaluationPydanticOutput(BaseModel):
    quality: float = Field(
        description="A score from 0 to 10 evaluating completion, quality, and overall performance of the actual Task Output against the task_description and task_expected_output."
    )

class CrewEvaluator:
    def __init__(self, crew, model: str):
        self.crew = crew
        self.model = model
        # Per-instance state: quality scores collected for each iteration.
        self.tasks_scores: defaultdict = defaultdict(list)
        self.iteration: int = 0
        self._setup_for_evaluating()

    def _setup_for_evaluating(self) -> None:
        """Sets up the crew for evaluating by hooking the evaluator into each task."""
        for task in self.crew.tasks:
            task.callback = self.evaluate

    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration
    def _evaluator_agent(self):
        return Agent(
            role="Task Execution Evaluator",
            goal=(
                "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed."
            ),
            backstory="Evaluator agent for crew evaluation",
            verbose=False,
            llm=ChatOpenAI(model=self.model),
        )

    def _evaluation_task(
        self, evaluator_agent: Agent, task_to_evaluate: Task, task_output: str
    ) -> Task:
        return Task(
            description=(
                "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed.\n"
                f"task_description: {task_to_evaluate.description}\n"
                f"task_expected_output: {task_to_evaluate.expected_output}\n"
                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None}\n"
                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None}\n"
                f"Task Output: {task_output}"
            ),
            expected_output="Evaluation score based on the performance of the agents on the tasks",
            agent=evaluator_agent,
            output_pydantic=TaskEvaluationPydanticOutput,
        )
    def print_crew_evaluation_result(self) -> None:
        results = self.tasks_scores

        task_averages = [sum(scores) / len(scores) for scores in zip(*results.values())]
        crew_average = sum(task_averages) / len(task_averages)

        # Create a table
        table = Table(title="Task Scores")

        # Add columns for the table
        table.add_column("Task")
        for run in range(1, len(results) + 1):
            table.add_column(f"Run {run}")
        table.add_column("Avg. Total")

        # Add rows for each task
        for task_index in range(len(task_averages)):
            task_scores = [
                results[run][task_index] for run in range(1, len(results) + 1)
            ]
            avg_score = task_averages[task_index]
            table.add_row(
                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
            )

        # Add a row for the crew average
        crew_scores = [
            sum(results[run]) / len(results[run]) for run in range(1, len(results) + 1)
        ]
        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")

        # Display the table in the terminal
        console = Console()
        console.print(table)
    def evaluate(self, task_output: TaskOutput):
        current_task = None
        for task in self.crew.tasks:
            if task.description == task_output.description:
                current_task = task
                break

        if not current_task or not task_output:
            raise ValueError(
                "Task to evaluate and task output are required for evaluation"
            )

        evaluator_agent = self._evaluator_agent()
        evaluation_task = self._evaluation_task(
            evaluator_agent, current_task, task_output.raw
        )

        evaluation_result = evaluation_task.execute_sync()
        self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
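For reference, here is a small sketch of how the scores collected in tasks_scores turn into the printed averages. The numbers are made-up example values; in practice CrewEvaluator is driven by Crew.test, which fills tasks_scores through the task callbacks.

# Aggregation sketch with made-up scores: two iterations, two tasks.
from collections import defaultdict

tasks_scores = defaultdict(list)
tasks_scores[1] = [7.0, 9.0]   # iteration 1: scores for task 1 and task 2
tasks_scores[2] = [8.0, 10.0]  # iteration 2: scores for task 1 and task 2

# Average each task across iterations (columns), then average the per-task means.
task_averages = [sum(scores) / len(scores) for scores in zip(*tasks_scores.values())]
crew_average = sum(task_averages) / len(task_averages)

print(task_averages)  # [7.5, 9.5]
print(crew_average)   # 8.5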