feat: add tests

2026-05-07 18:19:00 +00:00 · 2024-07-24 13:14:20 -03:00
parent cf600c1a43
commit c05ef3c8cf
2 changed files with 131 additions and 5 deletions
--- a/src/crewai/utilities/evaluators/crew_evaluator_handler.py
+++ b/src/crewai/utilities/evaluators/crew_evaluator_handler.py
@@ -60,10 +60,10 @@ class CrewEvaluator:
        return Task(
            description=(
                "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
-                f"task_description: {task_to_evaluate.description}"
+                f"task_description: {task_to_evaluate.description} "
-                f"task_expected_output: {task_to_evaluate.expected_output}"
+                f"task_expected_output: {task_to_evaluate.expected_output} "
-                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None}"
+                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None} "
-                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None}"
+                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None} "
                f"Task Output: {task_output}"
            ),
            expected_output="Evaluation Score from 1 to 10 based on the performance of the agents on the tasks",
@@ -72,7 +72,21 @@ class CrewEvaluator:
        )
    def print_crew_evaluation_result(self) -> None:
-        """Prints the evaluation result of the crew in a table."""
+        """
        Prints the evaluation result of the crew in a table.
        A Crew with 2 tasks using the command crewai test -n 2
        will output the following table:
                        Tasks Scores
                    (1-10 Higher is better)
            ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
            ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
            ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
            │ Task 1     │ 10.0  │ 9.0   │ 9.5        │
            │ Task 2     │ 9.0   │ 9.0   │ 9.0        │
            │ Crew       │ 9.5   │ 9.0   │ 9.2        │
            └────────────┴───────┴───────┴────────────┘
        """
        task_averages = [
            sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
        ]
--- a/tests/utilities/evaluators/test_crew_evaluator_handler.py
+++ b/tests/utilities/evaluators/test_crew_evaluator_handler.py
@@ -0,0 +1,112 @@
 from unittest import mock
 import pytest
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.utilities.evaluators.crew_evaluator_handler import (
    CrewEvaluator,
 )
 class TestCrewEvaluator:
    """Unit tests for ``CrewEvaluator``: setup, prompt building, scoring and reporting."""

    @pytest.fixture
    def crew_evaluator(self):
        """Return a ``CrewEvaluator`` wrapping a crew with one agent and one task."""
        agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
        task = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])
        return CrewEvaluator(crew, model="gpt-4o-mini")

    def test_setup_for_evaluating(self, crew_evaluator):
        """Setup installs ``evaluate`` as the callback of the crew's task."""
        crew_evaluator._setup_for_evaluating()
        assert crew_evaluator.crew.tasks[0].callback == crew_evaluator.evaluate

    def test_set_iteration(self, crew_evaluator):
        """``set_iteration`` stores the given value on ``iteration``."""
        crew_evaluator.set_iteration(1)
        assert crew_evaluator.iteration == 1

    def test_evaluator_agent(self, crew_evaluator):
        """The evaluator agent carries the fixed role, goal, backstory and model."""
        agent = crew_evaluator._evaluator_agent()
        assert agent.role == "Task Execution Evaluator"
        assert (
            agent.goal
            == "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
        )
        assert (
            agent.backstory
            == "Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed"
        )
        assert agent.verbose is False
        assert agent.llm.model_name == "gpt-4o-mini"

    def test_evaluation_task(self, crew_evaluator):
        """The evaluation task is assigned to the evaluator and embeds all task details."""
        evaluator_agent = Agent(
            role="Evaluator Agent",
            goal="Evaluate the performance of the agents in the crew",
            backstory="Master in Evaluation",
        )
        task_to_evaluate = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1"),
        )
        task_output = "Task Output 1"

        task = crew_evaluator._evaluation_task(
            evaluator_agent, task_to_evaluate, task_output
        )

        assert task.agent == evaluator_agent
        # A single full-equality check covers the instruction preamble as well.
        # There is intentionally no space between "performance." and
        # "task_description:": the handler concatenates those fragments directly.
        assert (
            task.description
            == "Based on the task description and the expected output, compare and evaluate "
            "the performance of the agents in the crew based on the Task Output they have "
            "performed using score from 1 to 10 evaluating on completion, quality, and overall "
            "performance.task_description: Task 1 task_expected_output: Output 1 "
            "agent: Agent 1 agent_goal: Goal 1 Task Output: Task Output 1"
        )

    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
    def test_print_crew_evaluation_result(self, table, console, crew_evaluator):
        """Scores are rendered as one row per task, one column per run, plus averages."""
        # Keys are run numbers; each list holds that run's score for tasks 1..3,
        # which is how the expected rows below are laid out.
        crew_evaluator.tasks_scores = {
            1: [10, 9, 8],
            2: [9, 8, 7],
        }

        crew_evaluator.print_crew_evaluation_result()

        table.assert_has_calls(
            [
                mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
                mock.call().add_column("Tasks/Crew"),
                mock.call().add_column("Run 1"),
                mock.call().add_column("Run 2"),
                mock.call().add_column("Avg. Total"),
                mock.call().add_row("Task 1", "10", "9", "9.5"),
                mock.call().add_row("Task 2", "9", "8", "8.5"),
                mock.call().add_row("Task 3", "8", "7", "7.5"),
                mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
            ]
        )
        # Use return_value rather than calling the mock, so the assertion does
        # not itself add a call to the patched Table.
        console.assert_has_calls(
            [mock.call(), mock.call().print(table.return_value)]
        )

    def test_evaluate(self, crew_evaluator):
        """``evaluate`` records the quality score returned by the evaluation task."""
        task_output = TaskOutput(
            description="Task 1", agent=str(crew_evaluator.crew.agents[0])
        )
        with mock.patch.object(Task, "execute_sync") as execute:
            # Configure the stubbed result without invoking the mock.
            execute.return_value.pydantic.quality = 9.5
            crew_evaluator.evaluate(task_output)
            # NOTE(review): key 0 is presumably the evaluator's initial
            # iteration - confirm against CrewEvaluator's defaults.
            assert crew_evaluator.tasks_scores[0] == [9.5]