Introducing Agent evaluation (#3130)

* feat: add exchanged messages in LLMCallCompletedEvent

* feat: add GoalAlignment metric for Agent evaluation

* feat: add SemanticQuality metric for Agent evaluation

* feat: add Tool Metrics for Agent evaluation

* feat: add Reasoning Metrics for Agent evaluation, still in progress

* feat: add AgentEvaluator class

This class will evaluate the Agents' results and report them to the user (see the usage sketch after these commit notes).

* fix: do not evaluate Agent by default

This is an experimental feature; we still need to refine it further (see the opt-in example after these commit notes).

* test: add Agent eval tests

* fix: render all feedback per iteration

* style: resolve linter issues

* style: fix mypy issues

* fix: allow messages to be empty on LLMCallCompletedEvent
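
A minimal usage sketch for the new AgentEvaluator, based only on the calls that appear in the diff below (create_default_evaluator, set_iteration, evaluate_current_iteration, get_agent_evaluation); my_crew, the iteration count, and the inputs are hypothetical placeholders:

    from crewai.evaluation import create_default_evaluator

    agent_evaluator = create_default_evaluator(crew=my_crew)  # my_crew: an existing Crew instance (placeholder)
    for i in range(1, 3 + 1):  # hypothetical: 3 evaluation iterations
        agent_evaluator.set_iteration(i)  # tag the upcoming run with its iteration number
        my_crew.kickoff(inputs={"topic": "AI"})  # hypothetical inputs
        agent_evaluator.evaluate_current_iteration()  # score the agents' work for this run
    agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)  # report the evaluation with feedback, as in the diff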
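
Opting in to agent evaluation: assuming the method modified in the first hunk below is Crew.test (the hunk header only shows the class context), and with hypothetical agents, tasks, model name, and inputs, the new flag would be used roughly like this; it defaults to False, so agent evaluation stays off unless requested:

    crew = Crew(agents=[researcher], tasks=[research_task])  # hypothetical agents/tasks
    crew.test(
        n_iterations=2,
        eval_llm="gpt-4o",  # assumed model name; the signature accepts a str or a BaseLLM instance
        inputs={"topic": "AI"},  # hypothetical inputs
        include_agent_eval=True,  # new flag added in this commit; defaults to False
    )
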
Lucas Gomide authored 2025-07-11 14:18:03 -03:00, committed by GitHub
parent bf8fa3232b, commit 08fa3797ca
26 changed files with 2930 additions and 14 deletions


@@ -1313,6 +1313,7 @@ class Crew(FlowTrackable, BaseModel):
        n_iterations: int,
        eval_llm: Union[str, InstanceOf[BaseLLM]],
        inputs: Optional[Dict[str, Any]] = None,
        include_agent_eval: Optional[bool] = False
    ) -> None:
        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
        try:
@@ -1331,13 +1332,29 @@ class Crew(FlowTrackable, BaseModel):
                ),
            )
            test_crew = self.copy()
            # TODO: Refactor to use a single Evaluator Manager class
            evaluator = CrewEvaluator(test_crew, llm_instance)
            if include_agent_eval:
                from crewai.evaluation import create_default_evaluator
                agent_evaluator = create_default_evaluator(crew=test_crew)
            for i in range(1, n_iterations + 1):
                evaluator.set_iteration(i)
                if include_agent_eval:
                    agent_evaluator.set_iteration(i)
                test_crew.kickoff(inputs=inputs)
                # TODO: Refactor to use ListenerEvents instead of triggering each iteration manually
                if include_agent_eval:
                    agent_evaluator.evaluate_current_iteration()
            evaluator.print_crew_evaluation_result()
            if include_agent_eval:
                agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)
            crewai_event_bus.emit(
                self,