From ccea8870d7db745e89f01aa477dc85e67a1c3045 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Thu, 10 Jul 2025 20:15:42 -0300
Subject: [PATCH] fix: reset evaluator for each experiment iteration

---
 src/crewai/evaluation/agent_evaluator.py |  6 +++++-
 src/crewai/evaluation/experiment.py      | 12 +++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/crewai/evaluation/agent_evaluator.py b/src/crewai/evaluation/agent_evaluator.py
index 430196d14..5469023a9 100644
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -10,6 +10,7 @@ from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class AgentEvaluator:
     def __init__(
@@ -36,6 +37,9 @@ class AgentEvaluator:
     def set_iteration(self, iteration: int) -> None:
         self.iteration = iteration
 
+    def reset_iterations_results(self) -> None:
+        self.iterations_results = {}
+
     def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
         if not self.crew:
             raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
@@ -99,7 +103,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)
 
-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> dict[str, AgentAggregatedEvaluationResult]:
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
diff --git a/src/crewai/evaluation/experiment.py b/src/crewai/evaluation/experiment.py
index c9e0549f1..0d97378c3 100644
--- a/src/crewai/evaluation/experiment.py
+++ b/src/crewai/evaluation/experiment.py
@@ -18,6 +18,7 @@ from rich.panel import Panel
 from crewai import Crew
 from crewai.evaluation import AgentEvaluator, create_default_evaluator
 from pydantic import BaseModel
+from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class TestCaseResult(BaseModel):
     identifier: str
@@ -206,7 +207,7 @@ class ExperimentResults:
 
 
 class ExperimentRunner:
-    def __init__(self, dataset: Optional[List[Dict[str, Any]]] = None, evaluator: Optional[AgentEvaluator] = None):
+    def __init__(self, dataset: Optional[List[Dict[str, Any]]] = None, evaluator: AgentEvaluator | None = None):
        self.dataset = dataset or []
        self.evaluator = evaluator
        self.console = Console()
@@ -215,15 +216,16 @@ class ExperimentRunner:
         if not self.dataset:
             raise ValueError("No dataset provided. "
                              "Use load_dataset() or provide dataset in constructor.")
 
-        if not self.evaluator and not crew:
-            raise ValueError("Either evaluator or crew must be provided.")
+        if not crew:
+            raise ValueError("crew must be provided.")
 
-        if not self.evaluator and crew:
+        if not self.evaluator:
             self.evaluator = create_default_evaluator(crew=crew)
 
         results = []
         for test_case in self.dataset:
+            self.evaluator.reset_iterations_results()
             result = self._run_test_case(test_case, crew)
             results.append(result)
 
@@ -263,7 +265,7 @@ class ExperimentRunner:
                 passed=False
             )
 
-    def _extract_scores(self, agent_evaluations: Dict[str, Any]) -> Union[int, Dict[str, int]]:
+    def _extract_scores(self, agent_evaluations: Dict[str, AgentAggregatedEvaluationResult]) -> Union[int, Dict[str, int]]:
         all_scores = defaultdict(list)
         for evaluation in agent_evaluations.values():
             for metric_name, score in evaluation.metrics.items():