Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-25 16:18:13 +00:00
fix: reset evaluator for each experiment iteration

This commit touches two files: AgentEvaluator gains a reset_iterations_results() method (plus tighter type annotations), and ExperimentRunner calls it at the start of each test case so results accumulated during one run no longer leak into the next.
@@ -10,6 +10,7 @@ from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult

 class AgentEvaluator:
     def __init__(
@@ -36,6 +37,9 @@ class AgentEvaluator:
     def set_iteration(self, iteration: int) -> None:
         self.iteration = iteration

+    def reset_iterations_results(self):
+        self.iterations_results = {}
+
     def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
         if not self.crew:
             raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
@@ -99,7 +103,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)

-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
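Taken together, the three hunks above give AgentEvaluator a reset_iterations_results() method and pin get_agent_evaluation()'s return type to Dict[str, AgentAggregatedEvaluationResult]. Below is a minimal sketch of why the reset matters, using an illustrative stand-in rather than the real AgentEvaluator; the record() helper and the exact shape of iterations_results are assumptions for demonstration, not library code.

class StandInEvaluator:
    """Illustrative stand-in for AgentEvaluator's iteration bookkeeping."""

    def __init__(self) -> None:
        self.iteration = 0
        self.iterations_results: dict[int, list[str]] = {}

    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration

    def reset_iterations_results(self) -> None:
        # Mirrors the method added in this commit: drop results carried
        # over from a previous test case.
        self.iterations_results = {}

    def record(self, result: str) -> None:  # hypothetical helper
        self.iterations_results.setdefault(self.iteration, []).append(result)

evaluator = StandInEvaluator()
evaluator.set_iteration(1)
evaluator.record("test-case-1 score")

# Without the reset, test case 2 would be summarized together with the
# leftover results from test case 1.
evaluator.reset_iterations_results()
evaluator.set_iteration(1)
evaluator.record("test-case-2 score")
assert evaluator.iterations_results == {1: ["test-case-2 score"]}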
@@ -18,6 +18,7 @@ from rich.panel import Panel
 from crewai import Crew
 from crewai.evaluation import AgentEvaluator, create_default_evaluator
 from pydantic import BaseModel
+from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult

 class TestCaseResult(BaseModel):
     identifier: str
@@ -206,7 +207,7 @@ class ExperimentResults:


 class ExperimentRunner:
-    def __init__(self, dataset: Optional[List[Dict[str, Any]]] = None, evaluator: Optional[AgentEvaluator] = None):
+    def __init__(self, dataset: Optional[List[Dict[str, Any]]] = None, evaluator: AgentEvaluator | None = None):
         self.dataset = dataset or []
         self.evaluator = evaluator
         self.console = Console()
@@ -215,15 +216,16 @@ class ExperimentRunner:
         if not self.dataset:
             raise ValueError("No dataset provided. Use load_dataset() or provide dataset in constructor.")

-        if not self.evaluator and not crew:
-            raise ValueError("Either evaluator or crew must be provided.")
+        if not crew:
+            raise ValueError("crew must be provided.")

-        if not self.evaluator and crew:
+        if not self.evaluator:
             self.evaluator = create_default_evaluator(crew=crew)

         results = []

         for test_case in self.dataset:
+            self.evaluator.reset_iterations_results()
             result = self._run_test_case(test_case, crew)
             results.append(result)

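This hunk is where the fix lands: the run loop now wipes the shared evaluator at the top of every pass over the dataset, so iteration results from one test case can no longer bleed into the next test case's summary. A sketch of the loop's shape follows; SupportsReset and run_test_case are stand-ins for the runner's real collaborators, not crewAI API.

from typing import Any, Callable, Protocol

class SupportsReset(Protocol):
    def reset_iterations_results(self) -> None: ...

def run_experiment(
    dataset: list[dict[str, Any]],
    evaluator: SupportsReset,
    run_test_case: Callable[[dict[str, Any]], Any],  # hypothetical executor
) -> list[Any]:
    results = []
    for test_case in dataset:
        # The behavioral change from this commit: clear accumulated
        # iteration state before each test case runs.
        evaluator.reset_iterations_results()
        results.append(run_test_case(test_case))
    return results

Resetting per test case, rather than constructing a fresh evaluator, discards only the per-run state while keeping whatever configuration the evaluator carries, which appears to be the intent of the one-line reset.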
@@ -263,7 +265,7 @@ class ExperimentRunner:
                 passed=False
             )

-    def _extract_scores(self, agent_evaluations: Dict[str, Any]) -> Union[int, Dict[str, int]]:
+    def _extract_scores(self, agent_evaluations: Dict[str, AgentAggregatedEvaluationResult]) -> Union[int, Dict[str, int]]:
         all_scores = defaultdict(list)
         for evaluation in agent_evaluations.values():
             for metric_name, score in evaluation.metrics.items():
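The final hunk only narrows a type hint, but it documents what _extract_scores() consumes: a mapping from agent name to AgentAggregatedEvaluationResult, each exposing a metrics dict keyed by metric name. A hedged sketch of that collect-then-reduce pattern follows; the AggregatedResult dataclass and the simple average are illustrative assumptions, not the library's implementation.

from collections import defaultdict
from dataclasses import dataclass, field

@dataclass
class AggregatedResult:
    """Illustrative stand-in for AgentAggregatedEvaluationResult."""
    metrics: dict[str, float] = field(default_factory=dict)

def extract_scores(agent_evaluations: dict[str, AggregatedResult]) -> dict[str, float]:
    # Collect every agent's score per metric, as the diff's inner loop does...
    all_scores: defaultdict[str, list[float]] = defaultdict(list)
    for evaluation in agent_evaluations.values():
        for metric_name, score in evaluation.metrics.items():
            all_scores[metric_name].append(score)
    # ...then reduce across agents; a simple average is assumed here.
    return {name: sum(s) / len(s) for name, s in all_scores.items()}

scores = extract_scores({
    "researcher": AggregatedResult(metrics={"accuracy": 8.0}),
    "writer": AggregatedResult(metrics={"accuracy": 6.0}),
})
assert scores == {"accuracy": 7.0}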