From 007f396f4df0e92ea54913e562a7aff057a0eeab Mon Sep 17 00:00:00 2001 From: Lucas Gomide Date: Thu, 10 Jul 2025 20:56:06 -0300 Subject: [PATCH] chore: split Experimental evaluation classes --- src/crewai/evaluation/__init__.py | 2 +- src/crewai/evaluation/experiment/__init__.py | 8 ++ .../{experiment.py => experiment/result.py} | 121 +----------------- src/crewai/evaluation/experiment/runner.py | 109 ++++++++++++++++ 4 files changed, 125 insertions(+), 115 deletions(-) create mode 100644 src/crewai/evaluation/experiment/__init__.py rename src/crewai/evaluation/{experiment.py => experiment/result.py} (62%) create mode 100644 src/crewai/evaluation/experiment/runner.py diff --git a/src/crewai/evaluation/__init__.py b/src/crewai/evaluation/__init__.py index 076f66123..3f1d8b4df 100644 --- a/src/crewai/evaluation/__init__.py +++ b/src/crewai/evaluation/__init__.py @@ -38,7 +38,7 @@ from crewai.evaluation.agent_evaluator import ( from crewai.evaluation.experiment import ( ExperimentRunner, ExperimentResults, - TestCaseResult + ExperimentResult ) __all__ = [ diff --git a/src/crewai/evaluation/experiment/__init__.py b/src/crewai/evaluation/experiment/__init__.py new file mode 100644 index 000000000..bd6708393 --- /dev/null +++ b/src/crewai/evaluation/experiment/__init__.py @@ -0,0 +1,8 @@ +from crewai.evaluation.experiment.runner import ExperimentRunner +from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult + +__all__ = [ + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] diff --git a/src/crewai/evaluation/experiment.py b/src/crewai/evaluation/experiment/result.py similarity index 62% rename from src/crewai/evaluation/experiment.py rename to src/crewai/evaluation/experiment/result.py index 8c68c1c09..bdfd0e23f 100644 --- a/src/crewai/evaluation/experiment.py +++ b/src/crewai/evaluation/experiment/result.py @@ -1,35 +1,22 @@ -"""Agent evaluation experiments for crewAI. - -This module provides tools for running experiments to evaluate agent performance -across a dataset of test cases, as well as persistence and visualization of results. 
-""" - -from collections import defaultdict -from hashlib import md5 import json import os from datetime import datetime -from typing import List, Dict, Union, Optional, Any, Tuple -from dataclasses import dataclass, asdict +from typing import List, Dict, Optional, Any from rich.console import Console from rich.table import Table from rich.panel import Panel - -from crewai import Crew -from crewai.evaluation import AgentEvaluator, create_default_evaluator from pydantic import BaseModel -from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult -class TestCaseResult(BaseModel): +class ExperimentResult(BaseModel): identifier: str - inputs: Dict[str, Any] - score: Union[int, Dict[str, Union[int, float]]] - expected_score: Union[int, Dict[str, Union[int, float]]] + inputs: dict[str, Any] + score: int | dict[str, int | float] + expected_score: int | dict[str, int | float] passed: bool - agent_evaluations: Optional[Dict[str, Any]] = None + agent_evaluations: dict[str, Any] | None = None class ExperimentResults: - def __init__(self, results: List[TestCaseResult], metadata: Optional[Dict[str, Any]] = None): + def __init__(self, results: List[ExperimentResult], metadata: Optional[Dict[str, Any]] = None): self.results = results self.metadata = metadata or {} self.timestamp = datetime.now() @@ -204,97 +191,3 @@ class ExperimentResults: table.add_row("➖ Missing Tests", str(len(missing_tests)), details) self.console.print(table) - - -class ExperimentRunner: - def __init__(self, dataset: Optional[List[Dict[str, Any]]] = None, evaluator: AgentEvaluator | None = None): - self.dataset = dataset or [] - self.evaluator = evaluator - self.console = Console() - - def run(self, crew: Optional[Crew] = None) -> ExperimentResults: - if not self.dataset: - raise ValueError("No dataset provided. 
Use load_dataset() or provide dataset in constructor.") - - if not crew: - raise ValueError("crew must be provided.") - - if not self.evaluator: - self.evaluator = create_default_evaluator(crew=crew) - - results = [] - - for test_case in self.dataset: - self.evaluator.reset_iterations_results() - result = self._run_test_case(test_case, crew) - results.append(result) - - - return ExperimentResults(results) - - def _run_test_case(self, test_case: Dict[str, Any], crew: Crew) -> TestCaseResult: - inputs = test_case["inputs"] - expected_score = test_case["expected_score"] - identifier = test_case.get("identifier") or md5(str(test_case)).hexdigest() - - try: - self.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]") - crew.kickoff(inputs=inputs) - - agent_evaluations = self.evaluator.get_agent_evaluation() - - actual_score = self._extract_scores(agent_evaluations) - - passed = self._compare_scores(expected_score, actual_score) - return TestCaseResult( - identifier=identifier, - inputs=inputs, - score=actual_score, - expected_score=expected_score, - passed=passed, - agent_evaluations=agent_evaluations - ) - - except Exception as e: - self.console.print(f"[red]Error running test case: {str(e)}[/red]") - return TestCaseResult( - identifier=identifier, - inputs=inputs, - score=0, - expected_score=expected_score, - passed=False - ) - - def _extract_scores(self, agent_evaluations: Dict[str, AgentAggregatedEvaluationResult]) -> Union[int, Dict[str, int]]: - all_scores = defaultdict(list) - for evaluation in agent_evaluations.values(): - for metric_name, score in evaluation.metrics.items(): - if score.score is not None: - all_scores[metric_name.value].append(score.score) - - avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()} - - if len(avg_scores) == 1: - return list(avg_scores.values())[0] - - return avg_scores - - def _compare_scores(self, expected: Union[int, Dict[str, int]], - actual: Union[int, Dict[str, int]]) -> bool: - if isinstance(expected, (int, float)) and isinstance(actual, (int, float)): - return actual >= expected - - elif isinstance(expected, dict) and isinstance(actual, (int, float)): - return False - - elif isinstance(expected, (int, float)) and isinstance(actual, dict): - avg_score = sum(actual.values()) / len(actual) - return avg_score >= expected - - elif isinstance(expected, dict) and isinstance(actual, dict): - for metric, exp_score in expected.items(): - if metric not in actual or actual[metric] < exp_score: - return False - return True - - return False \ No newline at end of file diff --git a/src/crewai/evaluation/experiment/runner.py b/src/crewai/evaluation/experiment/runner.py new file mode 100644 index 000000000..9b784ac60 --- /dev/null +++ b/src/crewai/evaluation/experiment/runner.py @@ -0,0 +1,109 @@ +from collections import defaultdict +from hashlib import md5 +from typing import List, Dict, Union, Optional, Any +from rich.console import Console + +from crewai import Crew +from crewai.evaluation import AgentEvaluator, create_default_evaluator +from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult +from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult + +class ExperimentRunner: + def __init__(self, dataset: List[Dict[str, Any]]): + self.dataset = dataset or [] + self.evaluator = None + self.console = Console() + + def run(self, crew: Optional[Crew] = None) -> ExperimentResults: + if not crew: + raise ValueError("crew must be provided.") + + self.evaluator = 
create_default_evaluator(crew=crew) + + results = [] + + for test_case in self.dataset: + self.evaluator.reset_iterations_results() + result = self._run_test_case(test_case, crew) + results.append(result) + + + return ExperimentResults(results) + + def _run_test_case(self, test_case: Dict[str, Any], crew: Crew) -> ExperimentResult: + inputs = test_case["inputs"] + expected_score = test_case["expected_score"] + identifier = test_case.get("identifier") or md5(str(test_case), usedforsecurity=False).hexdigest() + + try: + self.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]") + crew.kickoff(inputs=inputs) + + agent_evaluations = self.evaluator.get_agent_evaluation() + + actual_score = self._extract_scores(agent_evaluations) + + passed = self._assert_scores(expected_score, actual_score) + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=actual_score, + expected_score=expected_score, + passed=passed, + agent_evaluations=agent_evaluations + ) + + except Exception as e: + self.console.print(f"[red]Error running test case: {str(e)}[/red]") + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=0, + expected_score=expected_score, + passed=False + ) + + def _extract_scores(self, agent_evaluations: Dict[str, AgentAggregatedEvaluationResult]) -> Union[int, Dict[str, int]]: + all_scores = defaultdict(list) + for evaluation in agent_evaluations.values(): + for metric_name, score in evaluation.metrics.items(): + if score.score is not None: + all_scores[metric_name.value].append(score.score) + + avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()} + + if len(avg_scores) == 1: + return list(avg_scores.values())[0] + + return avg_scores + + def _assert_scores(self, expected: Union[int, Dict[str, int]], + actual: Union[int, Dict[str, int]]) -> bool: + """ + Compare expected and actual scores, and return whether the test case passed. + + The rules for comparison are as follows: + + - If both expected and actual scores are single numbers, the actual score must be greater than or equal to the expected score. + - If the expected score is a single number and the actual score is a dict, the test case fails. + - If the expected score is a dict and the actual score is a single number, the test case fails. + - If both expected and actual scores are dicts, the actual score must have all the same keys as the expected score, and the value for each key must be greater than or equal to the expected score. + """ + + if isinstance(expected, (int, float)) and isinstance(actual, (int, float)): + return actual >= expected + + elif isinstance(expected, dict) and isinstance(actual, (int, float)): + return False + + elif isinstance(expected, (int, float)) and isinstance(actual, dict): + avg_score = sum(actual.values()) / len(actual) + return avg_score >= expected + + elif isinstance(expected, dict) and isinstance(actual, dict): + for metric, exp_score in expected.items(): + if metric not in actual or actual[metric] < exp_score: + return False + return True + + return False \ No newline at end of file
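---

Usage sketch, assuming only the re-exports added above in crewai/evaluation/__init__.py and crewai/evaluation/experiment/__init__.py; the crew wiring, the input keys, and the per-metric score names are illustrative placeholders, not values defined by this patch:

from crewai import Crew
from crewai.evaluation import ExperimentRunner, ExperimentResults


def run_experiment(crew: Crew) -> ExperimentResults:
    # Each test case mirrors the shape read by ExperimentRunner._run_test_case:
    # "inputs" is forwarded to crew.kickoff(), "expected_score" is either a single
    # number or a per-metric mapping, and a missing "identifier" is derived by the
    # runner from an md5 of the test case.
    dataset = [
        {
            "identifier": "capital-question",
            "inputs": {"question": "What is the capital of France?"},  # placeholder inputs
            "expected_score": 7,
        },
        {
            "identifier": "report-summary",
            "inputs": {"question": "Summarize the quarterly report"},
            # Hypothetical metric names; the real keys come from the evaluator's metric enum.
            "expected_score": {"goal_alignment": 6, "semantic_quality": 7},
        },
    ]

    runner = ExperimentRunner(dataset=dataset)
    results = runner.run(crew=crew)

    for result in results.results:  # ExperimentResult instances
        print(result.identifier, result.passed, result.score)

    return results

Because crewai/evaluation/__init__.py and the new experiment/__init__.py re-export ExperimentRunner, ExperimentResults and ExperimentResult, existing `from crewai.evaluation import ...` and `from crewai.evaluation.experiment import ...` call sites are unaffected by the file split; the one visible rename is TestCaseResult becoming ExperimentResult.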