From 4439755b2dc455b2ab18014d8809b9877e5cc677 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Fri, 11 Jul 2025 14:43:29 -0300
Subject: [PATCH] feat: add experimental folder for beta features (#3141)

---
 src/crewai/crew.py | 2 +-
 src/crewai/evaluation/experiment/__init__.py | 8 ----
 src/crewai/evaluation/metrics/__init__.py | 0
 src/crewai/experimental/__init__.py | 40 +++++++++++++++++++
 .../{ => experimental}/evaluation/__init__.py | 29 +++++---------
 .../evaluation/agent_evaluator.py | 10 ++---
 .../evaluation/base_evaluator.py | 0
 .../evaluation/evaluation_display.py | 4 +-
 .../evaluation/evaluation_listener.py | 0
 .../evaluation/experiment/__init__.py | 8 ++++
 .../evaluation/experiment/result.py | 2 +-
 .../evaluation/experiment/result_display.py | 2 +-
 .../evaluation/experiment/runner.py | 8 ++--
 .../evaluation/json_parser.py | 0
 .../evaluation/metrics/__init__.py | 26 ++++++++++++
 .../evaluation/metrics/goal_metrics.py | 4 +-
 .../evaluation/metrics/reasoning_metrics.py | 4 +-
 .../metrics/semantic_quality_metrics.py | 4 +-
 .../evaluation/metrics/tools_metrics.py | 4 +-
 tests/evaluation/metrics/test_goal_metrics.py | 4 +-
 .../metrics/test_reasoning_metrics.py | 4 +-
 .../metrics/test_semantic_quality_metrics.py | 4 +-
 .../evaluation/metrics/test_tools_metrics.py | 2 +-
 tests/evaluation/test_agent_evaluator.py | 8 ++--
 tests/evaluation/test_experiment_result.py | 2 +-
 tests/evaluation/test_experiment_runner.py | 16 ++++----
 26 files changed, 125 insertions(+), 70 deletions(-)
 delete mode 100644 src/crewai/evaluation/experiment/__init__.py
 delete mode 100644 src/crewai/evaluation/metrics/__init__.py
 create mode 100644 src/crewai/experimental/__init__.py
 rename src/crewai/{ => experimental}/evaluation/__init__.py (62%)
 rename src/crewai/{ => experimental}/evaluation/agent_evaluator.py (94%)
 rename src/crewai/{ => experimental}/evaluation/base_evaluator.py (100%)
 rename src/crewai/{ => experimental}/evaluation/evaluation_display.py (98%)
 rename src/crewai/{ => experimental}/evaluation/evaluation_listener.py (100%)
 create mode 100644 src/crewai/experimental/evaluation/experiment/__init__.py
 rename src/crewai/{ => experimental}/evaluation/experiment/result.py (97%)
 rename src/crewai/{ => experimental}/evaluation/experiment/result_display.py (97%)
 rename src/crewai/{ => experimental}/evaluation/experiment/runner.py (92%)
 rename src/crewai/{ => experimental}/evaluation/json_parser.py (100%)
 create mode 100644 src/crewai/experimental/evaluation/metrics/__init__.py
 rename src/crewai/{ => experimental}/evaluation/metrics/goal_metrics.py (91%)
 rename src/crewai/{ => experimental}/evaluation/metrics/reasoning_metrics.py (98%)
 rename src/crewai/{ => experimental}/evaluation/metrics/semantic_quality_metrics.py (91%)
 rename src/crewai/{ => experimental}/evaluation/metrics/tools_metrics.py (98%)

diff --git a/src/crewai/crew.py b/src/crewai/crew.py
index 89acee505..1f02b9b06 100644
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel):
         evaluator = CrewEvaluator(test_crew, llm_instance)
 
         if include_agent_eval:
-            from crewai.evaluation import create_default_evaluator
+            from crewai.experimental.evaluation import create_default_evaluator
             agent_evaluator = create_default_evaluator(crew=test_crew)
 
         for i in range(1, n_iterations + 1):
diff --git a/src/crewai/evaluation/experiment/__init__.py b/src/crewai/evaluation/experiment/__init__.py
deleted file mode 100644
index bd6708393..000000000
--- a/src/crewai/evaluation/experiment/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from crewai.evaluation.experiment.runner import ExperimentRunner
-from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
-
-__all__ = [
-    "ExperimentRunner",
-    "ExperimentResults",
-    "ExperimentResult"
-]
diff --git a/src/crewai/evaluation/metrics/__init__.py b/src/crewai/evaluation/metrics/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/crewai/experimental/__init__.py b/src/crewai/experimental/__init__.py
new file mode 100644
index 000000000..67eb7847f
--- /dev/null
+++ b/src/crewai/experimental/__init__.py
@@ -0,0 +1,40 @@
+from crewai.experimental.evaluation import (
+    BaseEvaluator,
+    EvaluationScore,
+    MetricCategory,
+    AgentEvaluationResult,
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    EvaluationTraceCallback,
+    create_evaluation_callbacks,
+    AgentEvaluator,
+    create_default_evaluator,
+    ExperimentRunner,
+    ExperimentResults,
+    ExperimentResult,
+)
+
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationScore",
+    "MetricCategory",
+    "AgentEvaluationResult",
+    "SemanticQualityEvaluator",
+    "GoalAlignmentEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "EvaluationTraceCallback",
+    "create_evaluation_callbacks",
+    "AgentEvaluator",
+    "create_default_evaluator",
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
\ No newline at end of file
diff --git a/src/crewai/evaluation/__init__.py b/src/crewai/experimental/evaluation/__init__.py
similarity index 62%
rename from src/crewai/evaluation/__init__.py
rename to src/crewai/experimental/evaluation/__init__.py
index bdcdce76d..024c3aaed 100644
--- a/src/crewai/evaluation/__init__.py
+++ b/src/crewai/experimental/evaluation/__init__.py
@@ -1,41 +1,30 @@
-from crewai.evaluation.base_evaluator import (
+from crewai.experimental.evaluation.base_evaluator import (
     BaseEvaluator,
     EvaluationScore,
     MetricCategory,
     AgentEvaluationResult
 )
 
-from crewai.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-
-from crewai.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-
-from crewai.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-
-
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics import (
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator
 )
 
-from crewai.evaluation.evaluation_listener import (
+from crewai.experimental.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
     create_evaluation_callbacks
 )
-
-from crewai.evaluation.agent_evaluator import (
+from crewai.experimental.evaluation.agent_evaluator import (
     AgentEvaluator,
     create_default_evaluator
 )
 
-from crewai.evaluation.experiment import (
+from crewai.experimental.evaluation.experiment import (
     ExperimentRunner,
     ExperimentResults,
     ExperimentResult
 )
@@ -59,4 +48,4 @@ __all__ = [
     "ExperimentRunner",
     "ExperimentResults",
     "ExperimentResult"
-]
\ No newline at end of file
+]
diff --git a/src/crewai/evaluation/agent_evaluator.py b/src/crewai/experimental/evaluation/agent_evaluator.py
similarity index 94%
rename from src/crewai/evaluation/agent_evaluator.py
rename to src/crewai/experimental/evaluation/agent_evaluator.py
index 5469023a9..3cf9583dc 100644
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/experimental/evaluation/agent_evaluator.py
@@ -1,16 +1,16 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
 from typing import Any, Dict
 from collections import defaultdict
-from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 
 class AgentEvaluator:
     def __init__(
@@ -161,7 +161,7 @@ class AgentEvaluator:
         return result
 
 def create_default_evaluator(crew, llm=None):
-    from crewai.evaluation import (
+    from crewai.experimental.evaluation import (
         GoalAlignmentEvaluator,
         SemanticQualityEvaluator,
         ToolSelectionEvaluator,
diff --git a/src/crewai/evaluation/base_evaluator.py b/src/crewai/experimental/evaluation/base_evaluator.py
similarity index 100%
rename from src/crewai/evaluation/base_evaluator.py
rename to src/crewai/experimental/evaluation/base_evaluator.py
diff --git a/src/crewai/evaluation/evaluation_display.py b/src/crewai/experimental/evaluation/evaluation_display.py
similarity index 98%
rename from src/crewai/evaluation/evaluation_display.py
rename to src/crewai/experimental/evaluation/evaluation_display.py
index 0e30c53f0..c25ea6db4 100644
--- a/src/crewai/evaluation/evaluation_display.py
+++ b/src/crewai/experimental/evaluation/evaluation_display.py
@@ -3,8 +3,8 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.evaluation import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm
 
diff --git a/src/crewai/evaluation/evaluation_listener.py b/src/crewai/experimental/evaluation/evaluation_listener.py
similarity index 100%
rename from src/crewai/evaluation/evaluation_listener.py
rename to src/crewai/experimental/evaluation/evaluation_listener.py
diff --git a/src/crewai/experimental/evaluation/experiment/__init__.py b/src/crewai/experimental/evaluation/experiment/__init__.py
new file mode 100644
index 000000000..8e4fd8983
--- /dev/null
+++ b/src/crewai/experimental/evaluation/experiment/__init__.py
@@ -0,0 +1,8 @@
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+
+__all__ = [
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
diff --git a/src/crewai/evaluation/experiment/result.py b/src/crewai/experimental/evaluation/experiment/result.py
similarity index 97%
rename from src/crewai/evaluation/experiment/result.py
rename to src/crewai/experimental/evaluation/experiment/result.py
index eb6c451ac..44b24e95a 100644
--- a/src/crewai/evaluation/experiment/result.py
+++ b/src/crewai/experimental/evaluation/experiment/result.py
@@ -18,7 +18,7 @@ class ExperimentResults:
         self.metadata = metadata or {}
         self.timestamp = datetime.now(timezone.utc)
 
-        from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
+        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
         self.display = ExperimentResultsDisplay()
 
     def to_json(self, filepath: str | None = None) -> dict[str, Any]:
diff --git a/src/crewai/evaluation/experiment/result_display.py b/src/crewai/experimental/evaluation/experiment/result_display.py
similarity index 97%
rename from src/crewai/evaluation/experiment/result_display.py
rename to src/crewai/experimental/evaluation/experiment/result_display.py
index 81d60f5f7..79bab2fc8 100644
--- a/src/crewai/evaluation/experiment/result_display.py
+++ b/src/crewai/experimental/evaluation/experiment/result_display.py
@@ -2,7 +2,7 @@ from typing import Dict, Any
 from rich.console import Console
 from rich.table import Table
 from rich.panel import Panel
-from crewai.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
 
 class ExperimentResultsDisplay:
     def __init__(self):
diff --git a/src/crewai/evaluation/experiment/runner.py b/src/crewai/experimental/evaluation/experiment/runner.py
similarity index 92%
rename from src/crewai/evaluation/experiment/runner.py
rename to src/crewai/experimental/evaluation/experiment/runner.py
index 52fe756b9..b1b653e68 100644
--- a/src/crewai/evaluation/experiment/runner.py
+++ b/src/crewai/experimental/evaluation/experiment/runner.py
@@ -3,10 +3,10 @@ from hashlib import md5
 from typing import Any
 
 from crewai import Crew
-from crewai.evaluation import AgentEvaluator, create_default_evaluator
-from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
-from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
+from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class ExperimentRunner:
     def __init__(self, dataset: list[dict[str, Any]]):
diff --git a/src/crewai/evaluation/json_parser.py b/src/crewai/experimental/evaluation/json_parser.py
similarity index 100%
rename from src/crewai/evaluation/json_parser.py
rename to src/crewai/experimental/evaluation/json_parser.py
diff --git a/src/crewai/experimental/evaluation/metrics/__init__.py b/src/crewai/experimental/evaluation/metrics/__init__.py
new file mode 100644
index 000000000..1b306587b
--- /dev/null
+++ b/src/crewai/experimental/evaluation/metrics/__init__.py
@@ -0,0 +1,26 @@
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
+    ReasoningEfficiencyEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.tools_metrics import (
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.goal_metrics import (
+    GoalAlignmentEvaluator
+)
+
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
+    SemanticQualityEvaluator
+)
+
+__all__ = [
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "GoalAlignmentEvaluator",
+    "SemanticQualityEvaluator"
+]
\ No newline at end of file
diff --git a/src/crewai/evaluation/metrics/goal_metrics.py b/src/crewai/experimental/evaluation/metrics/goal_metrics.py
similarity index 91%
rename from src/crewai/evaluation/metrics/goal_metrics.py
rename to src/crewai/experimental/evaluation/metrics/goal_metrics.py
index bc6c63801..85f0b91aa 100644
--- a/src/crewai/evaluation/metrics/goal_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/goal_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task
 
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 
 class GoalAlignmentEvaluator(BaseEvaluator):
     @property
diff --git a/src/crewai/evaluation/metrics/reasoning_metrics.py b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
similarity index 98%
rename from src/crewai/evaluation/metrics/reasoning_metrics.py
rename to src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
index e1ce06c23..605e5b06c 100644
--- a/src/crewai/evaluation/metrics/reasoning_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py
@@ -16,8 +16,8 @@ from collections.abc import Sequence
 
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.tasks.task_output import TaskOutput
 
 class ReasoningPatternType(Enum):
diff --git a/src/crewai/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
similarity index 91%
rename from src/crewai/evaluation/metrics/semantic_quality_metrics.py
rename to src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
index a12c62ae3..011618481 100644
--- a/src/crewai/evaluation/metrics/semantic_quality_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task
 
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 
 class SemanticQualityEvaluator(BaseEvaluator):
     @property
diff --git a/src/crewai/evaluation/metrics/tools_metrics.py b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
similarity index 98%
rename from src/crewai/evaluation/metrics/tools_metrics.py
rename to src/crewai/experimental/evaluation/metrics/tools_metrics.py
index 00762fc76..16b2ec7cd 100644
--- a/src/crewai/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py
@@ -1,8 +1,8 @@
 import json
 from typing import Dict, Any
 
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.agent import Agent
 from crewai.task import Task
 
diff --git a/tests/evaluation/metrics/test_goal_metrics.py b/tests/evaluation/metrics/test_goal_metrics.py
index 69ec42d1f..be44ecd7a 100644
--- a/tests/evaluation/metrics/test_goal_metrics.py
+++ b/tests/evaluation/metrics/test_goal_metrics.py
@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock
 
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.utilities.llm_utils import LLM
 
 
diff --git a/tests/evaluation/metrics/test_reasoning_metrics.py b/tests/evaluation/metrics/test_reasoning_metrics.py
index 5d8015e3b..d5f2cf1f5 100644
--- a/tests/evaluation/metrics/test_reasoning_metrics.py
+++ b/tests/evaluation/metrics/test_reasoning_metrics.py
@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
 from typing import List, Dict, Any
 
 from crewai.tasks.task_output import TaskOutput
-from crewai.evaluation.metrics.reasoning_metrics import (
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
     ReasoningEfficiencyEvaluator,
 )
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
-from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
 
 class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
     @pytest.fixture
diff --git a/tests/evaluation/metrics/test_semantic_quality_metrics.py b/tests/evaluation/metrics/test_semantic_quality_metrics.py
index e5adb198b..0d4dd386d 100644
--- a/tests/evaluation/metrics/test_semantic_quality_metrics.py
+++ b/tests/evaluation/metrics/test_semantic_quality_metrics.py
@@ -1,7 +1,7 @@
 from unittest.mock import patch, MagicMock
 
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
 
diff --git a/tests/evaluation/metrics/test_tools_metrics.py b/tests/evaluation/metrics/test_tools_metrics.py
index ad6e50fdd..16b907ca8 100644
--- a/tests/evaluation/metrics/test_tools_metrics.py
+++ b/tests/evaluation/metrics/test_tools_metrics.py
@@ -1,6 +1,6 @@
 from unittest.mock import patch, MagicMock
 
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics.tools_metrics import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator
diff --git a/tests/evaluation/test_agent_evaluator.py b/tests/evaluation/test_agent_evaluator.py
index f7935e85f..88f493908 100644
--- a/tests/evaluation/test_agent_evaluator.py
+++ b/tests/evaluation/test_agent_evaluator.py
@@ -3,9 +3,9 @@ import pytest
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.crew import Crew
-from crewai.evaluation.agent_evaluator import AgentEvaluator
-from crewai.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.evaluation import (
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
     GoalAlignmentEvaluator,
     SemanticQualityEvaluator,
     ToolSelectionEvaluator,
@@ -14,7 +14,7 @@ from crewai.evaluation import (
     ReasoningEfficiencyEvaluator
 )
 
-from crewai.evaluation import create_default_evaluator
+from crewai.experimental.evaluation import create_default_evaluator
 
 class TestAgentEvaluator:
     @pytest.fixture
diff --git a/tests/evaluation/test_experiment_result.py b/tests/evaluation/test_experiment_result.py
index 1c281067a..5ba390b48 100644
--- a/tests/evaluation/test_experiment_result.py
+++ b/tests/evaluation/test_experiment_result.py
@@ -1,7 +1,7 @@
 import pytest
 from unittest.mock import MagicMock, patch
 
-from crewai.evaluation.experiment.result import ExperimentResult, ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
 
 
 class TestExperimentResult:
diff --git a/tests/evaluation/test_experiment_runner.py b/tests/evaluation/test_experiment_runner.py
index f7e4b2699..58382fa65 100644
--- a/tests/evaluation/test_experiment_runner.py
+++ b/tests/evaluation/test_experiment_runner.py
@@ -2,10 +2,10 @@ import pytest
 from unittest.mock import MagicMock, patch
 
 from crewai.crew import Crew
-from crewai.evaluation.experiment.runner import ExperimentRunner
-from crewai.evaluation.experiment.result import ExperimentResults
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-from crewai.evaluation.base_evaluator import MetricCategory, EvaluationScore
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
 
 
 class TestExperimentRunner:
@@ -44,7 +44,7 @@ class TestExperimentRunner:
 
         return {"Test Agent": agent_evaluation}
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
@@ -102,7 +102,7 @@ class TestExperimentRunner:
 
         assert mock_evaluator.get_agent_evaluation.call_count == 3
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
@@ -130,7 +130,7 @@ class TestExperimentRunner:
         assert "unknown_metric" in result.expected_score.keys()
         assert result.passed is True
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
@@ -163,7 +163,7 @@ class TestExperimentRunner:
         assert "goal_alignment" in result.expected_score.keys()
         assert result.passed is True
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
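With this change, the agent-evaluation entry points move from crewai.evaluation to crewai.experimental.evaluation (and are re-exported from crewai.experimental). A minimal import sketch for downstream code, using only names the new __init__ modules above re-export:

    # old location (module is renamed by this patch, so this import no longer resolves)
    # from crewai.evaluation import create_default_evaluator

    # new location under the experimental namespace
    from crewai.experimental.evaluation import (
        create_default_evaluator,
        ExperimentRunner,
        ExperimentResults,
    )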