feat: add experimental folder for beta features

Author: Lucas Gomide
Date: 2025-07-11 14:27:13 -03:00
parent e3b044c044
commit a810c31ced
26 changed files with 125 additions and 70 deletions
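
Note: the agent evaluation package moves from crewai.evaluation to crewai.experimental.evaluation; the public symbols appear to be re-exported under the new path unchanged. As a hedged, illustrative sketch (not part of this diff), downstream code would update its imports roughly as follows; the two names shown are taken from the new package's __all__ below:

# Hypothetical consumer-side migration: only the import path changes.
# Before this commit:
#   from crewai.evaluation import AgentEvaluator, create_default_evaluator
# After this commit:
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator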

View File

@@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel):
        evaluator = CrewEvaluator(test_crew, llm_instance)
        if include_agent_eval:
-            from crewai.evaluation import create_default_evaluator
+            from crewai.experimental.evaluation import create_default_evaluator
            agent_evaluator = create_default_evaluator(crew=test_crew)
        for i in range(1, n_iterations + 1):

View File

@@ -1,8 +0,0 @@
from crewai.evaluation.experiment.runner import ExperimentRunner
from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
__all__ = [
"ExperimentRunner",
"ExperimentResults",
"ExperimentResult"
]

View File

@@ -0,0 +1,40 @@
from crewai.experimental.evaluation import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
AgentEvaluationResult,
SemanticQualityEvaluator,
GoalAlignmentEvaluator,
ReasoningEfficiencyEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
EvaluationTraceCallback,
create_evaluation_callbacks,
AgentEvaluator,
create_default_evaluator,
ExperimentRunner,
ExperimentResults,
ExperimentResult,
)
__all__ = [
"BaseEvaluator",
"EvaluationScore",
"MetricCategory",
"AgentEvaluationResult",
"SemanticQualityEvaluator",
"GoalAlignmentEvaluator",
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"EvaluationTraceCallback",
"create_evaluation_callbacks",
"AgentEvaluator",
"create_default_evaluator",
"ExperimentRunner",
"ExperimentResults",
"ExperimentResult"
]
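
A minimal usage sketch of the re-exported entry point, assuming an ordinary crewAI crew; only the new import path and the create_default_evaluator(crew, llm=None) signature come from this diff, while the Agent/Task/Crew setup below is illustrative:

from crewai import Agent, Crew, Task
from crewai.experimental.evaluation import create_default_evaluator

# Illustrative crew; these constructor arguments are standard crewAI usage,
# not something introduced by this commit.
researcher = Agent(role="Researcher", goal="Answer questions", backstory="Test agent")
task = Task(description="Summarize the topic", expected_output="A short summary", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])

# New import path after this commit; per agent_evaluator.py, llm defaults to None.
agent_evaluator = create_default_evaluator(crew=crew)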

View File

@@ -1,41 +1,30 @@
-from crewai.evaluation.base_evaluator import (
+from crewai.experimental.evaluation.base_evaluator import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult
)
-from crewai.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-from crewai.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-from crewai.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics import (
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
-from crewai.evaluation.evaluation_listener import (
+from crewai.experimental.evaluation.evaluation_listener import (
    EvaluationTraceCallback,
    create_evaluation_callbacks
)
-from crewai.evaluation.agent_evaluator import (
+from crewai.experimental.evaluation.agent_evaluator import (
    AgentEvaluator,
    create_default_evaluator
)
-from crewai.evaluation.experiment import (
+from crewai.experimental.evaluation.experiment import (
    ExperimentRunner,
    ExperimentResults,
    ExperimentResult

@@ -59,4 +48,4 @@ __all__ = [
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]

View File

@@ -1,16 +1,16 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.agent import Agent
from crewai.task import Task
-from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
from typing import Any, Dict
from collections import defaultdict
-from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.crew import Crew
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult

class AgentEvaluator:
    def __init__(

@@ -161,7 +161,7 @@ class AgentEvaluator:
        return result

def create_default_evaluator(crew, llm=None):
-    from crewai.evaluation import (
+    from crewai.experimental.evaluation import (
        GoalAlignmentEvaluator,
        SemanticQualityEvaluator,
        ToolSelectionEvaluator,

View File

@@ -3,8 +3,8 @@ from typing import Dict, Any, List
from rich.table import Table
from rich.box import HEAVY_EDGE, ROUNDED
from collections.abc import Sequence
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.evaluation import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation import EvaluationScore
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.llm_utils import create_llm

View File

@@ -0,0 +1,8 @@
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
__all__ = [
"ExperimentRunner",
"ExperimentResults",
"ExperimentResult"
]
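
A hedged sketch of constructing the runner via the new experiment package; the constructor signature ExperimentRunner(dataset: list[dict[str, Any]]) comes from runner.py below, while the per-case keys ("inputs", "expected_score") are assumptions inferred from the tests and not confirmed by this diff:

from crewai.experimental.evaluation.experiment import ExperimentRunner

# Assumed dataset shape: each case carries crew inputs plus the per-metric
# scores the run is expected to reach ("goal_alignment" appears in the tests).
dataset = [
    {
        "inputs": {"topic": "renewable energy"},   # assumed key
        "expected_score": {"goal_alignment": 8},   # assumed key
    },
]
runner = ExperimentRunner(dataset=dataset)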

View File

@@ -18,7 +18,7 @@ class ExperimentResults:
        self.metadata = metadata or {}
        self.timestamp = datetime.now(timezone.utc)
-        from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
+        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
        self.display = ExperimentResultsDisplay()

    def to_json(self, filepath: str | None = None) -> dict[str, Any]:

View File

@@ -2,7 +2,7 @@ from typing import Dict, Any
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
-from crewai.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResults

class ExperimentResultsDisplay:
    def __init__(self):

View File

@@ -3,10 +3,10 @@ from hashlib import md5
from typing import Any
from crewai import Crew
-from crewai.evaluation import AgentEvaluator, create_default_evaluator
-from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
-from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
+from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult

class ExperimentRunner:
    def __init__(self, dataset: list[dict[str, Any]]):

View File

@@ -0,0 +1,26 @@
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator
)
from crewai.experimental.evaluation.metrics.tools_metrics import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator
)
from crewai.experimental.evaluation.metrics.goal_metrics import (
GoalAlignmentEvaluator
)
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
SemanticQualityEvaluator
)
__all__ = [
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"GoalAlignmentEvaluator",
"SemanticQualityEvaluator"
]

View File

@@ -3,8 +3,8 @@ from typing import Any, Dict
from crewai.agent import Agent
from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

class GoalAlignmentEvaluator(BaseEvaluator):
    @property

View File

@@ -16,8 +16,8 @@ from collections.abc import Sequence
from crewai.agent import Agent
from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.tasks.task_output import TaskOutput

class ReasoningPatternType(Enum):

View File

@@ -3,8 +3,8 @@ from typing import Any, Dict
from crewai.agent import Agent
from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

class SemanticQualityEvaluator(BaseEvaluator):
    @property

View File

@@ -1,8 +1,8 @@
import json
from typing import Dict, Any
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.agent import Agent
from crewai.task import Task

View File

@@ -1,8 +1,8 @@
from unittest.mock import patch, MagicMock
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM

View File

@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
from typing import List, Dict, Any
from crewai.tasks.task_output import TaskOutput
-from crewai.evaluation.metrics.reasoning_metrics import (
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
)
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
-from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore

class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture

View File

@@ -1,7 +1,7 @@
from unittest.mock import patch, MagicMock
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM

View File

@@ -1,6 +1,6 @@
from unittest.mock import patch, MagicMock
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator

View File

@@ -3,9 +3,9 @@ import pytest
from crewai.agent import Agent
from crewai.task import Task
from crewai.crew import Crew
-from crewai.evaluation.agent_evaluator import AgentEvaluator
-from crewai.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.evaluation import (
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
    GoalAlignmentEvaluator,
    SemanticQualityEvaluator,
    ToolSelectionEvaluator,

@@ -14,7 +14,7 @@ from crewai.evaluation import (
    ReasoningEfficiencyEvaluator
)
-from crewai.evaluation import create_default_evaluator
+from crewai.experimental.evaluation import create_default_evaluator

class TestAgentEvaluator:
    @pytest.fixture
    def mock_crew(self):

View File

@@ -1,7 +1,7 @@
import pytest
from unittest.mock import MagicMock, patch
-from crewai.evaluation.experiment.result import ExperimentResult, ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults

class TestExperimentResult:

View File

@@ -2,10 +2,10 @@ import pytest
from unittest.mock import MagicMock, patch
from crewai.crew import Crew
-from crewai.evaluation.experiment.runner import ExperimentRunner
-from crewai.evaluation.experiment.result import ExperimentResults
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-from crewai.evaluation.base_evaluator import MetricCategory, EvaluationScore
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore

class TestExperimentRunner:

@@ -44,7 +44,7 @@ class TestExperimentRunner:
        return {"Test Agent": agent_evaluation}

-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {

@@ -102,7 +102,7 @@ class TestExperimentRunner:
        assert mock_evaluator.get_agent_evaluation.call_count == 3

-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {

@@ -130,7 +130,7 @@ class TestExperimentRunner:
        assert "unknown_metric" in result.expected_score.keys()
        assert result.passed is True

-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {

@@ -163,7 +163,7 @@ class TestExperimentRunner:
        assert "goal_alignment" in result.expected_score.keys()
        assert result.passed is True

-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {