feat: add experimental folder for beta features (#3141)

Author: Lucas Gomide
Date: 2025-07-11 14:43:29 -03:00 (committed by GitHub)
Parent: e3b044c044
Commit: 4439755b2d
26 changed files with 125 additions and 70 deletions

View File

@@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel):
         evaluator = CrewEvaluator(test_crew, llm_instance)
         if include_agent_eval:
-            from crewai.evaluation import create_default_evaluator
+            from crewai.experimental.evaluation import create_default_evaluator
             agent_evaluator = create_default_evaluator(crew=test_crew)
         for i in range(1, n_iterations + 1):
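
Downstream of this hunk, the agent-evaluation helper resolves from the experimental namespace. A minimal usage sketch follows; the Agent/Task/Crew setup is illustrative and not taken from this diff, while `create_default_evaluator(crew=...)` and its optional `llm` parameter do appear in the changed files.

```python
from crewai import Agent, Crew, Task
from crewai.experimental.evaluation import create_default_evaluator

# Illustrative crew; any existing crew is wired the same way.
researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="Curious analyst")
summary_task = Task(
    description="Summarize the history of Python in three sentences",
    expected_output="A three-sentence summary",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[summary_task])

# The same call Crew.test() makes after this change, via the new module path.
agent_evaluator = create_default_evaluator(crew=crew)
```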

View File

@@ -1,8 +0,0 @@
from crewai.evaluation.experiment.runner import ExperimentRunner
from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
__all__ = [
"ExperimentRunner",
"ExperimentResults",
"ExperimentResult"
]

View File

@@ -0,0 +1,40 @@
+from crewai.experimental.evaluation import (
+    BaseEvaluator,
+    EvaluationScore,
+    MetricCategory,
+    AgentEvaluationResult,
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
+    ToolSelectionEvaluator,
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    EvaluationTraceCallback,
+    create_evaluation_callbacks,
+    AgentEvaluator,
+    create_default_evaluator,
+    ExperimentRunner,
+    ExperimentResults,
+    ExperimentResult,
+)
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationScore",
+    "MetricCategory",
+    "AgentEvaluationResult",
+    "SemanticQualityEvaluator",
+    "GoalAlignmentEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "EvaluationTraceCallback",
+    "create_evaluation_callbacks",
+    "AgentEvaluator",
+    "create_default_evaluator",
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
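
For reference, the flattened namespace gives a single import site for the beta evaluation API. A small import sketch using only names from the `__all__` list above:

```python
from crewai.experimental.evaluation import (
    AgentEvaluator,
    GoalAlignmentEvaluator,
    SemanticQualityEvaluator,
    create_default_evaluator,
)

# All of these used to resolve from crewai.evaluation; the classes themselves
# still live in their own submodules.
print(AgentEvaluator.__module__)  # expected: crewai.experimental.evaluation.agent_evaluator
```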

View File

@@ -1,41 +1,30 @@
-from crewai.evaluation.base_evaluator import (
+from crewai.experimental.evaluation.base_evaluator import (
     BaseEvaluator,
     EvaluationScore,
     MetricCategory,
     AgentEvaluationResult
 )
-from crewai.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-from crewai.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-from crewai.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics import (
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator
 )
-from crewai.evaluation.evaluation_listener import (
+from crewai.experimental.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
     create_evaluation_callbacks
 )
-from crewai.evaluation.agent_evaluator import (
+from crewai.experimental.evaluation.agent_evaluator import (
     AgentEvaluator,
     create_default_evaluator
 )
-from crewai.evaluation.experiment import (
+from crewai.experimental.evaluation.experiment import (
     ExperimentRunner,
     ExperimentResults,
     ExperimentResult
@@ -59,4 +48,4 @@ __all__ = [
     "ExperimentRunner",
     "ExperimentResults",
     "ExperimentResult"
-]
+]
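
Note that the four per-metric imports collapse into one aggregate module here. A sketch of consuming it; only the class names are taken from the diff, and instantiation is left out because the evaluator constructors are not shown in these hunks:

```python
from crewai.experimental.evaluation.metrics import (
    GoalAlignmentEvaluator,
    ParameterExtractionEvaluator,
    ReasoningEfficiencyEvaluator,
    SemanticQualityEvaluator,
    ToolInvocationEvaluator,
    ToolSelectionEvaluator,
)

# A simple registry of the built-in metric evaluators, useful when wiring a
# custom AgentEvaluator by hand.
BUILTIN_METRIC_EVALUATORS = (
    GoalAlignmentEvaluator,
    ParameterExtractionEvaluator,
    ReasoningEfficiencyEvaluator,
    SemanticQualityEvaluator,
    ToolInvocationEvaluator,
    ToolSelectionEvaluator,
)
```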

View File

@@ -1,16 +1,16 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
 from typing import Any, Dict
 from collections import defaultdict
-from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class AgentEvaluator:
     def __init__(
@@ -161,7 +161,7 @@ class AgentEvaluator:
         return result
 
 def create_default_evaluator(crew, llm=None):
-    from crewai.evaluation import (
+    from crewai.experimental.evaluation import (
         GoalAlignmentEvaluator,
         SemanticQualityEvaluator,
         ToolSelectionEvaluator,
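
The factory keeps its `(crew, llm=None)` signature after the move, so callers that want a dedicated judge model can still pass one. A hedged sketch, with the model name and the `LLM(...)` wiring being illustrative rather than taken from this diff:

```python
from crewai import Crew, LLM
from crewai.experimental.evaluation import create_default_evaluator

def build_evaluator(crew: Crew):
    # Assumed model name; any LLM instance accepted by crewai should work here.
    judge_llm = LLM(model="gpt-4o-mini")
    return create_default_evaluator(crew=crew, llm=judge_llm)
```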

View File

@@ -3,8 +3,8 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.evaluation import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm

View File

@@ -0,0 +1,8 @@
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+
+__all__ = [
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]

View File

@@ -18,7 +18,7 @@ class ExperimentResults:
         self.metadata = metadata or {}
         self.timestamp = datetime.now(timezone.utc)
 
-        from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
+        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
         self.display = ExperimentResultsDisplay()
 
     def to_json(self, filepath: str | None = None) -> dict[str, Any]:
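
`to_json` keeps its signature through the move. A small persistence sketch; whether the payload is also written to disk when `filepath` is passed is assumed from the parameter name, not shown in this hunk:

```python
from typing import Any

def save_experiment_results(results, path: str = "experiment_results.json") -> dict[str, Any]:
    # `results` is assumed to be an ExperimentResults instance; per the signature
    # above, to_json returns the dict payload and presumably writes it out when a
    # filepath is provided.
    return results.to_json(filepath=path)
```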

View File

@@ -2,7 +2,7 @@ from typing import Dict, Any
 from rich.console import Console
 from rich.table import Table
 from rich.panel import Panel
-from crewai.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
 
 class ExperimentResultsDisplay:
     def __init__(self):

View File

@@ -3,10 +3,10 @@ from hashlib import md5
 from typing import Any
 
 from crewai import Crew
-from crewai.evaluation import AgentEvaluator, create_default_evaluator
-from crewai.evaluation.experiment.result_display import ExperimentResultsDisplay
-from crewai.evaluation.experiment.result import ExperimentResults, ExperimentResult
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
+from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class ExperimentRunner:
     def __init__(self, dataset: list[dict[str, Any]]):
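
`ExperimentRunner` still takes its dataset in the constructor. The sketch below is largely assumption: the item keys (`inputs`, `expected_score`) are inferred from the test assertions further down, and the `run(...)` entry point is suggested by the test names but never shown in this diff, so it is left commented out.

```python
from crewai.experimental.evaluation import ExperimentRunner

dataset = [
    {
        # Assumed item shape, inferred from tests that inspect
        # result.expected_score keys such as "goal_alignment".
        "inputs": {"topic": "renewable energy"},
        "expected_score": {"goal_alignment": 7},
    }
]

runner = ExperimentRunner(dataset=dataset)
# Hypothetical entry point (the tests patch create_default_evaluator inside
# the runner module before exercising it):
# results = runner.run(crew=my_crew)
```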

View File

@@ -0,0 +1,26 @@
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator
)
from crewai.experimental.evaluation.metrics.tools_metrics import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator
)
from crewai.experimental.evaluation.metrics.goal_metrics import (
GoalAlignmentEvaluator
)
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
SemanticQualityEvaluator
)
__all__ = [
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"GoalAlignmentEvaluator",
"SemanticQualityEvaluator"
]

View File

@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 
 class GoalAlignmentEvaluator(BaseEvaluator):
     @property

View File

@@ -16,8 +16,8 @@ from collections.abc import Sequence
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.tasks.task_output import TaskOutput
 
 class ReasoningPatternType(Enum):

View File

@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 
 class SemanticQualityEvaluator(BaseEvaluator):
     @property

View File

@@ -1,8 +1,8 @@
 import json
 from typing import Dict, Any
 
-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.agent import Agent
 from crewai.task import Task
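
All metric evaluators keep leaning on the shared JSON parser, now under the experimental path. Illustrative sketch only: the parser's exact signature and return type are not shown in this diff, so it is assumed here to take the raw LLM response text and return the parsed JSON payload.

```python
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

raw_response = 'Assessment complete: {"score": 8, "feedback": "Tool choice matched the task."}'
parsed = extract_json_from_llm_response(raw_response)
print(parsed)  # expected, if parsing succeeds: {'score': 8, 'feedback': '...'}
```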

View File

@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock
 
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.utilities.llm_utils import LLM

View File

@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
 from typing import List, Dict, Any
 
 from crewai.tasks.task_output import TaskOutput
-from crewai.evaluation.metrics.reasoning_metrics import (
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
     ReasoningEfficiencyEvaluator,
 )
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
-from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
 
 class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
     @pytest.fixture
@pytest.fixture

View File

@@ -1,7 +1,7 @@
 from unittest.mock import patch, MagicMock
 
-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM

View File

@@ -1,6 +1,6 @@
 from unittest.mock import patch, MagicMock
 
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics.tools_metrics import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator

View File

@@ -3,9 +3,9 @@ import pytest
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.crew import Crew
-from crewai.evaluation.agent_evaluator import AgentEvaluator
-from crewai.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.evaluation import (
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
     GoalAlignmentEvaluator,
     SemanticQualityEvaluator,
     ToolSelectionEvaluator,
@@ -14,7 +14,7 @@ from crewai.evaluation import (
     ReasoningEfficiencyEvaluator
 )
-from crewai.evaluation import create_default_evaluator
+from crewai.experimental.evaluation import create_default_evaluator
 
 class TestAgentEvaluator:
     @pytest.fixture
     def mock_crew(self):

View File

@@ -1,7 +1,7 @@
 import pytest
 from unittest.mock import MagicMock, patch
 
-from crewai.evaluation.experiment.result import ExperimentResult, ExperimentResults
+from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
 
 class TestExperimentResult:

View File

@@ -2,10 +2,10 @@ import pytest
 from unittest.mock import MagicMock, patch
 
 from crewai.crew import Crew
-from crewai.evaluation.experiment.runner import ExperimentRunner
-from crewai.evaluation.experiment.result import ExperimentResults
-from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
-from crewai.evaluation.base_evaluator import MetricCategory, EvaluationScore
+from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
+from crewai.experimental.evaluation.experiment.result import ExperimentResults
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
 
 class TestExperimentRunner:
@@ -44,7 +44,7 @@ class TestExperimentRunner:
         return {"Test Agent": agent_evaluation}
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
@@ -102,7 +102,7 @@ class TestExperimentRunner:
         assert mock_evaluator.get_agent_evaluation.call_count == 3
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {
@@ -130,7 +130,7 @@ class TestExperimentRunner:
         assert "unknown_metric" in result.expected_score.keys()
         assert result.passed is True
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
@@ -163,7 +163,7 @@ class TestExperimentRunner:
         assert "goal_alignment" in result.expected_score.keys()
         assert result.passed is True
 
-    @patch('crewai.evaluation.experiment.runner.create_default_evaluator')
+    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
     def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
         dataset = [
             {
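
The only change to these tests is the patch target, which has to track the module move: `unittest.mock.patch` replaces the name where the runner module looks it up, not where it is defined. A standalone sketch of that rule (the test body is illustrative, not from the diff):

```python
from unittest.mock import MagicMock, patch

@patch("crewai.experimental.evaluation.experiment.runner.create_default_evaluator")
def test_patch_follows_the_new_module_path(mock_create_evaluator):
    mock_create_evaluator.return_value = MagicMock()
    # Code that resolves create_default_evaluator through the runner module now
    # receives the mock; patching the old crewai.evaluation path would miss the
    # symbol the runner actually imports.
    assert mock_create_evaluator.call_count == 0
```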