mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-08 23:58:34 +00:00
Introduce Evaluator Experiment (#3133)
* feat: add exchanged messages in LLMCallCompletedEvent * feat: add GoalAlignment metric for Agent evaluation * feat: add SemanticQuality metric for Agent evaluation * feat: add Tool Metrics for Agent evaluation * feat: add Reasoning Metrics for Agent evaluation, still in progress * feat: add AgentEvaluator class This class will evaluate Agent' results and report to user * fix: do not evaluate Agent by default This is a experimental feature we still need refine it further * test: add Agent eval tests * fix: render all feedback per iteration * style: resolve linter issues * style: fix mypy issues * fix: allow messages be empty on LLMCallCompletedEvent * feat: add Experiment evaluation framework with baseline comparison * fix: reset evaluator for each experiement iteraction * fix: fix track of new test cases * chore: split Experimental evaluation classes * refactor: remove unused method * refactor: isolate Console print in a dedicated class * fix: make crew required to run an experiment * fix: use time-aware to define experiment result * test: add tests for Evaluator Experiment * style: fix linter issues * fix: encode string before hashing * style: resolve linter issues * feat: add experimental folder for beta features (#3141) * test: move tests to experimental folder
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
from unittest.mock import patch, MagicMock
|
||||
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
|
||||
from crewai.evaluation.base_evaluator import EvaluationScore
|
||||
from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
|
||||
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
|
||||
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
|
||||
from crewai.utilities.llm_utils import LLM
|
||||
|
||||
|
||||
@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.evaluation.metrics.reasoning_metrics import (
|
||||
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
|
||||
ReasoningEfficiencyEvaluator,
|
||||
)
|
||||
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from crewai.utilities.llm_utils import LLM
|
||||
from crewai.evaluation.base_evaluator import EvaluationScore
|
||||
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
|
||||
|
||||
class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
|
||||
@pytest.fixture
|
||||
@@ -1,8 +1,8 @@
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from crewai.evaluation.base_evaluator import EvaluationScore
|
||||
from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
|
||||
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
|
||||
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
|
||||
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from crewai.utilities.llm_utils import LLM
|
||||
|
||||
class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
|
||||
@@ -1,12 +1,12 @@
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from crewai.evaluation.metrics.tools_metrics import (
|
||||
from crewai.experimental.evaluation.metrics.tools_metrics import (
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator
|
||||
)
|
||||
from crewai.utilities.llm_utils import LLM
|
||||
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
|
||||
|
||||
class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
|
||||
def test_no_tools_available(self, mock_task, mock_agent):
|
||||
@@ -3,9 +3,9 @@ import pytest
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.crew import Crew
|
||||
from crewai.evaluation.agent_evaluator import AgentEvaluator
|
||||
from crewai.evaluation.base_evaluator import AgentEvaluationResult
|
||||
from crewai.evaluation import (
|
||||
from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
@@ -14,7 +14,7 @@ from crewai.evaluation import (
|
||||
ReasoningEfficiencyEvaluator
|
||||
)
|
||||
|
||||
from crewai.evaluation import create_default_evaluator
|
||||
from crewai.experimental.evaluation import create_default_evaluator
|
||||
class TestAgentEvaluator:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
111
tests/experimental/evaluation/test_experiment_result.py
Normal file
111
tests/experimental/evaluation/test_experiment_result.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
|
||||
|
||||
|
||||
class TestExperimentResult:
|
||||
@pytest.fixture
|
||||
def mock_results(self):
|
||||
return [
|
||||
ExperimentResult(
|
||||
identifier="test-1",
|
||||
inputs={"query": "What is the capital of France?"},
|
||||
score=10,
|
||||
expected_score=7,
|
||||
passed=True
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-2",
|
||||
inputs={"query": "Who wrote Hamlet?"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-3",
|
||||
inputs={"query": "Any query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=False,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-4",
|
||||
inputs={"query": "Another query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-6",
|
||||
inputs={"query": "Yet another query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
)
|
||||
]
|
||||
|
||||
@patch('os.path.exists', return_value=True)
|
||||
@patch('os.path.getsize', return_value=1)
|
||||
@patch('json.load')
|
||||
@patch('builtins.open', new_callable=MagicMock)
|
||||
def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
|
||||
baseline_data = {
|
||||
"timestamp": "2023-01-01T00:00:00+00:00",
|
||||
"results": [
|
||||
{
|
||||
"identifier": "test-1",
|
||||
"inputs": {"query": "What is the capital of France?"},
|
||||
"score": 7,
|
||||
"expected_score": 7,
|
||||
"passed": False
|
||||
},
|
||||
{
|
||||
"identifier": "test-2",
|
||||
"inputs": {"query": "Who wrote Hamlet?"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-3",
|
||||
"inputs": {"query": "Any query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-4",
|
||||
"inputs": {"query": "Another query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-5",
|
||||
"inputs": {"query": "Another query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
mock_json_load.return_value = baseline_data
|
||||
|
||||
results = ExperimentResults(results=mock_results)
|
||||
results.display = MagicMock()
|
||||
|
||||
comparison = results.compare_with_baseline(baseline_filepath="baseline.json")
|
||||
|
||||
assert "baseline_timestamp" in comparison
|
||||
assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
|
||||
assert comparison["improved"] == ["test-1"]
|
||||
assert comparison["regressed"] == ["test-3"]
|
||||
assert comparison["unchanged"] == ["test-2", "test-4"]
|
||||
assert comparison["new_tests"] == ["test-6"]
|
||||
assert comparison["missing_tests"] == ["test-5"]
|
||||
197
tests/experimental/evaluation/test_experiment_runner.py
Normal file
197
tests/experimental/evaluation/test_experiment_runner.py
Normal file
@@ -0,0 +1,197 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from crewai.crew import Crew
|
||||
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
|
||||
from crewai.experimental.evaluation.experiment.result import ExperimentResults
|
||||
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
|
||||
from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
|
||||
|
||||
|
||||
class TestExperimentRunner:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
return MagicMock(llm=Crew)
|
||||
|
||||
@pytest.fixture
|
||||
def mock_evaluator_results(self):
|
||||
agent_evaluation = AgentAggregatedEvaluationResult(
|
||||
agent_id="Test Agent",
|
||||
agent_role="Test Agent Role",
|
||||
metrics={
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=9,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
),
|
||||
MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
|
||||
score=None,
|
||||
feedback="Reasoning efficiency not applicable",
|
||||
raw_response="Reasoning efficiency not applicable"
|
||||
),
|
||||
MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
|
||||
score=7,
|
||||
feedback="Test parameter extraction explanation",
|
||||
raw_response="Test raw output"
|
||||
),
|
||||
MetricCategory.TOOL_SELECTION: EvaluationScore(
|
||||
score=8,
|
||||
feedback="Test tool selection explanation",
|
||||
raw_response="Test raw output"
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return {"Test Agent": agent_evaluation}
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-1",
|
||||
"inputs": {"query": "Test query 1"},
|
||||
"expected_score": 8
|
||||
},
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7}
|
||||
},
|
||||
{
|
||||
"inputs": {"query": "Test query 3"},
|
||||
"expected_score": {"tool_selection": 9}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
|
||||
assert isinstance(results, ExperimentResults)
|
||||
result_1, result_2, result_3 = results.results
|
||||
assert len(results.results) == 3
|
||||
|
||||
assert result_1.identifier == "test-case-1"
|
||||
assert result_1.inputs == {"query": "Test query 1"}
|
||||
assert result_1.expected_score == 8
|
||||
assert result_1.passed is True
|
||||
|
||||
assert result_2.identifier == "test-case-2"
|
||||
assert result_2.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result_2.expected_score, dict)
|
||||
assert "goal_alignment" in result_2.expected_score
|
||||
assert result_2.passed is True
|
||||
|
||||
assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
|
||||
assert result_3.inputs == {"query": "Test query 3"}
|
||||
assert isinstance(result_3.expected_score, dict)
|
||||
assert "tool_selection" in result_3.expected_score
|
||||
assert result_3.passed is False
|
||||
|
||||
assert mock_crew.kickoff.call_count == 3
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
|
||||
|
||||
assert mock_evaluator.reset_iterations_results.call_count == 3
|
||||
assert mock_evaluator.get_agent_evaluation.call_count == 3
|
||||
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7, "unknown_metric": 8}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "goal_alignment" in result.expected_score.keys()
|
||||
assert "unknown_metric" in result.expected_score.keys()
|
||||
assert result.passed is True
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_create_evaluator["Test Agent"].metrics = {
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=9,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
)
|
||||
}
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "goal_alignment" in result.expected_score.keys()
|
||||
assert result.passed is True
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"unknown_metric": 7}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_create_evaluator["Test Agent"].metrics = {
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=5,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
)
|
||||
}
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "unknown_metric" in result.expected_score.keys()
|
||||
assert result.passed is False
|
||||
Reference in New Issue
Block a user