Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-08 15:48:29 +00:00
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class. This class evaluates an Agent's results and reports them to the user.
* fix: do not evaluate Agent by default. This is an experimental feature that still needs further refinement.
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split Experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console print in a dedicated class
* fix: make crew required to run an experiment
* fix: use a time-aware timestamp to define the experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
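The test file below exercises the baseline-comparison piece of this experiment framework. As a rough usage sketch only: the field names, the compare_with_baseline keyword argument, and the comparison keys are taken from the test itself rather than from separate documentation, and "my_baseline.json" plus the identifier value are placeholders.

from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults

# One ExperimentResult per test case; score/expected_score can be a single
# number or a per-metric dict, as both forms appear in the test below.
results = ExperimentResults(results=[
    ExperimentResult(
        identifier="capital-question",           # placeholder identifier
        inputs={"query": "What is the capital of France?"},
        score=9,
        expected_score=7,
        passed=True,
    ),
])

# Compare this run against a previously saved run stored as JSON; the returned
# dict buckets identifiers into improved / regressed / unchanged / new_tests /
# missing_tests and records the baseline's timestamp.
comparison = results.compare_with_baseline(baseline_filepath="my_baseline.json")
print(comparison["improved"], comparison["regressed"], comparison["missing_tests"])

The test below verifies exactly these comparison buckets against a mocked baseline file.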
111 lines
4.5 KiB
Python
import pytest

from unittest.mock import MagicMock, patch

from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults


class TestExperimentResult:
    @pytest.fixture
    def mock_results(self):
        return [
            ExperimentResult(
                identifier="test-1",
                inputs={"query": "What is the capital of France?"},
                score=10,
                expected_score=7,
                passed=True
            ),
            ExperimentResult(
                identifier="test-2",
                inputs={"query": "Who wrote Hamlet?"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-3",
                inputs={"query": "Any query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=False,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-4",
                inputs={"query": "Another query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-6",
                inputs={"query": "Yet another query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            )
        ]

    @patch('os.path.exists', return_value=True)
    @patch('os.path.getsize', return_value=1)
    @patch('json.load')
    @patch('builtins.open', new_callable=MagicMock)
    def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
        baseline_data = {
            "timestamp": "2023-01-01T00:00:00+00:00",
            "results": [
                {
                    "identifier": "test-1",
                    "inputs": {"query": "What is the capital of France?"},
                    "score": 7,
                    "expected_score": 7,
                    "passed": False
                },
                {
                    "identifier": "test-2",
                    "inputs": {"query": "Who wrote Hamlet?"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-3",
                    "inputs": {"query": "Any query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-4",
                    "inputs": {"query": "Another query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-5",
                    "inputs": {"query": "Another query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                }
            ]
        }

        mock_json_load.return_value = baseline_data

        results = ExperimentResults(results=mock_results)
        results.display = MagicMock()

        comparison = results.compare_with_baseline(baseline_filepath="baseline.json")

        assert "baseline_timestamp" in comparison
        assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
        assert comparison["improved"] == ["test-1"]
        assert comparison["regressed"] == ["test-3"]
        assert comparison["unchanged"] == ["test-2", "test-4"]
        assert comparison["new_tests"] == ["test-6"]
        assert comparison["missing_tests"] == ["test-5"]