Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-08 15:48:29 +00:00
* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class; it evaluates an Agent's results and reports them to the user
* fix: do not evaluate Agents by default; this is an experimental feature that still needs further refinement
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split Experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console printing in a dedicated class
* fix: make crew required to run an experiment
* fix: use a timezone-aware timestamp to define the experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
60 lines · 2.2 KiB · Python
from unittest.mock import patch, MagicMock

from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

from crewai.experimental.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM


class TestGoalAlignmentEvaluator(BaseEvaluationMetricsTest):
    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        # Stub the LLM so the evaluator receives a well-formed JSON verdict.
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "score": 8.5,
            "feedback": "The agent correctly understood the task and produced relevant output."
        }
        """
        mock_create_llm.return_value = mock_llm

        evaluator = GoalAlignmentEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output"
        )

        # The JSON verdict should be parsed into an EvaluationScore.
        assert isinstance(result, EvaluationScore)
        assert result.score == 8.5
        assert "correctly understood the task" in result.feedback

        # The evaluator should issue a single system/user chat prompt that
        # embeds the agent's role and the task description.
        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]
        assert mock_agent.role in prompt[1]["content"]
        assert mock_task.description in prompt[1]["content"]

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        # Return a non-JSON payload to exercise the parse-failure path.
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        evaluator = GoalAlignmentEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output"
        )

        # On unparseable output the score is None and the feedback reports the parse failure.
        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse" in result.feedback
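For context, the call pattern exercised above maps directly onto standalone use of the metric. The sketch below is illustrative only: the Agent and Task values, the model name, and the empty execution_trace are placeholder assumptions for the example, while the GoalAlignmentEvaluator constructor and evaluate() signature mirror what the tests exercise.

# Illustrative sketch, not part of the repository. Placeholder agent/task values,
# the model name, and the execution_trace shape are assumptions; the evaluator
# API below mirrors the tests above.
from crewai import Agent, Task
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM

agent = Agent(
    role="Research Analyst",  # placeholder role
    goal="Summarize findings for stakeholders",
    backstory="An analyst focused on concise, accurate reporting.",
)
task = Task(
    description="Summarize the attached report in three bullet points.",
    expected_output="Three concise bullet points.",
    agent=agent,
)

evaluator = GoalAlignmentEvaluator(llm=LLM(model="gpt-4o-mini"))  # model name is a placeholder
result = evaluator.evaluate(
    agent=agent,
    task=task,
    execution_trace={},  # assumption: normally the trace captured from the agent's run
    final_output="1. ...  2. ...  3. ...",
)

print(result.score, result.feedback)  # EvaluationScore fields asserted in the tests above

In the tests themselves, mock_agent, mock_task, and execution_trace arrive as pytest fixtures (presumably from the shared BaseEvaluationMetricsTest setup), so no real model is ever called.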