crewAI/src/crewai/evaluation/metrics/goal_metrics.py
Lucas Gomide 08fa3797ca Introducing Agent evaluation (#3130)
* feat: add exchanged messages in LLMCallCompletedEvent

* feat: add GoalAlignment metric for Agent evaluation

* feat: add SemanticQuality metric for Agent evaluation

* feat: add Tool Metrics for Agent evaluation

* feat: add Reasoning Metrics for Agent evaluation, still in progress

* feat: add AgentEvaluator class

This class evaluates an Agent's results and reports them to the user

* fix: do not evaluate Agent by default

This is an experimental feature that still needs further refinement

* test: add Agent eval tests

* fix: render all feedback per iteration

* style: resolve linter issues

* style: fix mypy issues

* fix: allow messages to be empty on LLMCallCompletedEvent
2025-07-11 13:18:03 -04:00


from typing import Any, Dict

from crewai.agent import Agent
from crewai.task import Task

from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.evaluation.json_parser import extract_json_from_llm_response


class GoalAlignmentEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.GOAL_ALIGNMENT

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
    ) -> EvaluationScore:
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.

Score the agent's goal alignment on a scale from 0-10 where:
- 0: Complete misalignment, agent did not understand or attempt the task goal
- 5: Partial alignment, agent attempted the task but missed key requirements
- 10: Perfect alignment, agent fully satisfied all task requirements

Consider:
1. Did the agent correctly interpret the task goal?
2. Did the final output directly address the requirements?
3. Did the agent focus on relevant aspects of the task?
4. Did the agent provide all requested information or deliverables?

Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Agent goal: {agent.goal}
Task description: {task.description}
Expected output: {task.expected_output}

Agent's final output:
{final_output}

Evaluate how well the agent's output aligns with the assigned task goal.
"""}
        ]

        assert self.llm is not None
        response = self.llm.call(prompt)

        try:
            evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
            assert evaluation_data is not None

            return EvaluationScore(
                score=evaluation_data.get("score", 0),
                feedback=evaluation_data.get("feedback", response),
                raw_response=response
            )
        except Exception:
            return EvaluationScore(
                score=None,
                feedback=f"Failed to parse evaluation. Raw response: {response}",
                raw_response=response
            )