"""Unit tests for acceptance criteria validation feature at task level."""
|
|
|
|
import pytest
|
|
from unittest.mock import MagicMock, patch, call
|
|
from typing import List, Tuple
|
|
|
|
from crewai.agents.crew_agent_executor import CrewAgentExecutor
|
|
from crewai.agents.agent_state import AgentState
|
|
from crewai.tools.agent_tools.scratchpad_tool import ScratchpadTool
|
|
from crewai.agents.parser import AgentFinish
|
|
from crewai.utilities import Printer
|
|
from crewai.llm import LLM
|
|
|
|
|
|

class TestAcceptanceCriteriaValidation:
    """Test suite for task-level acceptance criteria validation functionality."""

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_llm = MagicMock(spec=LLM)
        self.mock_agent = MagicMock()
        self.mock_task = MagicMock()
        self.mock_crew = MagicMock()
        self.mock_tools_handler = MagicMock()

        # Set up agent attributes
        self.mock_agent.role = "Test Agent"
        self.mock_agent.reasoning = True
        self.mock_agent.verbose = False
        self.mock_agent.reasoning_interval = None
        self.mock_agent.adaptive_reasoning = False

        # Create executor
        self.executor = CrewAgentExecutor(
            llm=self.mock_llm,
            task=self.mock_task,
            crew=self.mock_crew,
            agent=self.mock_agent,
            prompt={},
            max_iter=10,
            tools=[],
            tools_names="",
            stop_words=[],
            tools_description="",
            tools_handler=self.mock_tools_handler,
            callbacks=[]
        )

        # Set up agent state with acceptance criteria
        self.executor.agent_state = AgentState(task_id="test-task-id")
        self.executor.agent_state.acceptance_criteria = [
            "Include all required information",
            "Format output properly",
            "Provide complete analysis"
        ]

        # Mock printer
        self.executor._printer = MagicMock(spec=Printer)

    def test_validate_acceptance_criteria_all_met(self):
        """Test validation when all acceptance criteria are met."""
        output = "Complete output with all information, properly formatted, with full analysis"

        # Configure LLM to return all criteria met
        self.mock_llm.call.return_value = '''{
            "1": "MET",
            "2": "MET",
            "3": "MET"
        }'''

        is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)

        assert is_valid is True
        assert unmet_criteria == []
        assert self.mock_llm.call.call_count == 1

    def test_validate_acceptance_criteria_some_unmet(self):
        """Test validation when some criteria are not met."""
        output = "Partial output missing formatting"

        # Configure LLM to return mixed results
        self.mock_llm.call.return_value = '''{
            "1": "MET",
            "2": "NOT MET: Missing proper formatting",
            "3": "NOT MET: Analysis incomplete"
        }'''

        is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)

        assert is_valid is False
        assert len(unmet_criteria) == 2
        assert "Format output properly" in unmet_criteria
        assert "Provide complete analysis" in unmet_criteria
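
    def test_validate_acceptance_criteria_with_no_criteria_defined(self):
        """Sketch: validation when the agent state has no acceptance criteria.

        This is an illustrative sketch rather than a confirmed contract: it
        assumes ``_validate_acceptance_criteria`` short-circuits to
        ``(True, [])`` without making a validation LLM call when the criteria
        list is empty. Adjust the expectations if the implementation differs.
        """
        self.executor.agent_state.acceptance_criteria = []

        is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(
            "Any output"
        )

        # Assumed behavior: nothing to validate means the output passes
        assert is_valid is True
        assert unmet_criteria == []
        self.mock_llm.call.assert_not_called()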

    def test_create_criteria_retry_prompt_with_scratchpad(self):
        """Test retry prompt creation when scratchpad has data."""
        # Set up scratchpad tool with data
        self.executor.scratchpad_tool = ScratchpadTool()
        self.executor.agent_state.scratchpad = {
            "research_data": {"key": "value"},
            "analysis_results": ["item1", "item2"]
        }

        # Set up task details
        self.mock_task.description = "Analyze research data and provide insights"
        self.mock_task.expected_output = "A comprehensive report with analysis and recommendations"

        unmet_criteria = ["Include specific examples", "Add recommendations"]

        prompt = self.executor._create_criteria_retry_prompt(unmet_criteria)

        # Verify prompt content with new format
        assert "VALIDATION FAILED" in prompt
        assert "YOU CANNOT PROVIDE A FINAL ANSWER YET" in prompt
        assert "ORIGINAL TASK:" in prompt
        assert "Analyze research data" in prompt
        assert "EXPECTED OUTPUT:" in prompt
        assert "comprehensive report" in prompt
        assert "Include specific examples" in prompt
        assert "Add recommendations" in prompt
        assert "Access Scratchpad Memory" in prompt
        assert "'research_data'" in prompt
        assert "'analysis_results'" in prompt
        assert "Action:" in prompt
        assert "Action Input:" in prompt
        assert "CONTINUE WITH TOOL USAGE NOW" in prompt
        assert "DO NOT ATTEMPT ANOTHER FINAL ANSWER" in prompt

    def test_create_criteria_retry_prompt_without_scratchpad(self):
        """Test retry prompt creation when no scratchpad data exists."""
        unmet_criteria = ["Add more detail"]

        prompt = self.executor._create_criteria_retry_prompt(unmet_criteria)

        assert "Add more detail" in prompt
        assert "VALIDATION FAILED" in prompt
        assert "📦 YOUR SCRATCHPAD CONTAINS DATA" not in prompt

    @patch('crewai.agents.crew_agent_executor.get_llm_response')
    @patch('crewai.agents.crew_agent_executor.process_llm_response')
    def test_invoke_loop_blocks_incomplete_final_answer(self, mock_process, mock_get_response):
        """Test that the invoke loop blocks incomplete final answers."""
        # Set up conditions
        self.executor.agent_state.acceptance_criteria = ["Complete all sections"]

        # First attempt returns an incomplete final answer
        incomplete_answer = AgentFinish(
            thought="Done",
            output="Exploring potential follow-up tasks!",
            text="Final Answer: Exploring potential follow-up tasks!"
        )

        # After retry, return a complete answer
        complete_answer = AgentFinish(
            thought="Done with all sections",
            output="Complete output with all sections addressed",
            text="Final Answer: Complete output with all sections addressed"
        )

        # Configure mocks
        mock_process.side_effect = [incomplete_answer, complete_answer]
        mock_get_response.return_value = "response"

        # Configure validation
        self.mock_llm.call.side_effect = [
            '{"1": "NOT MET: Missing required sections"}',  # First validation fails
            '{"1": "MET"}'  # Second validation passes
        ]

        # Execute
        result = self.executor._invoke_loop()

        # Verify
        assert result == complete_answer
        assert self.mock_llm.call.call_count == 2  # Two validation attempts
        assert mock_process.call_count == 2  # Two processing attempts

        # Verify error message was shown
        self._verify_validation_messages_shown()
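
    @patch('crewai.agents.crew_agent_executor.get_llm_response')
    @patch('crewai.agents.crew_agent_executor.process_llm_response')
    def test_invoke_loop_accepts_valid_final_answer_first_try(self, mock_process, mock_get_response):
        """Sketch: a final answer that already meets all criteria is returned as-is.

        Illustrative companion to the blocking test above; it assumes that a
        passing validation on the first AgentFinish means no retry prompt is
        issued, so exactly one validation LLM call and one processing call
        occur. Adjust if the loop behaves differently.
        """
        self.executor.agent_state.acceptance_criteria = ["Complete all sections"]

        complete_answer = AgentFinish(
            thought="Done with all sections",
            output="Complete output with all sections addressed",
            text="Final Answer: Complete output with all sections addressed"
        )

        mock_process.return_value = complete_answer
        mock_get_response.return_value = "response"
        self.mock_llm.call.return_value = '{"1": "MET"}'

        result = self.executor._invoke_loop()

        assert result == complete_answer
        assert self.mock_llm.call.call_count == 1  # Single passing validation
        assert mock_process.call_count == 1  # No retry needed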

    def test_validation_happens_on_every_final_answer_attempt(self):
        """Test that validation happens on every AgentFinish attempt."""
        self.executor.agent_state.acceptance_criteria = ["Complete all sections"]

        # Configure LLM to always return criteria not met
        self.mock_llm.call.return_value = '{"1": "NOT MET: Missing required sections"}'

        output = "Incomplete output"

        # Validate multiple times - each should trigger validation
        for _ in range(3):
            is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)
            assert is_valid is False
            assert len(unmet_criteria) == 1

        # Verify validation was called every time
        assert self.mock_llm.call.call_count == 3

    def _verify_validation_messages_shown(self):
        """Helper to verify validation messages were displayed."""
        print_calls = self.executor._printer.print.call_args_list

        # Check for validation message
        validation_msg_shown = any(
            "Validating acceptance criteria" in str(call)
            for call in print_calls
        )

        # Check for failure message
        failure_msg_shown = any(
            "Cannot finalize" in str(call)
            for call in print_calls
        )

        assert validation_msg_shown or failure_msg_shown