mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-11 00:58:30 +00:00
Commit: improved
tests/test_acceptance_criteria_validation.py (new file, 215 lines)
@@ -0,0 +1,215 @@
"""Unit tests for acceptance criteria validation feature at task level."""

import pytest
from unittest.mock import MagicMock, patch, call
from typing import List, Tuple

from crewai.agents.crew_agent_executor import CrewAgentExecutor
from crewai.agents.agent_state import AgentState
from crewai.tools.agent_tools.scratchpad_tool import ScratchpadTool
from crewai.agents.parser import AgentFinish
from crewai.utilities import Printer
from crewai.llm import LLM


class TestAcceptanceCriteriaValidation:
    """Test suite for task-level acceptance criteria validation functionality."""

    def setup_method(self):
        """Set up test fixtures."""
        self.mock_llm = MagicMock(spec=LLM)
        self.mock_agent = MagicMock()
        self.mock_task = MagicMock()
        self.mock_crew = MagicMock()
        self.mock_tools_handler = MagicMock()

        # Set up agent attributes
        self.mock_agent.role = "Test Agent"
        self.mock_agent.reasoning = True
        self.mock_agent.verbose = False
        self.mock_agent.reasoning_interval = None
        self.mock_agent.adaptive_reasoning = False

        # Create executor
        self.executor = CrewAgentExecutor(
            llm=self.mock_llm,
            task=self.mock_task,
            crew=self.mock_crew,
            agent=self.mock_agent,
            prompt={},
            max_iter=10,
            tools=[],
            tools_names="",
            stop_words=[],
            tools_description="",
            tools_handler=self.mock_tools_handler,
            callbacks=[]
        )

        # Set up agent state with acceptance criteria
        self.executor.agent_state = AgentState(task_id="test-task-id")
        self.executor.agent_state.acceptance_criteria = [
            "Include all required information",
            "Format output properly",
            "Provide complete analysis"
        ]

        # Mock printer
        self.executor._printer = MagicMock(spec=Printer)

    def test_validate_acceptance_criteria_all_met(self):
        """Test validation when all acceptance criteria are met."""
        output = "Complete output with all information, properly formatted, with full analysis"

        # Configure LLM to return all criteria met
        self.mock_llm.call.return_value = '''{
            "1": "MET",
            "2": "MET",
            "3": "MET"
        }'''

        is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)

        assert is_valid is True
        assert unmet_criteria == []
        assert self.mock_llm.call.call_count == 1

    def test_validate_acceptance_criteria_some_unmet(self):
        """Test validation when some criteria are not met."""
        output = "Partial output missing formatting"

        # Configure LLM to return mixed results
        self.mock_llm.call.return_value = '''{
            "1": "MET",
            "2": "NOT MET: Missing proper formatting",
            "3": "NOT MET: Analysis incomplete"
        }'''

        is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)

        assert is_valid is False
        assert len(unmet_criteria) == 2
        assert "Format output properly" in unmet_criteria
        assert "Provide complete analysis" in unmet_criteria

    def test_create_criteria_retry_prompt_with_scratchpad(self):
        """Test retry prompt creation when scratchpad has data."""
        # Set up scratchpad tool with data
        self.executor.scratchpad_tool = ScratchpadTool()
        self.executor.agent_state.scratchpad = {
            "research_data": {"key": "value"},
            "analysis_results": ["item1", "item2"]
        }

        # Set up task details
        self.mock_task.description = "Analyze research data and provide insights"
        self.mock_task.expected_output = "A comprehensive report with analysis and recommendations"

        unmet_criteria = ["Include specific examples", "Add recommendations"]

        prompt = self.executor._create_criteria_retry_prompt(unmet_criteria)

        # Verify prompt content with new format
        assert "VALIDATION FAILED" in prompt
        assert "YOU CANNOT PROVIDE A FINAL ANSWER YET" in prompt
        assert "ORIGINAL TASK:" in prompt
        assert "Analyze research data" in prompt
        assert "EXPECTED OUTPUT:" in prompt
        assert "comprehensive report" in prompt
        assert "Include specific examples" in prompt
        assert "Add recommendations" in prompt
        assert "Access Scratchpad Memory" in prompt
        assert "'research_data'" in prompt
        assert "'analysis_results'" in prompt
        assert "Action:" in prompt
        assert "Action Input:" in prompt
        assert "CONTINUE WITH TOOL USAGE NOW" in prompt
        assert "DO NOT ATTEMPT ANOTHER FINAL ANSWER" in prompt

    def test_create_criteria_retry_prompt_without_scratchpad(self):
        """Test retry prompt creation when no scratchpad data exists."""
        unmet_criteria = ["Add more detail"]

        prompt = self.executor._create_criteria_retry_prompt(unmet_criteria)

        assert "Add more detail" in prompt
        assert "VALIDATION FAILED" in prompt
        assert "📦 YOUR SCRATCHPAD CONTAINS DATA" not in prompt

    @patch('crewai.agents.crew_agent_executor.get_llm_response')
    @patch('crewai.agents.crew_agent_executor.process_llm_response')
    def test_invoke_loop_blocks_incomplete_final_answer(self, mock_process, mock_get_response):
        """Test that invoke loop blocks incomplete final answers."""
        # Set up conditions
        self.executor.agent_state.acceptance_criteria = ["Complete all sections"]

        # First attempt returns incomplete final answer
        incomplete_answer = AgentFinish(
            thought="Done",
            output="Exploring potential follow-up tasks!",
            text="Final Answer: Exploring potential follow-up tasks!"
        )

        # After retry, return complete answer
        complete_answer = AgentFinish(
            thought="Done with all sections",
            output="Complete output with all sections addressed",
            text="Final Answer: Complete output with all sections addressed"
        )

        # Configure mocks
        mock_process.side_effect = [incomplete_answer, complete_answer]
        mock_get_response.return_value = "response"

        # Configure validation
        self.mock_llm.call.side_effect = [
            '{"1": "NOT MET: Missing required sections"}',  # First validation fails
            '{"1": "MET"}'  # Second validation passes
        ]

        # Execute
        result = self.executor._invoke_loop()

        # Verify
        assert result == complete_answer
        assert self.mock_llm.call.call_count == 2  # Two validation attempts
        assert mock_process.call_count == 2  # Two processing attempts

        # Verify error message was shown
        self._verify_validation_messages_shown()

    def test_validation_happens_on_every_final_answer_attempt(self):
        """Test that validation happens on every AgentFinish attempt."""
        self.executor.agent_state.acceptance_criteria = ["Complete all sections"]

        # Configure LLM to always return criteria not met
        self.mock_llm.call.return_value = '{"1": "NOT MET: Missing required sections"}'

        output = "Incomplete output"

        # Validate multiple times - each should trigger validation
        for _ in range(3):
            is_valid, unmet_criteria = self.executor._validate_acceptance_criteria(output)
            assert is_valid is False
            assert len(unmet_criteria) == 1

        # Verify validation was called every time
        assert self.mock_llm.call.call_count == 3

    def _verify_validation_messages_shown(self):
        """Helper to verify validation messages were displayed."""
        print_calls = self.executor._printer.print.call_args_list

        # Check for validation message
        validation_msg_shown = any(
            "Validating acceptance criteria" in str(call)
            for call in print_calls
        )

        # Check for failure message
        failure_msg_shown = any(
            "Cannot finalize" in str(call)
            for call in print_calls
        )

        assert validation_msg_shown or failure_msg_shown
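The assertions above pin down a contract for `_validate_acceptance_criteria`: one LLM call per validation, a JSON verdict object keyed by criterion number with values "MET" or "NOT MET: <reason>", and a `(bool, unmet_criteria)` return where each unmet entry is the original criterion string. A minimal, self-contained sketch of that contract follows; it is illustrative only, since the real method lives on `CrewAgentExecutor` and the helper name and prompt wording here are assumptions:

    # Illustrative sketch only, not crewAI's actual implementation.
    import json
    from typing import List, Tuple


    def validate_acceptance_criteria(llm_call, criteria: List[str], output: str) -> Tuple[bool, List[str]]:
        """One LLM call; expects JSON mapping criterion numbers to "MET" or
        "NOT MET: <reason>"; returns (all_met, unmet_criterion_texts)."""
        numbered = "\n".join(f"{i}. {c}" for i, c in enumerate(criteria, 1))
        prompt = (
            "Evaluate the output against each acceptance criterion.\n"
            f"CRITERIA:\n{numbered}\nOUTPUT:\n{output}\n"
            'Reply with JSON, e.g. {"1": "MET", "2": "NOT MET: <reason>"}'
        )
        verdicts = json.loads(llm_call(prompt))
        # Map verdict numbers back to the original criterion strings, so the
        # retry prompt can quote the exact unmet criteria.
        unmet = [criteria[int(n) - 1] for n, v in verdicts.items() if not str(v).startswith("MET")]
        return not unmet, unmet


    # Mirrors test_validate_acceptance_criteria_some_unmet:
    fake_llm = lambda _prompt: (
        '{"1": "MET", "2": "NOT MET: Missing proper formatting", '
        '"3": "NOT MET: Analysis incomplete"}'
    )
    ok, unmet = validate_acceptance_criteria(
        fake_llm,
        ["Include all required information", "Format output properly", "Provide complete analysis"],
        "Partial output missing formatting",
    )
    assert ok is False and unmet == ["Format output properly", "Provide complete analysis"]

Returning the original criterion strings rather than the verdict numbers is what `test_create_criteria_retry_prompt_with_scratchpad` relies on when it asserts the retry prompt contains the verbatim unmet criteria.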
tests/tools/agent_tools/test_scratchpad_tool.py (new file, 137 lines)
@@ -0,0 +1,137 @@
"""Unit tests for the ScratchpadTool."""

import pytest
from crewai.tools.agent_tools.scratchpad_tool import ScratchpadTool, ScratchpadToolSchema


class TestScratchpadTool:
    """Test suite for the ScratchpadTool functionality."""

    def test_schema_description(self):
        """Test that the schema has helpful description."""
        schema = ScratchpadToolSchema
        key_field = schema.model_fields['key']

        assert "Example:" in key_field.description
        assert '{"key":' in key_field.description

    def test_empty_scratchpad_error_message(self):
        """Test error message when scratchpad is empty."""
        tool = ScratchpadTool()
        result = tool._run(key="nonexistent")

        assert "❌ SCRATCHPAD IS EMPTY" in result
        assert "does not contain any data yet" in result
        assert "Try executing other tools first" in result

    def test_key_not_found_error_message(self):
        """Test error message when key is not found."""
        tool = ScratchpadTool(scratchpad_data={
            "existing_key": "value",
            "another_key": {"data": "test"}
        })

        result = tool._run(key="wrong_key")

        assert "❌ KEY NOT FOUND: 'wrong_key'" in result
        assert "Available keys:" in result
        assert "- 'existing_key'" in result
        assert "- 'another_key'" in result
        assert 'Example Action Input: {"key": "existing_key"}' in result
        assert "Keys are case-sensitive" in result

    def test_successful_retrieval_string(self):
        """Test successful retrieval of string data."""
        tool = ScratchpadTool(scratchpad_data={
            "message": "Hello, World!"
        })

        result = tool._run(key="message")
        assert result == "Hello, World!"

    def test_successful_retrieval_dict(self):
        """Test successful retrieval of dictionary data."""
        test_dict = {"name": "John", "age": 30}
        tool = ScratchpadTool(scratchpad_data={
            "user_data": test_dict
        })

        result = tool._run(key="user_data")
        assert '"name": "John"' in result
        assert '"age": 30' in result

    def test_successful_retrieval_list(self):
        """Test successful retrieval of list data."""
        test_list = ["item1", "item2", "item3"]
        tool = ScratchpadTool(scratchpad_data={
            "items": test_list
        })

        result = tool._run(key="items")
        assert '"item1"' in result
        assert '"item2"' in result
        assert '"item3"' in result

    def test_tool_description_empty(self):
        """Test tool description when scratchpad is empty."""
        tool = ScratchpadTool()

        assert "HOW TO USE THIS TOOL:" in tool.description
        assert 'Example: {"key": "email_data"}' in tool.description
        assert "📝 STATUS: Scratchpad is currently empty" in tool.description

    def test_tool_description_with_data(self):
        """Test tool description when scratchpad has data."""
        tool = ScratchpadTool(scratchpad_data={
            "emails": ["email1@test.com", "email2@test.com"],
            "results": {"count": 5, "status": "success"},
            "api_key": "secret_key_123"
        })

        desc = tool.description

        # Check basic structure
        assert "HOW TO USE THIS TOOL:" in desc
        assert "📦 AVAILABLE DATA IN SCRATCHPAD:" in desc
        assert "💡 EXAMPLE USAGE:" in desc

        # Check key listings
        assert "📌 'emails': list of 2 items" in desc
        assert "📌 'results': dict with 2 items" in desc
        assert "📌 'api_key': string (14 chars)" in desc

        # Check example uses first key
        assert 'Action Input: {"key": "emails"}' in desc

    def test_update_scratchpad(self):
        """Test updating scratchpad data."""
        tool = ScratchpadTool()

        # Initially empty
        assert not tool.scratchpad_data

        # Update with data
        new_data = {"test": "value"}
        tool.update_scratchpad(new_data)

        assert tool.scratchpad_data == new_data
        assert "📌 'test': string (5 chars)" in tool.description

    def test_complex_data_preview(self):
        """Test preview generation for complex data structures."""
        tool = ScratchpadTool(scratchpad_data={
            "nested_dict": {
                "data": ["item1", "item2", "item3"]
            },
            "empty_list": [],
            "boolean_value": True,
            "number": 42
        })

        desc = tool.description

        # Special case for dict with 'data' key containing list
        assert "📌 'nested_dict': list of 3 items" in desc
        assert "📌 'empty_list': list of 0 items" in desc
        assert "📌 'boolean_value': bool" in desc
        assert "📌 'number': int" in desc
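Taken together, these tests document a small public surface: construct with optional `scratchpad_data`, read through `_run(key=...)`, refresh with `update_scratchpad`, and inspect the self-describing `description`. A quick usage sketch, using only the calls the tests themselves exercise, with expected outputs paraphrased from the assertions above:

    from crewai.tools.agent_tools.scratchpad_tool import ScratchpadTool

    tool = ScratchpadTool()
    assert not tool.scratchpad_data          # starts empty
    print(tool._run(key="emails"))           # "❌ SCRATCHPAD IS EMPTY" guidance message

    tool.update_scratchpad({"emails": ["a@test.com", "b@test.com"]})
    print(tool.description)                  # now advertises 📌 'emails': list of 2 items
    print(tool._run(key="emails"))           # JSON-rendered list containing "a@test.com"
    print(tool._run(key="Emails"))           # "❌ KEY NOT FOUND" (keys are case-sensitive)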