Implement LLM generations, logprobs, and XML parsing features

- Add support for n generations and logprobs parameters in LLM class - Extend Agent class to accept LLM generation parameters (llm_n, llm_logprobs, llm_top_logprobs) - Add return_full_completion parameter to access complete LLM response metadata - Implement XML parser utility for extracting content from tags like <thinking> - Add completion metadata support to TaskOutput and LiteAgentOutput classes - Add comprehensive tests and examples demonstrating new functionality - Maintain full backward compatibility with existing code Addresses issue #3052: How to obtain n generations or generations in different tags Co-Authored-By: João <joao@crewai.com>
2026-01-25 08:08:14 +00:00 · 2025-06-24 05:14:22 +00:00
parent c96d4a6823
commit 39ea952acd
10 changed files with 1090 additions and 27 deletions
--- a/tests/test_integration_llm_features.py
+++ b/tests/test_integration_llm_features.py
@@ -0,0 +1,158 @@
+import pytest
+from unittest.mock import Mock, patch
+from crewai import Agent, Task, Crew, LLM
+from crewai.lite_agent import LiteAgent
+from crewai.utilities.xml_parser import extract_xml_content
+
+
+class TestIntegrationLLMFeatures:
+    """Integration tests for LLM features with agents and tasks."""
+
+    @patch('crewai.llm.litellm.completion')
+    def test_agent_with_multiple_generations(self, mock_completion):
+        """Test agent execution with multiple generations."""
+        mock_response = Mock()
+        mock_response.choices = [
+            Mock(message=Mock(content="Generation 1")),
+            Mock(message=Mock(content="Generation 2")),
+            Mock(message=Mock(content="Generation 3")),
+        ]
+        mock_response.usage = {"prompt_tokens": 20, "completion_tokens": 30}
+        mock_response.model = "gpt-3.5-turbo"
+        mock_response.created = 1234567890
+        mock_response.id = "test-id"
+        mock_response.object = "chat.completion"
+        mock_response.system_fingerprint = "test-fingerprint"
+        mock_completion.return_value = mock_response
+
+        llm = LLM(model="gpt-3.5-turbo", n=3, return_full_completion=True)
+        agent = Agent(
+            role="writer",
+            goal="write content",
+            backstory="You are a writer",
+            llm=llm,
+            return_completion_metadata=True,
+        )
+
+        task = Task(
+            description="Write a short story",
+            agent=agent,
+            expected_output="A short story",
+        )
+
+        with patch.object(agent, 'agent_executor') as mock_executor:
+            mock_executor.invoke.return_value = {"output": "Generation 1"}
+            
+            result = agent.execute_task(task)
+            assert result == "Generation 1"
+
+    @patch('crewai.llm.litellm.completion')
+    def test_lite_agent_with_xml_extraction(self, mock_completion):
+        """Test LiteAgent with XML content extraction."""
+        response_with_xml = """
+        <thinking>
+        I need to analyze this problem step by step.
+        First, I'll consider the requirements.
+        </thinking>
+        
+        Based on my analysis, here's the solution: The answer is 42.
+        """
+
+        mock_response = Mock()
+        mock_response.choices = [Mock(message=Mock(content=response_with_xml))]
+        mock_response.usage = {"prompt_tokens": 15, "completion_tokens": 25}
+        mock_response.model = "gpt-3.5-turbo"
+        mock_response.created = 1234567890
+        mock_response.id = "test-id"
+        mock_response.object = "chat.completion"
+        mock_response.system_fingerprint = "test-fingerprint"
+        mock_completion.return_value = mock_response
+
+        lite_agent = LiteAgent(
+            role="analyst",
+            goal="analyze problems",
+            backstory="You are an analyst",
+            llm=LLM(model="gpt-3.5-turbo", return_full_completion=True),
+        )
+
+        with patch.object(lite_agent, '_invoke_loop') as mock_invoke:
+            mock_invoke.return_value = response_with_xml
+            
+            result = lite_agent.kickoff("Analyze this problem")
+            
+            thinking_content = extract_xml_content(result.raw, "thinking")
+            assert thinking_content is not None
+            assert "step by step" in thinking_content
+            assert "requirements" in thinking_content
+
+    def test_xml_parser_with_complex_agent_output(self):
+        """Test XML parser with complex agent output containing multiple tags."""
+        complex_output = """
+        <thinking>
+        This is a complex problem that requires careful analysis.
+        I need to break it down into steps.
+        </thinking>
+        
+        <reasoning>
+        Step 1: Understand the requirements
+        Step 2: Analyze the constraints
+        Step 3: Develop a solution
+        </reasoning>
+        
+        <conclusion>
+        The best approach is to use a systematic methodology.
+        </conclusion>
+        
+        Final answer: Use the systematic approach outlined above.
+        """
+
+        thinking = extract_xml_content(complex_output, "thinking")
+        reasoning = extract_xml_content(complex_output, "reasoning")
+        conclusion = extract_xml_content(complex_output, "conclusion")
+
+        assert thinking is not None
+        assert "complex problem" in thinking
+        assert reasoning is not None
+        assert "Step 1" in reasoning
+        assert "Step 2" in reasoning
+        assert "Step 3" in reasoning
+        assert conclusion is not None
+        assert "systematic methodology" in conclusion
+
+    @patch('crewai.llm.litellm.completion')
+    def test_crew_with_llm_parameters(self, mock_completion):
+        """Test crew execution with LLM parameters."""
+        mock_response = Mock()
+        mock_response.choices = [Mock(message=Mock(content="Test response"))]
+        mock_response.usage = {"prompt_tokens": 10, "completion_tokens": 5}
+        mock_response.model = "gpt-3.5-turbo"
+        mock_response.created = 1234567890
+        mock_response.id = "test-id"
+        mock_response.object = "chat.completion"
+        mock_response.system_fingerprint = "test-fingerprint"
+        mock_completion.return_value = mock_response
+
+        agent = Agent(
+            role="analyst",
+            goal="analyze data",
+            backstory="You are an analyst",
+            llm_n=2,
+            llm_logprobs=5,
+            return_completion_metadata=True,
+        )
+
+        task = Task(
+            description="Analyze the data",
+            agent=agent,
+            expected_output="Analysis results",
+        )
+
+        crew = Crew(agents=[agent], tasks=[task])
+        
+        with patch.object(crew, 'kickoff') as mock_kickoff:
+            mock_output = Mock()
+            mock_output.tasks_output = [Mock(completion_metadata={"choices": mock_response.choices})]
+            mock_kickoff.return_value = mock_output
+            
+            result = crew.kickoff()
+            assert result is not None
--- a/tests/test_llm_generations_logprobs.py
+++ b/tests/test_llm_generations_logprobs.py
@@ -0,0 +1,227 @@
+import pytest
+from unittest.mock import Mock, patch
+from crewai import Agent, Task, LLM
+from crewai.tasks.task_output import TaskOutput
+from crewai.lite_agent import LiteAgent, LiteAgentOutput
+from crewai.utilities.xml_parser import (
+    extract_xml_content,
+    extract_all_xml_content,
+    extract_multiple_xml_tags,
+    extract_multiple_xml_tags_all,
+    extract_xml_with_attributes,
+    remove_xml_tags,
+    strip_xml_tags_keep_content,
+)
+
+
+class TestLLMGenerationsLogprobs:
+    """Test suite for LLM generations and logprobs functionality."""
+
+    def test_llm_with_n_parameter(self):
+        """Test that LLM accepts n parameter for multiple generations."""
+        llm = LLM(model="gpt-3.5-turbo", n=3)
+        assert llm.n == 3
+
+    def test_llm_with_logprobs_parameter(self):
+        """Test that LLM accepts logprobs parameter."""
+        llm = LLM(model="gpt-3.5-turbo", logprobs=5)
+        assert llm.logprobs == 5
+
+    def test_llm_with_return_full_completion(self):
+        """Test that LLM accepts return_full_completion parameter."""
+        llm = LLM(model="gpt-3.5-turbo", return_full_completion=True)
+        assert llm.return_full_completion is True
+
+    def test_agent_with_llm_parameters(self):
+        """Test that Agent accepts LLM generation parameters."""
+        agent = Agent(
+            role="test",
+            goal="test",
+            backstory="test",
+            llm_n=3,
+            llm_logprobs=5,
+            llm_top_logprobs=3,
+            return_completion_metadata=True,
+        )
+        assert agent.llm_n == 3
+        assert agent.llm_logprobs == 5
+        assert agent.llm_top_logprobs == 3
+        assert agent.return_completion_metadata is True
+
+    @patch('crewai.llm.litellm.completion')
+    def test_llm_call_returns_full_completion(self, mock_completion):
+        """Test that LLM.call can return full completion object."""
+        mock_response = Mock()
+        mock_response.choices = [Mock()]
+        mock_response.choices[0].message.content = "Test response"
+        mock_response.usage = {"prompt_tokens": 10, "completion_tokens": 5}
+        mock_response.model = "gpt-3.5-turbo"
+        mock_response.created = 1234567890
+        mock_response.id = "test-id"
+        mock_response.object = "chat.completion"
+        mock_response.system_fingerprint = "test-fingerprint"
+        mock_completion.return_value = mock_response
+
+        llm = LLM(model="gpt-3.5-turbo", return_full_completion=True)
+        result = llm.call("Test message")
+
+        assert isinstance(result, dict)
+        assert result["content"] == "Test response"
+        assert "choices" in result
+        assert "usage" in result
+        assert result["model"] == "gpt-3.5-turbo"
+
+    def test_task_output_completion_metadata(self):
+        """Test TaskOutput with completion metadata."""
+        mock_choices = [
+            Mock(message=Mock(content="Generation 1")),
+            Mock(message=Mock(content="Generation 2")),
+        ]
+        mock_usage = {"prompt_tokens": 10, "completion_tokens": 15}
+        
+        completion_metadata = {
+            "choices": mock_choices,
+            "usage": mock_usage,
+            "model": "gpt-3.5-turbo",
+        }
+
+        task_output = TaskOutput(
+            description="Test task",
+            raw="Generation 1",
+            agent="test-agent",
+            completion_metadata=completion_metadata,
+        )
+
+        generations = task_output.get_generations()
+        assert generations == ["Generation 1", "Generation 2"]
+
+        usage = task_output.get_usage_metrics()
+        assert usage == mock_usage
+
+    def test_lite_agent_output_completion_metadata(self):
+        """Test LiteAgentOutput with completion metadata."""
+        mock_choices = [
+            Mock(message=Mock(content="Generation 1")),
+            Mock(message=Mock(content="Generation 2")),
+        ]
+        mock_usage = {"prompt_tokens": 10, "completion_tokens": 15}
+        
+        completion_metadata = {
+            "choices": mock_choices,
+            "usage": mock_usage,
+            "model": "gpt-3.5-turbo",
+        }
+
+        output = LiteAgentOutput(
+            raw="Generation 1",
+            agent_role="test-agent",
+            completion_metadata=completion_metadata,
+        )
+
+        generations = output.get_generations()
+        assert generations == ["Generation 1", "Generation 2"]
+
+        usage = output.get_usage_metrics_from_completion()
+        assert usage == mock_usage
+
+
+class TestXMLParser:
+    """Test suite for XML parsing functionality."""
+
+    def test_extract_xml_content_basic(self):
+        """Test basic XML content extraction."""
+        text = "Some text <thinking>This is my thought</thinking> more text"
+        result = extract_xml_content(text, "thinking")
+        assert result == "This is my thought"
+
+    def test_extract_xml_content_not_found(self):
+        """Test XML content extraction when tag not found."""
+        text = "Some text without the tag"
+        result = extract_xml_content(text, "thinking")
+        assert result is None
+
+    def test_extract_xml_content_multiline(self):
+        """Test XML content extraction with multiline content."""
+        text = """Some text
+        <thinking>
+        This is a multiline
+        thought process
+        </thinking>
+        more text"""
+        result = extract_xml_content(text, "thinking")
+        assert "multiline" in result
+        assert "thought process" in result
+
+    def test_extract_all_xml_content(self):
+        """Test extracting all occurrences of XML content."""
+        text = """
+        <thinking>First thought</thinking>
+        Some text
+        <thinking>Second thought</thinking>
+        """
+        result = extract_all_xml_content(text, "thinking")
+        assert len(result) == 2
+        assert result[0] == "First thought"
+        assert result[1] == "Second thought"
+
+    def test_extract_multiple_xml_tags(self):
+        """Test extracting multiple different XML tags."""
+        text = """
+        <thinking>My thoughts</thinking>
+        <reasoning>My reasoning</reasoning>
+        <conclusion>My conclusion</conclusion>
+        """
+        result = extract_multiple_xml_tags(text, ["thinking", "reasoning", "conclusion"])
+        assert result["thinking"] == "My thoughts"
+        assert result["reasoning"] == "My reasoning"
+        assert result["conclusion"] == "My conclusion"
+
+    def test_extract_multiple_xml_tags_all(self):
+        """Test extracting all occurrences of multiple XML tags."""
+        text = """
+        <thinking>First thought</thinking>
+        <reasoning>First reasoning</reasoning>
+        <thinking>Second thought</thinking>
+        """
+        result = extract_multiple_xml_tags_all(text, ["thinking", "reasoning"])
+        assert len(result["thinking"]) == 2
+        assert len(result["reasoning"]) == 1
+        assert result["thinking"][0] == "First thought"
+        assert result["thinking"][1] == "Second thought"
+
+    def test_extract_xml_with_attributes(self):
+        """Test extracting XML with attributes."""
+        text = '<thinking type="deep" level="2">Complex thought</thinking>'
+        result = extract_xml_with_attributes(text, "thinking")
+        assert len(result) == 1
+        assert result[0]["content"] == "Complex thought"
+        assert result[0]["attributes"]["type"] == "deep"
+        assert result[0]["attributes"]["level"] == "2"
+
+    def test_remove_xml_tags(self):
+        """Test removing XML tags and their content."""
+        text = "Keep this <thinking>Remove this</thinking> and this"
+        result = remove_xml_tags(text, ["thinking"])
+        assert result == "Keep this and this"
+
+    def test_strip_xml_tags_keep_content(self):
+        """Test stripping XML tags but keeping content."""
+        text = "Keep this <thinking>Keep this too</thinking> and this"
+        result = strip_xml_tags_keep_content(text, ["thinking"])
+        assert result == "Keep this Keep this too and this"
+
+    def test_nested_xml_tags(self):
+        """Test handling of nested XML tags."""
+        text = "<outer>Before <inner>nested content</inner> after</outer>"
+        result = extract_xml_content(text, "outer")
+        assert "Before" in result
+        assert "nested content" in result
+        assert "after" in result
+
+    def test_xml_with_special_characters(self):
+        """Test XML parsing with special characters."""
+        text = "<thinking>Content with & < > \" ' characters</thinking>"
+        result = extract_xml_content(text, "thinking")
+        assert "&" in result
+        assert "<" in result
+        assert ">" in result
--- a/tests/test_xml_parser_examples.py
+++ b/tests/test_xml_parser_examples.py
@@ -0,0 +1,162 @@
+import pytest
+from crewai.utilities.xml_parser import (
+    extract_xml_content,
+    extract_all_xml_content,
+    extract_multiple_xml_tags,
+    remove_xml_tags,
+    strip_xml_tags_keep_content,
+)
+
+
+class TestXMLParserExamples:
+    """Test XML parser with realistic agent output examples."""
+
+    def test_agent_thinking_extraction(self):
+        """Test extracting thinking content from agent output."""
+        agent_output = """
+        I need to solve this problem step by step.
+        
+        <thinking>
+        Let me break this down:
+        1. First, I need to understand the requirements
+        2. Then, I'll analyze the constraints
+        3. Finally, I'll propose a solution
+        
+        The key insight is that we need to balance efficiency with accuracy.
+        </thinking>
+        
+        Based on my analysis, here's my recommendation: Use approach A.
+        """
+        
+        thinking = extract_xml_content(agent_output, "thinking")
+        assert thinking is not None
+        assert "break this down" in thinking
+        assert "requirements" in thinking
+        assert "constraints" in thinking
+        assert "efficiency with accuracy" in thinking
+
+    def test_multiple_reasoning_tags(self):
+        """Test extracting multiple reasoning sections."""
+        agent_output = """
+        <reasoning>
+        Initial analysis shows three possible approaches.
+        </reasoning>
+        
+        Let me explore each option:
+        
+        <reasoning>
+        Option A: Fast but less accurate
+        Option B: Slow but very accurate  
+        Option C: Balanced approach
+        </reasoning>
+        
+        My final recommendation is Option C.
+        """
+        
+        reasoning_sections = extract_all_xml_content(agent_output, "reasoning")
+        assert len(reasoning_sections) == 2
+        assert "three possible approaches" in reasoning_sections[0]
+        assert "Option A" in reasoning_sections[1]
+        assert "Option B" in reasoning_sections[1]
+        assert "Option C" in reasoning_sections[1]
+
+    def test_complex_agent_workflow(self):
+        """Test complex agent output with multiple tag types."""
+        complex_output = """
+        <thinking>
+        This is a complex problem requiring systematic analysis.
+        I need to consider multiple factors.
+        </thinking>
+        
+        <analysis>
+        Factor 1: Performance requirements
+        Factor 2: Cost constraints
+        Factor 3: Time limitations
+        </analysis>
+        
+        <reasoning>
+        Given the analysis above, I believe we should prioritize performance
+        while keeping costs reasonable. Time is less critical in this case.
+        </reasoning>
+        
+        <conclusion>
+        Recommend Solution X with performance optimizations.
+        </conclusion>
+        
+        Final answer: Implement Solution X with the following optimizations...
+        """
+        
+        extracted = extract_multiple_xml_tags(
+            complex_output, 
+            ["thinking", "analysis", "reasoning", "conclusion"]
+        )
+        
+        assert extracted["thinking"] is not None
+        assert "systematic analysis" in extracted["thinking"]
+        
+        assert extracted["analysis"] is not None
+        assert "Factor 1" in extracted["analysis"]
+        assert "Factor 2" in extracted["analysis"]
+        assert "Factor 3" in extracted["analysis"]
+        
+        assert extracted["reasoning"] is not None
+        assert "prioritize performance" in extracted["reasoning"]
+        
+        assert extracted["conclusion"] is not None
+        assert "Solution X" in extracted["conclusion"]
+
+    def test_clean_output_for_user(self):
+        """Test cleaning agent output for user presentation."""
+        raw_output = """
+        <thinking>
+        Internal reasoning that user shouldn't see.
+        This contains implementation details.
+        </thinking>
+        
+        <debug>
+        Debug information: variable X = 42
+        </debug>
+        
+        Here's the answer to your question: The solution is to use method Y.
+        
+        <internal_notes>
+        Remember to update the documentation later.
+        </internal_notes>
+        
+        This approach will give you the best results.
+        """
+        
+        clean_output = remove_xml_tags(
+            raw_output, 
+            ["thinking", "debug", "internal_notes"]
+        )
+        
+        assert "Internal reasoning" not in clean_output
+        assert "Debug information" not in clean_output
+        assert "update the documentation" not in clean_output
+        assert "Here's the answer" in clean_output
+        assert "method Y" in clean_output
+        assert "best results" in clean_output
+
+    def test_preserve_structured_content(self):
+        """Test preserving structured content while removing tags."""
+        structured_output = """
+        <steps>
+        1. Initialize the system
+        2. Load the configuration
+        3. Process the data
+        4. Generate the report
+        </steps>
+        
+        Follow these steps to complete the task.
+        """
+        
+        clean_output = strip_xml_tags_keep_content(structured_output, ["steps"])
+        
+        assert "<steps>" not in clean_output
+        assert "</steps>" not in clean_output
+        assert "1. Initialize" in clean_output
+        assert "2. Load" in clean_output
+        assert "3. Process" in clean_output
+        assert "4. Generate" in clean_output
+        assert "Follow these steps" in clean_output