Implement LLM generations, logprobs, and XML parsing features

- Add support for n generations and logprobs parameters in LLM class
- Extend Agent class to accept LLM generation parameters (llm_n, llm_logprobs, llm_top_logprobs)
- Add return_full_completion parameter to access complete LLM response metadata
- Implement XML parser utility for extracting content from tags like <thinking>
- Add completion metadata support to TaskOutput and LiteAgentOutput classes
- Add comprehensive tests and examples demonstrating new functionality
- Maintain full backward compatibility with existing code

Addresses issue #3052: How to obtain n generations or generations in different tags

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
Devin AI
2025-06-24 05:14:22 +00:00
parent c96d4a6823
commit 39ea952acd
10 changed files with 1090 additions and 27 deletions

View File

@@ -0,0 +1,158 @@
import pytest
from unittest.mock import Mock, patch
from crewai import Agent, Task, Crew, LLM
from crewai.lite_agent import LiteAgent
from crewai.utilities.xml_parser import extract_xml_content
class TestIntegrationLLMFeatures:
"""Integration tests for LLM features with agents and tasks."""
@patch('crewai.llm.litellm.completion')
def test_agent_with_multiple_generations(self, mock_completion):
"""Test agent execution with multiple generations."""
mock_response = Mock()
mock_response.choices = [
Mock(message=Mock(content="Generation 1")),
Mock(message=Mock(content="Generation 2")),
Mock(message=Mock(content="Generation 3")),
]
mock_response.usage = {"prompt_tokens": 20, "completion_tokens": 30}
mock_response.model = "gpt-3.5-turbo"
mock_response.created = 1234567890
mock_response.id = "test-id"
mock_response.object = "chat.completion"
mock_response.system_fingerprint = "test-fingerprint"
mock_completion.return_value = mock_response
llm = LLM(model="gpt-3.5-turbo", n=3, return_full_completion=True)
agent = Agent(
role="writer",
goal="write content",
backstory="You are a writer",
llm=llm,
return_completion_metadata=True,
)
task = Task(
description="Write a short story",
agent=agent,
expected_output="A short story",
)
with patch.object(agent, 'agent_executor') as mock_executor:
mock_executor.invoke.return_value = {"output": "Generation 1"}
result = agent.execute_task(task)
assert result == "Generation 1"
@patch('crewai.llm.litellm.completion')
def test_lite_agent_with_xml_extraction(self, mock_completion):
"""Test LiteAgent with XML content extraction."""
response_with_xml = """
<thinking>
I need to analyze this problem step by step.
First, I'll consider the requirements.
</thinking>
Based on my analysis, here's the solution: The answer is 42.
"""
mock_response = Mock()
mock_response.choices = [Mock(message=Mock(content=response_with_xml))]
mock_response.usage = {"prompt_tokens": 15, "completion_tokens": 25}
mock_response.model = "gpt-3.5-turbo"
mock_response.created = 1234567890
mock_response.id = "test-id"
mock_response.object = "chat.completion"
mock_response.system_fingerprint = "test-fingerprint"
mock_completion.return_value = mock_response
lite_agent = LiteAgent(
role="analyst",
goal="analyze problems",
backstory="You are an analyst",
llm=LLM(model="gpt-3.5-turbo", return_full_completion=True),
)
with patch.object(lite_agent, '_invoke_loop') as mock_invoke:
mock_invoke.return_value = response_with_xml
result = lite_agent.kickoff("Analyze this problem")
thinking_content = extract_xml_content(result.raw, "thinking")
assert thinking_content is not None
assert "step by step" in thinking_content
assert "requirements" in thinking_content
def test_xml_parser_with_complex_agent_output(self):
"""Test XML parser with complex agent output containing multiple tags."""
complex_output = """
<thinking>
This is a complex problem that requires careful analysis.
I need to break it down into steps.
</thinking>
<reasoning>
Step 1: Understand the requirements
Step 2: Analyze the constraints
Step 3: Develop a solution
</reasoning>
<conclusion>
The best approach is to use a systematic methodology.
</conclusion>
Final answer: Use the systematic approach outlined above.
"""
thinking = extract_xml_content(complex_output, "thinking")
reasoning = extract_xml_content(complex_output, "reasoning")
conclusion = extract_xml_content(complex_output, "conclusion")
assert thinking is not None
assert "complex problem" in thinking
assert reasoning is not None
assert "Step 1" in reasoning
assert "Step 2" in reasoning
assert "Step 3" in reasoning
assert conclusion is not None
assert "systematic methodology" in conclusion
@patch('crewai.llm.litellm.completion')
def test_crew_with_llm_parameters(self, mock_completion):
"""Test crew execution with LLM parameters."""
mock_response = Mock()
mock_response.choices = [Mock(message=Mock(content="Test response"))]
mock_response.usage = {"prompt_tokens": 10, "completion_tokens": 5}
mock_response.model = "gpt-3.5-turbo"
mock_response.created = 1234567890
mock_response.id = "test-id"
mock_response.object = "chat.completion"
mock_response.system_fingerprint = "test-fingerprint"
mock_completion.return_value = mock_response
agent = Agent(
role="analyst",
goal="analyze data",
backstory="You are an analyst",
llm_n=2,
llm_logprobs=5,
return_completion_metadata=True,
)
task = Task(
description="Analyze the data",
agent=agent,
expected_output="Analysis results",
)
crew = Crew(agents=[agent], tasks=[task])
with patch.object(crew, 'kickoff') as mock_kickoff:
mock_output = Mock()
mock_output.tasks_output = [Mock(completion_metadata={"choices": mock_response.choices})]
mock_kickoff.return_value = mock_output
result = crew.kickoff()
assert result is not None

View File

@@ -0,0 +1,227 @@
import pytest
from unittest.mock import Mock, patch
from crewai import Agent, Task, LLM
from crewai.tasks.task_output import TaskOutput
from crewai.lite_agent import LiteAgent, LiteAgentOutput
from crewai.utilities.xml_parser import (
extract_xml_content,
extract_all_xml_content,
extract_multiple_xml_tags,
extract_multiple_xml_tags_all,
extract_xml_with_attributes,
remove_xml_tags,
strip_xml_tags_keep_content,
)
class TestLLMGenerationsLogprobs:
"""Test suite for LLM generations and logprobs functionality."""
def test_llm_with_n_parameter(self):
"""Test that LLM accepts n parameter for multiple generations."""
llm = LLM(model="gpt-3.5-turbo", n=3)
assert llm.n == 3
def test_llm_with_logprobs_parameter(self):
"""Test that LLM accepts logprobs parameter."""
llm = LLM(model="gpt-3.5-turbo", logprobs=5)
assert llm.logprobs == 5
def test_llm_with_return_full_completion(self):
"""Test that LLM accepts return_full_completion parameter."""
llm = LLM(model="gpt-3.5-turbo", return_full_completion=True)
assert llm.return_full_completion is True
def test_agent_with_llm_parameters(self):
"""Test that Agent accepts LLM generation parameters."""
agent = Agent(
role="test",
goal="test",
backstory="test",
llm_n=3,
llm_logprobs=5,
llm_top_logprobs=3,
return_completion_metadata=True,
)
assert agent.llm_n == 3
assert agent.llm_logprobs == 5
assert agent.llm_top_logprobs == 3
assert agent.return_completion_metadata is True
@patch('crewai.llm.litellm.completion')
def test_llm_call_returns_full_completion(self, mock_completion):
"""Test that LLM.call can return full completion object."""
mock_response = Mock()
mock_response.choices = [Mock()]
mock_response.choices[0].message.content = "Test response"
mock_response.usage = {"prompt_tokens": 10, "completion_tokens": 5}
mock_response.model = "gpt-3.5-turbo"
mock_response.created = 1234567890
mock_response.id = "test-id"
mock_response.object = "chat.completion"
mock_response.system_fingerprint = "test-fingerprint"
mock_completion.return_value = mock_response
llm = LLM(model="gpt-3.5-turbo", return_full_completion=True)
result = llm.call("Test message")
assert isinstance(result, dict)
assert result["content"] == "Test response"
assert "choices" in result
assert "usage" in result
assert result["model"] == "gpt-3.5-turbo"
def test_task_output_completion_metadata(self):
"""Test TaskOutput with completion metadata."""
mock_choices = [
Mock(message=Mock(content="Generation 1")),
Mock(message=Mock(content="Generation 2")),
]
mock_usage = {"prompt_tokens": 10, "completion_tokens": 15}
completion_metadata = {
"choices": mock_choices,
"usage": mock_usage,
"model": "gpt-3.5-turbo",
}
task_output = TaskOutput(
description="Test task",
raw="Generation 1",
agent="test-agent",
completion_metadata=completion_metadata,
)
generations = task_output.get_generations()
assert generations == ["Generation 1", "Generation 2"]
usage = task_output.get_usage_metrics()
assert usage == mock_usage
def test_lite_agent_output_completion_metadata(self):
"""Test LiteAgentOutput with completion metadata."""
mock_choices = [
Mock(message=Mock(content="Generation 1")),
Mock(message=Mock(content="Generation 2")),
]
mock_usage = {"prompt_tokens": 10, "completion_tokens": 15}
completion_metadata = {
"choices": mock_choices,
"usage": mock_usage,
"model": "gpt-3.5-turbo",
}
output = LiteAgentOutput(
raw="Generation 1",
agent_role="test-agent",
completion_metadata=completion_metadata,
)
generations = output.get_generations()
assert generations == ["Generation 1", "Generation 2"]
usage = output.get_usage_metrics_from_completion()
assert usage == mock_usage
class TestXMLParser:
"""Test suite for XML parsing functionality."""
def test_extract_xml_content_basic(self):
"""Test basic XML content extraction."""
text = "Some text <thinking>This is my thought</thinking> more text"
result = extract_xml_content(text, "thinking")
assert result == "This is my thought"
def test_extract_xml_content_not_found(self):
"""Test XML content extraction when tag not found."""
text = "Some text without the tag"
result = extract_xml_content(text, "thinking")
assert result is None
def test_extract_xml_content_multiline(self):
"""Test XML content extraction with multiline content."""
text = """Some text
<thinking>
This is a multiline
thought process
</thinking>
more text"""
result = extract_xml_content(text, "thinking")
assert "multiline" in result
assert "thought process" in result
def test_extract_all_xml_content(self):
"""Test extracting all occurrences of XML content."""
text = """
<thinking>First thought</thinking>
Some text
<thinking>Second thought</thinking>
"""
result = extract_all_xml_content(text, "thinking")
assert len(result) == 2
assert result[0] == "First thought"
assert result[1] == "Second thought"
def test_extract_multiple_xml_tags(self):
"""Test extracting multiple different XML tags."""
text = """
<thinking>My thoughts</thinking>
<reasoning>My reasoning</reasoning>
<conclusion>My conclusion</conclusion>
"""
result = extract_multiple_xml_tags(text, ["thinking", "reasoning", "conclusion"])
assert result["thinking"] == "My thoughts"
assert result["reasoning"] == "My reasoning"
assert result["conclusion"] == "My conclusion"
def test_extract_multiple_xml_tags_all(self):
"""Test extracting all occurrences of multiple XML tags."""
text = """
<thinking>First thought</thinking>
<reasoning>First reasoning</reasoning>
<thinking>Second thought</thinking>
"""
result = extract_multiple_xml_tags_all(text, ["thinking", "reasoning"])
assert len(result["thinking"]) == 2
assert len(result["reasoning"]) == 1
assert result["thinking"][0] == "First thought"
assert result["thinking"][1] == "Second thought"
def test_extract_xml_with_attributes(self):
"""Test extracting XML with attributes."""
text = '<thinking type="deep" level="2">Complex thought</thinking>'
result = extract_xml_with_attributes(text, "thinking")
assert len(result) == 1
assert result[0]["content"] == "Complex thought"
assert result[0]["attributes"]["type"] == "deep"
assert result[0]["attributes"]["level"] == "2"
def test_remove_xml_tags(self):
"""Test removing XML tags and their content."""
text = "Keep this <thinking>Remove this</thinking> and this"
result = remove_xml_tags(text, ["thinking"])
assert result == "Keep this and this"
def test_strip_xml_tags_keep_content(self):
"""Test stripping XML tags but keeping content."""
text = "Keep this <thinking>Keep this too</thinking> and this"
result = strip_xml_tags_keep_content(text, ["thinking"])
assert result == "Keep this Keep this too and this"
def test_nested_xml_tags(self):
"""Test handling of nested XML tags."""
text = "<outer>Before <inner>nested content</inner> after</outer>"
result = extract_xml_content(text, "outer")
assert "Before" in result
assert "nested content" in result
assert "after" in result
def test_xml_with_special_characters(self):
"""Test XML parsing with special characters."""
text = "<thinking>Content with & < > \" ' characters</thinking>"
result = extract_xml_content(text, "thinking")
assert "&" in result
assert "<" in result
assert ">" in result

View File

@@ -0,0 +1,162 @@
import pytest
from crewai.utilities.xml_parser import (
extract_xml_content,
extract_all_xml_content,
extract_multiple_xml_tags,
remove_xml_tags,
strip_xml_tags_keep_content,
)
class TestXMLParserExamples:
"""Test XML parser with realistic agent output examples."""
def test_agent_thinking_extraction(self):
"""Test extracting thinking content from agent output."""
agent_output = """
I need to solve this problem step by step.
<thinking>
Let me break this down:
1. First, I need to understand the requirements
2. Then, I'll analyze the constraints
3. Finally, I'll propose a solution
The key insight is that we need to balance efficiency with accuracy.
</thinking>
Based on my analysis, here's my recommendation: Use approach A.
"""
thinking = extract_xml_content(agent_output, "thinking")
assert thinking is not None
assert "break this down" in thinking
assert "requirements" in thinking
assert "constraints" in thinking
assert "efficiency with accuracy" in thinking
def test_multiple_reasoning_tags(self):
"""Test extracting multiple reasoning sections."""
agent_output = """
<reasoning>
Initial analysis shows three possible approaches.
</reasoning>
Let me explore each option:
<reasoning>
Option A: Fast but less accurate
Option B: Slow but very accurate
Option C: Balanced approach
</reasoning>
My final recommendation is Option C.
"""
reasoning_sections = extract_all_xml_content(agent_output, "reasoning")
assert len(reasoning_sections) == 2
assert "three possible approaches" in reasoning_sections[0]
assert "Option A" in reasoning_sections[1]
assert "Option B" in reasoning_sections[1]
assert "Option C" in reasoning_sections[1]
def test_complex_agent_workflow(self):
"""Test complex agent output with multiple tag types."""
complex_output = """
<thinking>
This is a complex problem requiring systematic analysis.
I need to consider multiple factors.
</thinking>
<analysis>
Factor 1: Performance requirements
Factor 2: Cost constraints
Factor 3: Time limitations
</analysis>
<reasoning>
Given the analysis above, I believe we should prioritize performance
while keeping costs reasonable. Time is less critical in this case.
</reasoning>
<conclusion>
Recommend Solution X with performance optimizations.
</conclusion>
Final answer: Implement Solution X with the following optimizations...
"""
extracted = extract_multiple_xml_tags(
complex_output,
["thinking", "analysis", "reasoning", "conclusion"]
)
assert extracted["thinking"] is not None
assert "systematic analysis" in extracted["thinking"]
assert extracted["analysis"] is not None
assert "Factor 1" in extracted["analysis"]
assert "Factor 2" in extracted["analysis"]
assert "Factor 3" in extracted["analysis"]
assert extracted["reasoning"] is not None
assert "prioritize performance" in extracted["reasoning"]
assert extracted["conclusion"] is not None
assert "Solution X" in extracted["conclusion"]
def test_clean_output_for_user(self):
"""Test cleaning agent output for user presentation."""
raw_output = """
<thinking>
Internal reasoning that user shouldn't see.
This contains implementation details.
</thinking>
<debug>
Debug information: variable X = 42
</debug>
Here's the answer to your question: The solution is to use method Y.
<internal_notes>
Remember to update the documentation later.
</internal_notes>
This approach will give you the best results.
"""
clean_output = remove_xml_tags(
raw_output,
["thinking", "debug", "internal_notes"]
)
assert "Internal reasoning" not in clean_output
assert "Debug information" not in clean_output
assert "update the documentation" not in clean_output
assert "Here's the answer" in clean_output
assert "method Y" in clean_output
assert "best results" in clean_output
def test_preserve_structured_content(self):
"""Test preserving structured content while removing tags."""
structured_output = """
<steps>
1. Initialize the system
2. Load the configuration
3. Process the data
4. Generate the report
</steps>
Follow these steps to complete the task.
"""
clean_output = strip_xml_tags_keep_content(structured_output, ["steps"])
assert "<steps>" not in clean_output
assert "</steps>" not in clean_output
assert "1. Initialize" in clean_output
assert "2. Load" in clean_output
assert "3. Process" in clean_output
assert "4. Generate" in clean_output
assert "Follow these steps" in clean_output