mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-08 23:58:34 +00:00
This commit fixes the multimodal image handling in CrewAI agents: 1. CrewAgentExecutor._handle_agent_action: Fixed to properly append multimodal messages without double-wrapping. The add_image tool result is now appended directly if it's already a properly formatted message dict with role and content. 2. ToolUsage._use: Fixed to preserve the raw dict result for add_image tool instead of stringifying it via _format_result(). This ensures the multimodal content structure is maintained. 3. Gemini provider _format_messages_for_gemini: Added proper handling for multimodal content with image_url parts. The provider now: - Converts image_url parts to Gemini's inline_data format - Supports HTTP(S) URLs by fetching and converting to base64 - Supports data URLs by parsing and extracting base64 data - Supports local file paths by reading and converting to base64 4. Added comprehensive tests for all multimodal handling fixes. Fixes #4016 Co-Authored-By: João <joao@crewai.com>
489 lines
17 KiB
Python
489 lines
17 KiB
Python
"""Tests for multimodal image handling fixes (Issue #4016).
|
|
|
|
This module tests the fixes for proper multimodal image handling in CrewAI:
|
|
1. CrewAgentExecutor._handle_agent_action properly appends multimodal messages
|
|
2. ToolUsage.use preserves raw dict result for add_image tool
|
|
3. Gemini provider properly converts image_url to Gemini's format
|
|
"""
|
|
|
|
import base64
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
|
|
class TestCrewAgentExecutorMultimodal:
|
|
"""Tests for CrewAgentExecutor._handle_agent_action with multimodal content."""
|
|
|
|
def test_handle_agent_action_with_add_image_tool_dict_result(self):
|
|
"""Test that add_image tool result dict is appended directly without wrapping."""
|
|
from crewai.agents.crew_agent_executor import CrewAgentExecutor
|
|
from crewai.agents.parser import AgentAction
|
|
from crewai.tools.tool_usage import ToolResult
|
|
|
|
# Create a mock executor
|
|
mock_llm = MagicMock()
|
|
mock_task = MagicMock()
|
|
mock_crew = MagicMock()
|
|
mock_agent = MagicMock()
|
|
mock_agent.i18n = MagicMock()
|
|
mock_agent.i18n.tools.return_value = {"name": "Add image to content"}
|
|
|
|
executor = CrewAgentExecutor(
|
|
llm=mock_llm,
|
|
task=mock_task,
|
|
crew=mock_crew,
|
|
agent=mock_agent,
|
|
prompt="test prompt",
|
|
tools=[],
|
|
original_tools=[],
|
|
max_iter=1,
|
|
)
|
|
executor.messages = []
|
|
executor._i18n = mock_agent.i18n
|
|
|
|
# Create a mock add_image tool result (dict with role and content)
|
|
multimodal_result = {
|
|
"role": "user",
|
|
"content": [
|
|
{"type": "text", "text": "Analyze this image"},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {"url": "https://example.com/image.jpg"},
|
|
},
|
|
],
|
|
}
|
|
|
|
formatted_answer = AgentAction(
|
|
tool="Add image to content",
|
|
tool_input='{"image_url": "https://example.com/image.jpg"}',
|
|
text="Using add image tool",
|
|
thought="I need to add an image",
|
|
result="",
|
|
)
|
|
|
|
tool_result = ToolResult(result=multimodal_result)
|
|
|
|
# Call _handle_agent_action
|
|
result = executor._handle_agent_action(formatted_answer, tool_result)
|
|
|
|
# Verify the message was appended correctly (not double-wrapped)
|
|
assert len(executor.messages) == 1
|
|
appended_message = executor.messages[0]
|
|
|
|
# The message should be the dict directly, not wrapped
|
|
assert appended_message["role"] == "user"
|
|
assert isinstance(appended_message["content"], list)
|
|
assert len(appended_message["content"]) == 2
|
|
assert appended_message["content"][0]["type"] == "text"
|
|
assert appended_message["content"][1]["type"] == "image_url"
|
|
|
|
def test_handle_agent_action_with_add_image_tool_list_result(self):
|
|
"""Test that add_image tool result list is wrapped with user role."""
|
|
from crewai.agents.crew_agent_executor import CrewAgentExecutor
|
|
from crewai.agents.parser import AgentAction
|
|
from crewai.tools.tool_usage import ToolResult
|
|
|
|
# Create a mock executor
|
|
mock_llm = MagicMock()
|
|
mock_task = MagicMock()
|
|
mock_crew = MagicMock()
|
|
mock_agent = MagicMock()
|
|
mock_agent.i18n = MagicMock()
|
|
mock_agent.i18n.tools.return_value = {"name": "Add image to content"}
|
|
|
|
executor = CrewAgentExecutor(
|
|
llm=mock_llm,
|
|
task=mock_task,
|
|
crew=mock_crew,
|
|
agent=mock_agent,
|
|
prompt="test prompt",
|
|
tools=[],
|
|
original_tools=[],
|
|
max_iter=1,
|
|
)
|
|
executor.messages = []
|
|
executor._i18n = mock_agent.i18n
|
|
|
|
# Create a mock add_image tool result (list without role)
|
|
content_list = [
|
|
{"type": "text", "text": "Analyze this image"},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {"url": "https://example.com/image.jpg"},
|
|
},
|
|
]
|
|
|
|
formatted_answer = AgentAction(
|
|
tool="Add image to content",
|
|
tool_input='{"image_url": "https://example.com/image.jpg"}',
|
|
text="Using add image tool",
|
|
thought="I need to add an image",
|
|
result="",
|
|
)
|
|
|
|
tool_result = ToolResult(result=content_list)
|
|
|
|
# Call _handle_agent_action
|
|
result = executor._handle_agent_action(formatted_answer, tool_result)
|
|
|
|
# Verify the message was wrapped with user role
|
|
assert len(executor.messages) == 1
|
|
appended_message = executor.messages[0]
|
|
|
|
assert appended_message["role"] == "user"
|
|
assert appended_message["content"] == content_list
|
|
|
|
|
|
class TestToolUsageMultimodal:
|
|
"""Tests for ToolUsage.use with add_image tool."""
|
|
|
|
def test_tool_usage_preserves_add_image_dict_result(self):
|
|
"""Test that add_image tool result is not stringified."""
|
|
from crewai.tools.tool_usage import ToolUsage
|
|
|
|
# Create mock components
|
|
mock_tools_handler = MagicMock()
|
|
mock_tools_handler.cache = None
|
|
mock_task = MagicMock()
|
|
mock_task.used_tools = 0
|
|
mock_agent = MagicMock()
|
|
mock_agent.key = "test_key"
|
|
mock_agent.role = "test_role"
|
|
mock_agent.verbose = False
|
|
mock_agent.fingerprint = None
|
|
mock_agent.tools_results = []
|
|
mock_action = MagicMock()
|
|
mock_action.tool = "Add image to content"
|
|
mock_action.tool_input = '{"image_url": "https://example.com/image.jpg"}'
|
|
|
|
# Mock i18n
|
|
mock_i18n = MagicMock()
|
|
mock_i18n.tools.return_value = {"name": "Add image to content"}
|
|
|
|
# Create a mock add_image tool
|
|
mock_tool = MagicMock()
|
|
mock_tool.name = "Add image to content"
|
|
mock_tool.args_schema = MagicMock()
|
|
mock_tool.args_schema.model_json_schema.return_value = {
|
|
"properties": {"image_url": {}, "action": {}}
|
|
}
|
|
|
|
# The tool returns a dict with role and content
|
|
multimodal_result = {
|
|
"role": "user",
|
|
"content": [
|
|
{"type": "text", "text": "Analyze this image"},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {"url": "https://example.com/image.jpg"},
|
|
},
|
|
],
|
|
}
|
|
mock_tool.invoke.return_value = multimodal_result
|
|
|
|
tool_usage = ToolUsage(
|
|
tools_handler=mock_tools_handler,
|
|
tools=[mock_tool],
|
|
task=mock_task,
|
|
function_calling_llm=MagicMock(),
|
|
agent=mock_agent,
|
|
action=mock_action,
|
|
)
|
|
tool_usage._i18n = mock_i18n
|
|
tool_usage._telemetry = MagicMock()
|
|
|
|
# Create a mock calling object
|
|
mock_calling = MagicMock()
|
|
mock_calling.tool_name = "Add image to content"
|
|
mock_calling.arguments = {"image_url": "https://example.com/image.jpg"}
|
|
|
|
# Call _use directly
|
|
result = tool_usage._use(
|
|
tool_string="test",
|
|
tool=mock_tool,
|
|
calling=mock_calling,
|
|
)
|
|
|
|
# The result should be the dict, not a string
|
|
assert isinstance(result, dict)
|
|
assert result["role"] == "user"
|
|
assert isinstance(result["content"], list)
|
|
|
|
|
|
class TestGeminiMultimodalFormatting:
|
|
"""Tests for Gemini provider's multimodal content handling."""
|
|
|
|
@pytest.fixture
|
|
def mock_gemini_types(self):
|
|
"""Create mock Gemini types for testing."""
|
|
mock_types = MagicMock()
|
|
mock_part = MagicMock()
|
|
mock_content = MagicMock()
|
|
|
|
# Mock Part.from_text
|
|
mock_types.Part.from_text.return_value = mock_part
|
|
|
|
# Mock Part.from_bytes
|
|
mock_types.Part.from_bytes.return_value = mock_part
|
|
|
|
# Mock Content
|
|
mock_types.Content.return_value = mock_content
|
|
|
|
return mock_types
|
|
|
|
def test_convert_content_to_parts_simple_text(self):
|
|
"""Test converting simple text content to Gemini Parts."""
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._convert_content_to_parts = (
|
|
GeminiCompletion._convert_content_to_parts.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
)
|
|
|
|
# Mock types.Part.from_text
|
|
mock_part = MagicMock()
|
|
with patch(
|
|
"crewai.llms.providers.gemini.completion.types.Part.from_text",
|
|
return_value=mock_part,
|
|
):
|
|
result = completion._convert_content_to_parts("Hello, world!")
|
|
|
|
assert len(result) == 1
|
|
|
|
def test_convert_content_to_parts_multimodal_with_image_url(self):
|
|
"""Test converting multimodal content with image_url to Gemini Parts."""
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._convert_content_to_parts = (
|
|
GeminiCompletion._convert_content_to_parts.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
)
|
|
completion._create_image_part_from_url = MagicMock(
|
|
return_value=MagicMock()
|
|
)
|
|
|
|
# Mock types.Part.from_text
|
|
mock_text_part = MagicMock()
|
|
with patch(
|
|
"crewai.llms.providers.gemini.completion.types.Part.from_text",
|
|
return_value=mock_text_part,
|
|
):
|
|
multimodal_content = [
|
|
{"type": "text", "text": "Analyze this image"},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {"url": "https://example.com/image.jpg"},
|
|
},
|
|
]
|
|
|
|
result = completion._convert_content_to_parts(multimodal_content)
|
|
|
|
# Should have called _create_image_part_from_url
|
|
completion._create_image_part_from_url.assert_called_once_with(
|
|
"https://example.com/image.jpg"
|
|
)
|
|
|
|
def test_parse_data_url(self):
|
|
"""Test parsing data URLs."""
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._parse_data_url = GeminiCompletion._parse_data_url.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
|
|
# Create a test data URL
|
|
test_data = b"test image data"
|
|
base64_data = base64.b64encode(test_data).decode("utf-8")
|
|
data_url = f"data:image/png;base64,{base64_data}"
|
|
|
|
# Mock types.Part.from_bytes
|
|
mock_part = MagicMock()
|
|
with patch(
|
|
"crewai.llms.providers.gemini.completion.types.Part.from_bytes",
|
|
return_value=mock_part,
|
|
) as mock_from_bytes:
|
|
result = completion._parse_data_url(data_url)
|
|
|
|
# Verify from_bytes was called with correct arguments
|
|
mock_from_bytes.assert_called_once()
|
|
call_args = mock_from_bytes.call_args
|
|
assert call_args.kwargs["mime_type"] == "image/png"
|
|
assert call_args.kwargs["data"] == test_data
|
|
|
|
def test_fetch_image_from_url(self):
|
|
"""Test fetching images from HTTP URLs."""
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._fetch_image_from_url = (
|
|
GeminiCompletion._fetch_image_from_url.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
)
|
|
|
|
# Mock urllib.request.urlopen
|
|
mock_response = MagicMock()
|
|
mock_response.read.return_value = b"fake image data"
|
|
mock_response.headers.get.return_value = "image/jpeg"
|
|
mock_response.__enter__ = MagicMock(return_value=mock_response)
|
|
mock_response.__exit__ = MagicMock(return_value=False)
|
|
|
|
mock_part = MagicMock()
|
|
with (
|
|
patch("urllib.request.urlopen", return_value=mock_response),
|
|
patch(
|
|
"crewai.llms.providers.gemini.completion.types.Part.from_bytes",
|
|
return_value=mock_part,
|
|
) as mock_from_bytes,
|
|
):
|
|
result = completion._fetch_image_from_url("https://example.com/image.jpg")
|
|
|
|
# Verify from_bytes was called with correct arguments
|
|
mock_from_bytes.assert_called_once()
|
|
call_args = mock_from_bytes.call_args
|
|
assert call_args.kwargs["mime_type"] == "image/jpeg"
|
|
assert call_args.kwargs["data"] == b"fake image data"
|
|
|
|
def test_read_local_image(self):
|
|
"""Test reading local image files."""
|
|
import tempfile
|
|
import os
|
|
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._read_local_image = GeminiCompletion._read_local_image.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
|
|
# Create a temporary test file
|
|
with tempfile.NamedTemporaryFile(
|
|
suffix=".png", delete=False
|
|
) as temp_file:
|
|
temp_file.write(b"fake png data")
|
|
temp_path = temp_file.name
|
|
|
|
try:
|
|
mock_part = MagicMock()
|
|
with patch(
|
|
"crewai.llms.providers.gemini.completion.types.Part.from_bytes",
|
|
return_value=mock_part,
|
|
) as mock_from_bytes:
|
|
result = completion._read_local_image(temp_path)
|
|
|
|
# Verify from_bytes was called with correct arguments
|
|
mock_from_bytes.assert_called_once()
|
|
call_args = mock_from_bytes.call_args
|
|
assert call_args.kwargs["mime_type"] == "image/png"
|
|
assert call_args.kwargs["data"] == b"fake png data"
|
|
finally:
|
|
os.unlink(temp_path)
|
|
|
|
def test_read_local_image_file_not_found(self):
|
|
"""Test reading non-existent local image file."""
|
|
with patch.dict(
|
|
"sys.modules",
|
|
{
|
|
"google": MagicMock(),
|
|
"google.genai": MagicMock(),
|
|
"google.genai.types": MagicMock(),
|
|
"google.genai.errors": MagicMock(),
|
|
},
|
|
):
|
|
from crewai.llms.providers.gemini.completion import GeminiCompletion
|
|
|
|
# Create a mock instance
|
|
completion = MagicMock(spec=GeminiCompletion)
|
|
completion._read_local_image = GeminiCompletion._read_local_image.__get__(
|
|
completion, GeminiCompletion
|
|
)
|
|
|
|
result = completion._read_local_image("/nonexistent/path/image.png")
|
|
|
|
# Should return None for non-existent file
|
|
assert result is None
|
|
|
|
|
|
class TestAddImageToolOutput:
|
|
"""Tests for AddImageTool output format."""
|
|
|
|
def test_add_image_tool_returns_correct_format(self):
|
|
"""Test that AddImageTool returns the correct multimodal format."""
|
|
from crewai.tools.agent_tools.add_image_tool import AddImageTool
|
|
|
|
tool = AddImageTool()
|
|
result = tool._run(
|
|
image_url="https://example.com/image.jpg",
|
|
action="Analyze this image",
|
|
)
|
|
|
|
# Verify the result format
|
|
assert isinstance(result, dict)
|
|
assert result["role"] == "user"
|
|
assert isinstance(result["content"], list)
|
|
assert len(result["content"]) == 2
|
|
|
|
# Check text part
|
|
assert result["content"][0]["type"] == "text"
|
|
assert result["content"][0]["text"] == "Analyze this image"
|
|
|
|
# Check image_url part
|
|
assert result["content"][1]["type"] == "image_url"
|
|
assert result["content"][1]["image_url"]["url"] == "https://example.com/image.jpg"
|