From 29d40de65104037dcbb8d8e90161a2a6e08470bc Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 2 Dec 2025 10:13:10 +0000 Subject: [PATCH] fix: properly handle multimodal image content in agents (Issue #4016) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the multimodal image handling in CrewAI agents: 1. CrewAgentExecutor._handle_agent_action: Fixed to properly append multimodal messages without double-wrapping. The add_image tool result is now appended directly if it's already a properly formatted message dict with role and content. 2. ToolUsage._use: Fixed to preserve the raw dict result for add_image tool instead of stringifying it via _format_result(). This ensures the multimodal content structure is maintained. 3. Gemini provider _format_messages_for_gemini: Added proper handling for multimodal content with image_url parts. The provider now: - Converts image_url parts to Gemini's inline_data format - Supports HTTP(S) URLs by fetching and converting to base64 - Supports data URLs by parsing and extracting base64 data - Supports local file paths by reading and converting to base64 4. Added comprehensive tests for all multimodal handling fixes. 
Fixes #4016 Co-Authored-By: João --- .../src/crewai/agents/crew_agent_executor.py | 15 +- .../llms/providers/gemini/completion.py | 202 +++++++- lib/crewai/src/crewai/tools/tool_usage.py | 13 +- .../tests/test_multimodal_image_handling.py | 488 ++++++++++++++++++ 4 files changed, 703 insertions(+), 15 deletions(-) create mode 100644 lib/crewai/tests/test_multimodal_image_handling.py diff --git a/lib/crewai/src/crewai/agents/crew_agent_executor.py b/lib/crewai/src/crewai/agents/crew_agent_executor.py index 5286c532e..d2b1d27a8 100644 --- a/lib/crewai/src/crewai/agents/crew_agent_executor.py +++ b/lib/crewai/src/crewai/agents/crew_agent_executor.py @@ -324,14 +324,25 @@ class CrewAgentExecutor(CrewAgentExecutorMixin): Returns: Updated action or final answer. """ - # Special case for add_image_tool + # Special case for add_image_tool - the result is already a properly + # formatted multimodal message dict with role and content add_image_tool = self._i18n.tools("add_image") if ( isinstance(add_image_tool, dict) and formatted_answer.tool.casefold().strip() == add_image_tool.get("name", "").casefold().strip() ): - self.messages.append({"role": "assistant", "content": tool_result.result}) + # tool_result.result is a dict like {"role": "user", "content": [...]} + # Append it directly without wrapping + result = tool_result.result + if isinstance(result, dict) and "role" in result and "content" in result: + self.messages.append(result) + elif isinstance(result, list): + # If it's just the content list, wrap it with user role + self.messages.append({"role": "user", "content": result}) + else: + # Fallback: wrap as user message + self.messages.append({"role": "user", "content": str(result)}) return formatted_answer return handle_agent_action_core( diff --git a/lib/crewai/src/crewai/llms/providers/gemini/completion.py b/lib/crewai/src/crewai/llms/providers/gemini/completion.py index 1b89d0667..a12e2661f 100644 --- a/lib/crewai/src/crewai/llms/providers/gemini/completion.py +++ 
b/lib/crewai/src/crewai/llms/providers/gemini/completion.py @@ -448,6 +448,7 @@ class GeminiCompletion(BaseLLM): - System messages are separate system_instruction - Content is organized as Content objects with Parts - Roles are 'user' and 'model' (not 'assistant') + - Multimodal content with image_url must be converted to Gemini's format Args: messages: Input messages @@ -465,17 +466,18 @@ class GeminiCompletion(BaseLLM): role = message["role"] content = message["content"] - # Convert content to string if it's a list - if isinstance(content, list): - text_content = " ".join( - str(item.get("text", "")) if isinstance(item, dict) else str(item) - for item in content - ) - else: - text_content = str(content) if content else "" - if role == "system": # Extract system instruction - Gemini handles it separately + if isinstance(content, list): + text_content = " ".join( + str(item.get("text", "")) + if isinstance(item, dict) + else str(item) + for item in content + ) + else: + text_content = str(content) if content else "" + if system_instruction: system_instruction += f"\n\n{text_content}" else: @@ -484,14 +486,190 @@ class GeminiCompletion(BaseLLM): # Convert role for Gemini (assistant -> model) gemini_role = "model" if role == "assistant" else "user" + # Convert content to Gemini Parts + parts = self._convert_content_to_parts(content) + # Create Content object - gemini_content = types.Content( - role=gemini_role, parts=[types.Part.from_text(text=text_content)] - ) + gemini_content = types.Content(role=gemini_role, parts=parts) contents.append(gemini_content) return contents, system_instruction + def _convert_content_to_parts(self, content: Any) -> list[types.Part]: + """Convert message content to Gemini Parts. + + Handles both simple text content and multimodal content with image_url. 
+ + Args: + content: Message content (string or list of content items) + + Returns: + List of Gemini Part objects + """ + if not isinstance(content, list): + # Simple text content + text_content = str(content) if content else "" + return [types.Part.from_text(text=text_content)] + + parts: list[types.Part] = [] + for item in content: + if isinstance(item, dict): + item_type = item.get("type", "") + if item_type == "text": + text = item.get("text", "") + if text: + parts.append(types.Part.from_text(text=str(text))) + elif item_type == "image_url": + # Handle image_url format from OpenAI-compatible APIs + image_url_data = item.get("image_url", {}) + url = image_url_data.get("url", "") if isinstance( + image_url_data, dict + ) else str(image_url_data) + if url: + image_part = self._create_image_part_from_url(url) + if image_part: + parts.append(image_part) + else: + # Unknown type, convert to text + parts.append(types.Part.from_text(text=str(item))) + else: + # Non-dict item, convert to text + parts.append(types.Part.from_text(text=str(item))) + + # Ensure at least one part exists + if not parts: + parts.append(types.Part.from_text(text="")) + + return parts + + def _create_image_part_from_url(self, url: str) -> types.Part | None: + """Create a Gemini Part from an image URL. + + Handles: + - HTTP(S) URLs: Fetches the image and converts to inline_data + - Data URLs: Parses and extracts base64 data + - Local file paths: Reads file and converts to inline_data + + Args: + url: Image URL, data URL, or local file path + + Returns: + Gemini Part object or None if conversion fails + """ + + try: + if url.startswith("data:"): + # Handle data URL (e.g., data:image/png;base64,...) 
+ return self._parse_data_url(url) + if url.startswith(("http://", "https://")): + # Handle HTTP(S) URL - fetch and convert to inline_data + return self._fetch_image_from_url(url) + # Handle local file path + return self._read_local_image(url) + except Exception as e: + logging.warning(f"Failed to create image part from URL '{url}': {e}") + return None + + def _parse_data_url(self, data_url: str) -> types.Part | None: + """Parse a data URL and create a Gemini Part. + + Args: + data_url: Data URL (e.g., data:image/png;base64,...) + + Returns: + Gemini Part object or None if parsing fails + """ + import base64 + + try: + # Parse data URL: data:[<mediatype>][;base64],<data> + if not data_url.startswith("data:"): + return None + + # Split header and data + header_end = data_url.find(",") + if header_end == -1: + return None + + header = data_url[5:header_end] # Skip "data:" + data = data_url[header_end + 1:] + + # Parse mime type and encoding + parts = header.split(";") + mime_type = parts[0] if parts else "application/octet-stream" + is_base64 = "base64" in parts + + if is_base64: + image_bytes = base64.b64decode(data) + else: + image_bytes = data.encode("utf-8") + + return types.Part.from_bytes(data=image_bytes, mime_type=mime_type) + except Exception as e: + logging.warning(f"Failed to parse data URL: {e}") + return None + + def _fetch_image_from_url(self, url: str) -> types.Part | None: + """Fetch an image from a URL and create a Gemini Part. 
+ + Args: + url: HTTP(S) URL of the image + + Returns: + Gemini Part object or None if fetching fails + """ + import mimetypes + import urllib.request + + try: + # Fetch the image with a timeout + # URL scheme is validated in _create_image_part_from_url before calling this method + with urllib.request.urlopen(url, timeout=30) as response: # noqa: S310 + image_bytes = response.read() + content_type = response.headers.get("Content-Type", "") + + # Extract mime type from Content-Type header + if content_type: + mime_type = content_type.split(";")[0].strip() + else: + # Guess from URL extension + mime_type, _ = mimetypes.guess_type(url) + mime_type = mime_type or "image/jpeg" + + return types.Part.from_bytes(data=image_bytes, mime_type=mime_type) + except Exception as e: + logging.warning(f"Failed to fetch image from URL '{url}': {e}") + return None + + def _read_local_image(self, file_path: str) -> types.Part | None: + """Read a local image file and create a Gemini Part. + + Args: + file_path: Path to the local image file + + Returns: + Gemini Part object or None if reading fails + """ + import mimetypes + import os + + try: + if not os.path.exists(file_path): + logging.warning(f"Image file not found: {file_path}") + return None + + # Guess mime type from file extension + mime_type, _ = mimetypes.guess_type(file_path) + mime_type = mime_type or "image/jpeg" + + with open(file_path, "rb") as f: + image_bytes = f.read() + + return types.Part.from_bytes(data=image_bytes, mime_type=mime_type) + except Exception as e: + logging.warning(f"Failed to read local image '{file_path}': {e}") + return None + def _handle_completion( self, contents: list[types.Content], diff --git a/lib/crewai/src/crewai/tools/tool_usage.py b/lib/crewai/src/crewai/tools/tool_usage.py index 6f0e92cb8..79f92ab75 100644 --- a/lib/crewai/src/crewai/tools/tool_usage.py +++ b/lib/crewai/src/crewai/tools/tool_usage.py @@ -313,7 +313,18 @@ class ToolUsage: tool_name=tool.name, attempts=self._run_attempts, ) 
- result = self._format_result(result=result) + + # For add_image tool, preserve the raw dict result for multimodal handling + # The result is a dict like {"role": "user", "content": [...]} that should + # not be stringified + add_image_tool = self._i18n.tools("add_image") + is_add_image_tool = ( + isinstance(add_image_tool, dict) + and tool.name == add_image_tool.get("name", "") + ) + if not is_add_image_tool: + result = self._format_result(result=result) + data = { "result": result, "tool_name": tool.name, diff --git a/lib/crewai/tests/test_multimodal_image_handling.py b/lib/crewai/tests/test_multimodal_image_handling.py new file mode 100644 index 000000000..bc724b450 --- /dev/null +++ b/lib/crewai/tests/test_multimodal_image_handling.py @@ -0,0 +1,488 @@ +"""Tests for multimodal image handling fixes (Issue #4016). + +This module tests the fixes for proper multimodal image handling in CrewAI: +1. CrewAgentExecutor._handle_agent_action properly appends multimodal messages +2. ToolUsage.use preserves raw dict result for add_image tool +3. 
Gemini provider properly converts image_url to Gemini's format +""" + +import base64 +from unittest.mock import MagicMock, patch + +import pytest + + +class TestCrewAgentExecutorMultimodal: + """Tests for CrewAgentExecutor._handle_agent_action with multimodal content.""" + + def test_handle_agent_action_with_add_image_tool_dict_result(self): + """Test that add_image tool result dict is appended directly without wrapping.""" + from crewai.agents.crew_agent_executor import CrewAgentExecutor + from crewai.agents.parser import AgentAction + from crewai.tools.tool_usage import ToolResult + + # Create a mock executor + mock_llm = MagicMock() + mock_task = MagicMock() + mock_crew = MagicMock() + mock_agent = MagicMock() + mock_agent.i18n = MagicMock() + mock_agent.i18n.tools.return_value = {"name": "Add image to content"} + + executor = CrewAgentExecutor( + llm=mock_llm, + task=mock_task, + crew=mock_crew, + agent=mock_agent, + prompt="test prompt", + tools=[], + original_tools=[], + max_iter=1, + ) + executor.messages = [] + executor._i18n = mock_agent.i18n + + # Create a mock add_image tool result (dict with role and content) + multimodal_result = { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + ], + } + + formatted_answer = AgentAction( + tool="Add image to content", + tool_input='{"image_url": "https://example.com/image.jpg"}', + text="Using add image tool", + thought="I need to add an image", + result="", + ) + + tool_result = ToolResult(result=multimodal_result) + + # Call _handle_agent_action + result = executor._handle_agent_action(formatted_answer, tool_result) + + # Verify the message was appended correctly (not double-wrapped) + assert len(executor.messages) == 1 + appended_message = executor.messages[0] + + # The message should be the dict directly, not wrapped + assert appended_message["role"] == "user" + assert 
isinstance(appended_message["content"], list) + assert len(appended_message["content"]) == 2 + assert appended_message["content"][0]["type"] == "text" + assert appended_message["content"][1]["type"] == "image_url" + + def test_handle_agent_action_with_add_image_tool_list_result(self): + """Test that add_image tool result list is wrapped with user role.""" + from crewai.agents.crew_agent_executor import CrewAgentExecutor + from crewai.agents.parser import AgentAction + from crewai.tools.tool_usage import ToolResult + + # Create a mock executor + mock_llm = MagicMock() + mock_task = MagicMock() + mock_crew = MagicMock() + mock_agent = MagicMock() + mock_agent.i18n = MagicMock() + mock_agent.i18n.tools.return_value = {"name": "Add image to content"} + + executor = CrewAgentExecutor( + llm=mock_llm, + task=mock_task, + crew=mock_crew, + agent=mock_agent, + prompt="test prompt", + tools=[], + original_tools=[], + max_iter=1, + ) + executor.messages = [] + executor._i18n = mock_agent.i18n + + # Create a mock add_image tool result (list without role) + content_list = [ + {"type": "text", "text": "Analyze this image"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + ] + + formatted_answer = AgentAction( + tool="Add image to content", + tool_input='{"image_url": "https://example.com/image.jpg"}', + text="Using add image tool", + thought="I need to add an image", + result="", + ) + + tool_result = ToolResult(result=content_list) + + # Call _handle_agent_action + result = executor._handle_agent_action(formatted_answer, tool_result) + + # Verify the message was wrapped with user role + assert len(executor.messages) == 1 + appended_message = executor.messages[0] + + assert appended_message["role"] == "user" + assert appended_message["content"] == content_list + + +class TestToolUsageMultimodal: + """Tests for ToolUsage.use with add_image tool.""" + + def test_tool_usage_preserves_add_image_dict_result(self): + """Test that add_image 
tool result is not stringified.""" + from crewai.tools.tool_usage import ToolUsage + + # Create mock components + mock_tools_handler = MagicMock() + mock_tools_handler.cache = None + mock_task = MagicMock() + mock_task.used_tools = 0 + mock_agent = MagicMock() + mock_agent.key = "test_key" + mock_agent.role = "test_role" + mock_agent.verbose = False + mock_agent.fingerprint = None + mock_agent.tools_results = [] + mock_action = MagicMock() + mock_action.tool = "Add image to content" + mock_action.tool_input = '{"image_url": "https://example.com/image.jpg"}' + + # Mock i18n + mock_i18n = MagicMock() + mock_i18n.tools.return_value = {"name": "Add image to content"} + + # Create a mock add_image tool + mock_tool = MagicMock() + mock_tool.name = "Add image to content" + mock_tool.args_schema = MagicMock() + mock_tool.args_schema.model_json_schema.return_value = { + "properties": {"image_url": {}, "action": {}} + } + + # The tool returns a dict with role and content + multimodal_result = { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + ], + } + mock_tool.invoke.return_value = multimodal_result + + tool_usage = ToolUsage( + tools_handler=mock_tools_handler, + tools=[mock_tool], + task=mock_task, + function_calling_llm=MagicMock(), + agent=mock_agent, + action=mock_action, + ) + tool_usage._i18n = mock_i18n + tool_usage._telemetry = MagicMock() + + # Create a mock calling object + mock_calling = MagicMock() + mock_calling.tool_name = "Add image to content" + mock_calling.arguments = {"image_url": "https://example.com/image.jpg"} + + # Call _use directly + result = tool_usage._use( + tool_string="test", + tool=mock_tool, + calling=mock_calling, + ) + + # The result should be the dict, not a string + assert isinstance(result, dict) + assert result["role"] == "user" + assert isinstance(result["content"], list) + + +class 
TestGeminiMultimodalFormatting: + """Tests for Gemini provider's multimodal content handling.""" + + @pytest.fixture + def mock_gemini_types(self): + """Create mock Gemini types for testing.""" + mock_types = MagicMock() + mock_part = MagicMock() + mock_content = MagicMock() + + # Mock Part.from_text + mock_types.Part.from_text.return_value = mock_part + + # Mock Part.from_bytes + mock_types.Part.from_bytes.return_value = mock_part + + # Mock Content + mock_types.Content.return_value = mock_content + + return mock_types + + def test_convert_content_to_parts_simple_text(self): + """Test converting simple text content to Gemini Parts.""" + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + "google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._convert_content_to_parts = ( + GeminiCompletion._convert_content_to_parts.__get__( + completion, GeminiCompletion + ) + ) + + # Mock types.Part.from_text + mock_part = MagicMock() + with patch( + "crewai.llms.providers.gemini.completion.types.Part.from_text", + return_value=mock_part, + ): + result = completion._convert_content_to_parts("Hello, world!") + + assert len(result) == 1 + + def test_convert_content_to_parts_multimodal_with_image_url(self): + """Test converting multimodal content with image_url to Gemini Parts.""" + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + "google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._convert_content_to_parts = ( + GeminiCompletion._convert_content_to_parts.__get__( + completion, GeminiCompletion + ) + ) + 
completion._create_image_part_from_url = MagicMock( + return_value=MagicMock() + ) + + # Mock types.Part.from_text + mock_text_part = MagicMock() + with patch( + "crewai.llms.providers.gemini.completion.types.Part.from_text", + return_value=mock_text_part, + ): + multimodal_content = [ + {"type": "text", "text": "Analyze this image"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + ] + + result = completion._convert_content_to_parts(multimodal_content) + + # Should have called _create_image_part_from_url + completion._create_image_part_from_url.assert_called_once_with( + "https://example.com/image.jpg" + ) + + def test_parse_data_url(self): + """Test parsing data URLs.""" + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + "google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._parse_data_url = GeminiCompletion._parse_data_url.__get__( + completion, GeminiCompletion + ) + + # Create a test data URL + test_data = b"test image data" + base64_data = base64.b64encode(test_data).decode("utf-8") + data_url = f"data:image/png;base64,{base64_data}" + + # Mock types.Part.from_bytes + mock_part = MagicMock() + with patch( + "crewai.llms.providers.gemini.completion.types.Part.from_bytes", + return_value=mock_part, + ) as mock_from_bytes: + result = completion._parse_data_url(data_url) + + # Verify from_bytes was called with correct arguments + mock_from_bytes.assert_called_once() + call_args = mock_from_bytes.call_args + assert call_args.kwargs["mime_type"] == "image/png" + assert call_args.kwargs["data"] == test_data + + def test_fetch_image_from_url(self): + """Test fetching images from HTTP URLs.""" + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + 
"google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._fetch_image_from_url = ( + GeminiCompletion._fetch_image_from_url.__get__( + completion, GeminiCompletion + ) + ) + + # Mock urllib.request.urlopen + mock_response = MagicMock() + mock_response.read.return_value = b"fake image data" + mock_response.headers.get.return_value = "image/jpeg" + mock_response.__enter__ = MagicMock(return_value=mock_response) + mock_response.__exit__ = MagicMock(return_value=False) + + mock_part = MagicMock() + with ( + patch("urllib.request.urlopen", return_value=mock_response), + patch( + "crewai.llms.providers.gemini.completion.types.Part.from_bytes", + return_value=mock_part, + ) as mock_from_bytes, + ): + result = completion._fetch_image_from_url("https://example.com/image.jpg") + + # Verify from_bytes was called with correct arguments + mock_from_bytes.assert_called_once() + call_args = mock_from_bytes.call_args + assert call_args.kwargs["mime_type"] == "image/jpeg" + assert call_args.kwargs["data"] == b"fake image data" + + def test_read_local_image(self): + """Test reading local image files.""" + import tempfile + import os + + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + "google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._read_local_image = GeminiCompletion._read_local_image.__get__( + completion, GeminiCompletion + ) + + # Create a temporary test file + with tempfile.NamedTemporaryFile( + suffix=".png", delete=False + ) as temp_file: + temp_file.write(b"fake png data") + temp_path = temp_file.name + + try: + mock_part = MagicMock() + with patch( + 
"crewai.llms.providers.gemini.completion.types.Part.from_bytes", + return_value=mock_part, + ) as mock_from_bytes: + result = completion._read_local_image(temp_path) + + # Verify from_bytes was called with correct arguments + mock_from_bytes.assert_called_once() + call_args = mock_from_bytes.call_args + assert call_args.kwargs["mime_type"] == "image/png" + assert call_args.kwargs["data"] == b"fake png data" + finally: + os.unlink(temp_path) + + def test_read_local_image_file_not_found(self): + """Test reading non-existent local image file.""" + with patch.dict( + "sys.modules", + { + "google": MagicMock(), + "google.genai": MagicMock(), + "google.genai.types": MagicMock(), + "google.genai.errors": MagicMock(), + }, + ): + from crewai.llms.providers.gemini.completion import GeminiCompletion + + # Create a mock instance + completion = MagicMock(spec=GeminiCompletion) + completion._read_local_image = GeminiCompletion._read_local_image.__get__( + completion, GeminiCompletion + ) + + result = completion._read_local_image("/nonexistent/path/image.png") + + # Should return None for non-existent file + assert result is None + + +class TestAddImageToolOutput: + """Tests for AddImageTool output format.""" + + def test_add_image_tool_returns_correct_format(self): + """Test that AddImageTool returns the correct multimodal format.""" + from crewai.tools.agent_tools.add_image_tool import AddImageTool + + tool = AddImageTool() + result = tool._run( + image_url="https://example.com/image.jpg", + action="Analyze this image", + ) + + # Verify the result format + assert isinstance(result, dict) + assert result["role"] == "user" + assert isinstance(result["content"], list) + assert len(result["content"]) == 2 + + # Check text part + assert result["content"][0]["type"] == "text" + assert result["content"][0]["text"] == "Analyze this image" + + # Check image_url part + assert result["content"][1]["type"] == "image_url" + assert result["content"][1]["image_url"]["url"] == 
"https://example.com/image.jpg"