From b16ad09ec73f437fdce68675511f4a65645358ff Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 11:11:38 +0000 Subject: [PATCH] This fixes the issue where multimodal agents fail to properly format image data when sending requests to LLMs. The image URL is now correctly structured as a multimodal input with proper type specifications according to the OpenAI API. Co-Authored-By: Joe Moura --- src/crewai/llm.py | 64 +++++++++++++++++++++++++++--------- tests/crew_test.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 16 deletions(-) diff --git a/src/crewai/llm.py b/src/crewai/llm.py index 741544662..b879631df 100644 --- a/src/crewai/llm.py +++ b/src/crewai/llm.py @@ -819,8 +819,8 @@ class LLM(BaseLLM): ) def _format_messages_for_provider( - self, messages: List[Dict[str, str]] - ) -> List[Dict[str, str]]: + self, messages: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: """Format messages according to provider requirements. Args: @@ -830,6 +830,7 @@ class LLM(BaseLLM): Returns: List of formatted messages according to provider requirements. For Anthropic models, ensures first message has 'user' role. + For multimodal content, ensures proper formatting of image URLs. Raises: TypeError: If messages is None or contains invalid message format. @@ -844,40 +845,71 @@ class LLM(BaseLLM): "Invalid message format. Each message must be a dict with 'role' and 'content' keys" ) + formatted_messages = [] + for msg in messages: + if isinstance(msg["content"], list): + formatted_content = [] + for item in msg["content"]: + if isinstance(item, dict): + if item.get("type") == "text": + formatted_content.append({ + "type": "text", + "text": item["text"] + }) + elif item.get("type") == "image_url": + formatted_content.append({ + "type": "image_url", + "image_url": item["image_url"] + }) + else: + formatted_content.append(item) + else: + formatted_content.append({ + "type": "text", + "text": str(item) + }) + + formatted_messages.append({ + "role": msg["role"], + "content": formatted_content + }) + else: + formatted_messages.append(msg) + # Handle O1 models specially if "o1" in self.model.lower(): - formatted_messages = [] - for msg in messages: + result = [] + for msg in formatted_messages: # Convert system messages to assistant messages if msg["role"] == "system": - formatted_messages.append( + result.append( {"role": "assistant", "content": msg["content"]} ) else: - formatted_messages.append(msg) - return formatted_messages + result.append(msg) + return result # Handle Mistral models - they require the last message to have a role of 'user' or 'tool' if "mistral" in self.model.lower(): # Check if the last message has a role of 'assistant' - if messages and messages[-1]["role"] == "assistant": + if formatted_messages and formatted_messages[-1]["role"] == "assistant": # Add a dummy user message to ensure the last message has a role of 'user' - messages = ( - messages.copy() + formatted_messages = ( + formatted_messages.copy() ) # Create a copy to avoid modifying the original - messages.append({"role": "user", "content": "Please continue."}) - return messages + formatted_messages.append({"role": "user", "content": "Please continue."}) + return formatted_messages # Handle Anthropic models if not self.is_anthropic: - return messages + return formatted_messages # Anthropic requires messages to start with 'user' role - if not messages or messages[0]["role"] == "system": + if not formatted_messages or formatted_messages[0]["role"] == "system": # If first message is system or empty, add a placeholder user message - return [{"role": "user", "content": "."}, *messages] + return [{"role": "user", "content": "."}, *formatted_messages] - return messages + return formatted_messages def _get_custom_llm_provider(self) -> Optional[str]: """ diff --git a/tests/crew_test.py b/tests/crew_test.py index d7e4740cd..c465841a4 100644 --- a/tests/crew_test.py +++ b/tests/crew_test.py @@ -3820,6 +3820,88 @@ def test_multimodal_agent_live_image_analysis(): assert "error" not in result.raw.lower() # No error messages in response +def test_format_messages_for_provider_with_multimodal_content(): + """ + Test that the _format_messages_for_provider method correctly formats multimodal content. + This specifically tests that image URLs are formatted as structured multimodal content + rather than plain text. + """ + llm = LLM(model="gpt-4o") + + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze this image:" + }, + { + "type": "image_url", + "image_url": "https://example.com/test-image.jpg" + } + ] + } + ] + + formatted_messages = llm._format_messages_for_provider(messages) + + assert len(formatted_messages) == 1 + assert formatted_messages[0]["role"] == "user" + assert isinstance(formatted_messages[0]["content"], list) + + content = formatted_messages[0]["content"] + assert len(content) == 2 + + assert content[0]["type"] == "text" + assert content[0]["text"] == "Analyze this image:" + + assert content[1]["type"] == "image_url" + assert "image_url" in content[1] + + messages_with_string_url = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze this image:" + }, + { + "type": "image_url", + "image_url": "https://example.com/test-image.jpg" + } + ] + } + ] + + formatted_messages_string_url = llm._format_messages_for_provider(messages_with_string_url) + assert formatted_messages_string_url[0]["content"][1]["type"] == "image_url" + assert "image_url" in formatted_messages_string_url[0]["content"][1] + + messages_with_dict_url = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze this image:" + }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/test-image.jpg" + } + } + ] + } + ] + + formatted_messages_dict_url = llm._format_messages_for_provider(messages_with_dict_url) + assert formatted_messages_dict_url[0]["content"][1]["type"] == "image_url" + assert "image_url" in formatted_messages_dict_url[0]["content"][1] + + @pytest.mark.vcr(filter_headers=["authorization"]) def test_crew_with_failing_task_guardrails(): """Test that crew properly handles failing guardrails and retries with validation feedback."""