This fixes an issue where multimodal agents failed to format image data correctly when sending requests to LLMs. Image URLs are now structured as multimodal content entries with explicit type fields, matching the OpenAI chat API format.
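For reference, the shape this change targets is sketched below. This is illustrative only: it follows the OpenAI-style multimodal chat format, and the exact set of fields accepted can vary by provider.

# Illustrative sketch of an OpenAI-style multimodal chat message: each content
# item carries an explicit "type" instead of being flattened into plain text.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
    ],
}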

Co-Authored-By: Joe Moura <joao@crewai.com>
Author: Devin AI
Date: 2025-04-10 11:11:38 +00:00
Parent: 98ccbeb4bd
Commit: b16ad09ec7
2 changed files with 130 additions and 16 deletions


@@ -819,8 +819,8 @@ class LLM(BaseLLM):
         )

     def _format_messages_for_provider(
-        self, messages: List[Dict[str, str]]
-    ) -> List[Dict[str, str]]:
+        self, messages: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """Format messages according to provider requirements.

         Args:
@@ -830,6 +830,7 @@ class LLM(BaseLLM):
         Returns:
             List of formatted messages according to provider requirements.
             For Anthropic models, ensures first message has 'user' role.
+            For multimodal content, ensures proper formatting of image URLs.

         Raises:
             TypeError: If messages is None or contains invalid message format.
@@ -844,40 +845,71 @@ class LLM(BaseLLM):
"Invalid message format. Each message must be a dict with 'role' and 'content' keys" "Invalid message format. Each message must be a dict with 'role' and 'content' keys"
) )
formatted_messages = []
for msg in messages:
if isinstance(msg["content"], list):
formatted_content = []
for item in msg["content"]:
if isinstance(item, dict):
if item.get("type") == "text":
formatted_content.append({
"type": "text",
"text": item["text"]
})
elif item.get("type") == "image_url":
formatted_content.append({
"type": "image_url",
"image_url": item["image_url"]
})
else:
formatted_content.append(item)
else:
formatted_content.append({
"type": "text",
"text": str(item)
})
formatted_messages.append({
"role": msg["role"],
"content": formatted_content
})
else:
formatted_messages.append(msg)
# Handle O1 models specially # Handle O1 models specially
if "o1" in self.model.lower(): if "o1" in self.model.lower():
formatted_messages = [] result = []
for msg in messages: for msg in formatted_messages:
# Convert system messages to assistant messages # Convert system messages to assistant messages
if msg["role"] == "system": if msg["role"] == "system":
formatted_messages.append( result.append(
{"role": "assistant", "content": msg["content"]} {"role": "assistant", "content": msg["content"]}
) )
else: else:
formatted_messages.append(msg) result.append(msg)
return formatted_messages return result
# Handle Mistral models - they require the last message to have a role of 'user' or 'tool' # Handle Mistral models - they require the last message to have a role of 'user' or 'tool'
if "mistral" in self.model.lower(): if "mistral" in self.model.lower():
# Check if the last message has a role of 'assistant' # Check if the last message has a role of 'assistant'
if messages and messages[-1]["role"] == "assistant": if formatted_messages and formatted_messages[-1]["role"] == "assistant":
# Add a dummy user message to ensure the last message has a role of 'user' # Add a dummy user message to ensure the last message has a role of 'user'
messages = ( formatted_messages = (
messages.copy() formatted_messages.copy()
) # Create a copy to avoid modifying the original ) # Create a copy to avoid modifying the original
messages.append({"role": "user", "content": "Please continue."}) formatted_messages.append({"role": "user", "content": "Please continue."})
return messages return formatted_messages
# Handle Anthropic models # Handle Anthropic models
if not self.is_anthropic: if not self.is_anthropic:
return messages return formatted_messages
# Anthropic requires messages to start with 'user' role # Anthropic requires messages to start with 'user' role
if not messages or messages[0]["role"] == "system": if not formatted_messages or formatted_messages[0]["role"] == "system":
# If first message is system or empty, add a placeholder user message # If first message is system or empty, add a placeholder user message
return [{"role": "user", "content": "."}, *messages] return [{"role": "user", "content": "."}, *formatted_messages]
return messages return formatted_messages
def _get_custom_llm_provider(self) -> Optional[str]: def _get_custom_llm_provider(self) -> Optional[str]:
""" """


@@ -3820,6 +3820,88 @@ def test_multimodal_agent_live_image_analysis():
assert "error" not in result.raw.lower() # No error messages in response assert "error" not in result.raw.lower() # No error messages in response
def test_format_messages_for_provider_with_multimodal_content():
"""
Test that the _format_messages_for_provider method correctly formats multimodal content.
This specifically tests that image URLs are formatted as structured multimodal content
rather than plain text.
"""
llm = LLM(model="gpt-4o")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Analyze this image:"
},
{
"type": "image_url",
"image_url": "https://example.com/test-image.jpg"
}
]
}
]
formatted_messages = llm._format_messages_for_provider(messages)
assert len(formatted_messages) == 1
assert formatted_messages[0]["role"] == "user"
assert isinstance(formatted_messages[0]["content"], list)
content = formatted_messages[0]["content"]
assert len(content) == 2
assert content[0]["type"] == "text"
assert content[0]["text"] == "Analyze this image:"
assert content[1]["type"] == "image_url"
assert "image_url" in content[1]
messages_with_string_url = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Analyze this image:"
},
{
"type": "image_url",
"image_url": "https://example.com/test-image.jpg"
}
]
}
]
formatted_messages_string_url = llm._format_messages_for_provider(messages_with_string_url)
assert formatted_messages_string_url[0]["content"][1]["type"] == "image_url"
assert "image_url" in formatted_messages_string_url[0]["content"][1]
messages_with_dict_url = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Analyze this image:"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/test-image.jpg"
}
}
]
}
]
formatted_messages_dict_url = llm._format_messages_for_provider(messages_with_dict_url)
assert formatted_messages_dict_url[0]["content"][1]["type"] == "image_url"
assert "image_url" in formatted_messages_dict_url[0]["content"][1]
@pytest.mark.vcr(filter_headers=["authorization"]) @pytest.mark.vcr(filter_headers=["authorization"])
def test_crew_with_failing_task_guardrails(): def test_crew_with_failing_task_guardrails():
"""Test that crew properly handles failing guardrails and retries with validation feedback.""" """Test that crew properly handles failing guardrails and retries with validation feedback."""