mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-25 08:08:14 +00:00
This fixes the issue where multimodal agents fail to properly format image data when sending requests to LLMs. The image URL is now correctly structured as a multimodal input with proper type specifications according to the OpenAI API.
Co-Authored-By: Joe Moura <joao@crewai.com>
This commit is contained in:
@@ -819,8 +819,8 @@ class LLM(BaseLLM):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _format_messages_for_provider(
|
def _format_messages_for_provider(
|
||||||
self, messages: List[Dict[str, str]]
|
self, messages: List[Dict[str, Any]]
|
||||||
) -> List[Dict[str, str]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""Format messages according to provider requirements.
|
"""Format messages according to provider requirements.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -830,6 +830,7 @@ class LLM(BaseLLM):
|
|||||||
Returns:
|
Returns:
|
||||||
List of formatted messages according to provider requirements.
|
List of formatted messages according to provider requirements.
|
||||||
For Anthropic models, ensures first message has 'user' role.
|
For Anthropic models, ensures first message has 'user' role.
|
||||||
|
For multimodal content, ensures proper formatting of image URLs.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
TypeError: If messages is None or contains invalid message format.
|
TypeError: If messages is None or contains invalid message format.
|
||||||
@@ -844,40 +845,71 @@ class LLM(BaseLLM):
|
|||||||
"Invalid message format. Each message must be a dict with 'role' and 'content' keys"
|
"Invalid message format. Each message must be a dict with 'role' and 'content' keys"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
formatted_messages = []
|
||||||
|
for msg in messages:
|
||||||
|
if isinstance(msg["content"], list):
|
||||||
|
formatted_content = []
|
||||||
|
for item in msg["content"]:
|
||||||
|
if isinstance(item, dict):
|
||||||
|
if item.get("type") == "text":
|
||||||
|
formatted_content.append({
|
||||||
|
"type": "text",
|
||||||
|
"text": item["text"]
|
||||||
|
})
|
||||||
|
elif item.get("type") == "image_url":
|
||||||
|
formatted_content.append({
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": item["image_url"]
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
formatted_content.append(item)
|
||||||
|
else:
|
||||||
|
formatted_content.append({
|
||||||
|
"type": "text",
|
||||||
|
"text": str(item)
|
||||||
|
})
|
||||||
|
|
||||||
|
formatted_messages.append({
|
||||||
|
"role": msg["role"],
|
||||||
|
"content": formatted_content
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
formatted_messages.append(msg)
|
||||||
|
|
||||||
# Handle O1 models specially
|
# Handle O1 models specially
|
||||||
if "o1" in self.model.lower():
|
if "o1" in self.model.lower():
|
||||||
formatted_messages = []
|
result = []
|
||||||
for msg in messages:
|
for msg in formatted_messages:
|
||||||
# Convert system messages to assistant messages
|
# Convert system messages to assistant messages
|
||||||
if msg["role"] == "system":
|
if msg["role"] == "system":
|
||||||
formatted_messages.append(
|
result.append(
|
||||||
{"role": "assistant", "content": msg["content"]}
|
{"role": "assistant", "content": msg["content"]}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
formatted_messages.append(msg)
|
result.append(msg)
|
||||||
return formatted_messages
|
return result
|
||||||
|
|
||||||
# Handle Mistral models - they require the last message to have a role of 'user' or 'tool'
|
# Handle Mistral models - they require the last message to have a role of 'user' or 'tool'
|
||||||
if "mistral" in self.model.lower():
|
if "mistral" in self.model.lower():
|
||||||
# Check if the last message has a role of 'assistant'
|
# Check if the last message has a role of 'assistant'
|
||||||
if messages and messages[-1]["role"] == "assistant":
|
if formatted_messages and formatted_messages[-1]["role"] == "assistant":
|
||||||
# Add a dummy user message to ensure the last message has a role of 'user'
|
# Add a dummy user message to ensure the last message has a role of 'user'
|
||||||
messages = (
|
formatted_messages = (
|
||||||
messages.copy()
|
formatted_messages.copy()
|
||||||
) # Create a copy to avoid modifying the original
|
) # Create a copy to avoid modifying the original
|
||||||
messages.append({"role": "user", "content": "Please continue."})
|
formatted_messages.append({"role": "user", "content": "Please continue."})
|
||||||
return messages
|
return formatted_messages
|
||||||
|
|
||||||
# Handle Anthropic models
|
# Handle Anthropic models
|
||||||
if not self.is_anthropic:
|
if not self.is_anthropic:
|
||||||
return messages
|
return formatted_messages
|
||||||
|
|
||||||
# Anthropic requires messages to start with 'user' role
|
# Anthropic requires messages to start with 'user' role
|
||||||
if not messages or messages[0]["role"] == "system":
|
if not formatted_messages or formatted_messages[0]["role"] == "system":
|
||||||
# If first message is system or empty, add a placeholder user message
|
# If first message is system or empty, add a placeholder user message
|
||||||
return [{"role": "user", "content": "."}, *messages]
|
return [{"role": "user", "content": "."}, *formatted_messages]
|
||||||
|
|
||||||
return messages
|
return formatted_messages
|
||||||
|
|
||||||
def _get_custom_llm_provider(self) -> Optional[str]:
|
def _get_custom_llm_provider(self) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -3820,6 +3820,88 @@ def test_multimodal_agent_live_image_analysis():
|
|||||||
assert "error" not in result.raw.lower() # No error messages in response
|
assert "error" not in result.raw.lower() # No error messages in response
|
||||||
|
|
||||||
|
|
||||||
|
def test_format_messages_for_provider_with_multimodal_content():
    """Check that `_format_messages_for_provider` keeps multimodal content structured.

    A user message whose content is a list of typed parts must come back as a
    list of typed parts rather than plain text. The image part is exercised
    with both payload shapes callers use: a bare URL string and a
    ``{"url": ...}`` dict.
    """
    llm = LLM(model="gpt-4o")
    prompt = "Analyze this image:"
    image_url = "https://example.com/test-image.jpg"

    # Each payload shape gets its own run. Asserting on the full part (not
    # just the presence of an "image_url" key) catches a formatter that keeps
    # the key but drops or rewrites the URL.
    for image_payload in (image_url, {"url": image_url}):
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": image_payload},
                ],
            }
        ]

        formatted_messages = llm._format_messages_for_provider(messages)

        assert len(formatted_messages) == 1
        assert formatted_messages[0]["role"] == "user"

        content = formatted_messages[0]["content"]
        assert isinstance(content, list)
        assert len(content) == 2

        assert content[0] == {"type": "text", "text": prompt}

        assert content[1]["type"] == "image_url"
        assert content[1]["image_url"] == image_payload
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||||
def test_crew_with_failing_task_guardrails():
|
def test_crew_with_failing_task_guardrails():
|
||||||
"""Test that crew properly handles failing guardrails and retries with validation feedback."""
|
"""Test that crew properly handles failing guardrails and retries with validation feedback."""
|
||||||
|
|||||||
Reference in New Issue
Block a user