mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-14 23:12:37 +00:00
fix: allow text files with non-multimodal models (#5137)
TextFiles passed via input_files incorrectly triggered a 'Model does not support multimodal input' error for non-vision-capable models. Text files are now inlined as text content in the message instead of being rejected.

Changes:
- base_llm.py: Rewrite _process_message_files to distinguish text files from binary files; add _is_text_file helper
- llm.py: Apply same text-file inlining logic in both sync and async _process_message_files methods
- crew.py: Recognize text file MIME types as auto-injectable so they don't require the read_file tool
- task.py: Same text-file auto-injection logic in prompt method
- tests: Add 17 tests covering text file inlining, image rejection, mixed files, _is_text_file helper, and edge cases

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
@@ -1338,7 +1338,13 @@ class Crew(FlowTrackable, BaseModel):
|
||||
api = getattr(agent.llm, "api", None)
|
||||
supported_types = get_supported_content_types(provider, api)
|
||||
|
||||
# Text files are always auto-injected (inlined as text), even
|
||||
# when the model does not support multimodal input.
|
||||
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
|
||||
|
||||
def is_auto_injected(content_type: str) -> bool:
    """Tell whether a file of ``content_type`` is injected automatically.

    Text-like types (those starting with an entry of ``text_prefixes``)
    always qualify, since they are inlined as text even when the model
    does not support multimodal input. Any other type qualifies only
    when it starts with one of the provider's ``supported_types``.
    """
    # str.startswith accepts a tuple, so a single call covers every
    # text prefix.
    if content_type.startswith(text_prefixes):
        return True
    for supported in supported_types:
        if content_type.startswith(supported):
            return True
    return False
|
||||
|
||||
# Only add read_file tool if there are files that need it
|
||||
|
||||
@@ -54,7 +54,10 @@ from crewai.utilities.string_utils import sanitize_tool_name
|
||||
|
||||
|
||||
try:
|
||||
from crewai_files import aformat_multimodal_content, format_multimodal_content
|
||||
from crewai_files import (
|
||||
aformat_multimodal_content,
|
||||
format_multimodal_content,
|
||||
)
|
||||
|
||||
HAS_CREWAI_FILES = True
|
||||
except ImportError:
|
||||
@@ -2039,6 +2042,10 @@ class LLM(BaseLLM):
|
||||
For each message with a `files` field, formats the files into
|
||||
provider-specific content blocks and updates the message content.
|
||||
|
||||
Text files (TextFile instances or files with text/* / application/json /
|
||||
application/xml / application/x-yaml content types) are always inlined
|
||||
as text content, even when the model does not support multimodal input.
|
||||
|
||||
Args:
|
||||
messages: List of messages that may contain file attachments.
|
||||
|
||||
@@ -2049,11 +2056,55 @@ class LLM(BaseLLM):
|
||||
return messages
|
||||
|
||||
if not self.supports_multimodal():
|
||||
if any(msg.get("files") for msg in messages):
|
||||
# Inline text files as text; reject non-text files
|
||||
has_non_text = False
|
||||
for msg in messages:
|
||||
files = msg.get("files")
|
||||
if not files:
|
||||
continue
|
||||
|
||||
text_parts: list[str] = []
|
||||
non_text_files: dict[str, Any] = {}
|
||||
for name, file_input in files.items():
|
||||
if self._is_text_file(file_input):
|
||||
try:
|
||||
content = file_input.read_text()
|
||||
text_parts.append(
|
||||
f"--- Content of file '{name}' ---\n{content}"
|
||||
)
|
||||
except Exception:
|
||||
non_text_files[name] = file_input
|
||||
else:
|
||||
non_text_files[name] = file_input
|
||||
|
||||
if non_text_files:
|
||||
has_non_text = True
|
||||
|
||||
if text_parts:
|
||||
existing_content = msg.get("content", "")
|
||||
inlined = "\n\n".join(text_parts)
|
||||
if isinstance(existing_content, str):
|
||||
msg["content"] = (
|
||||
f"{existing_content}\n\n{inlined}"
|
||||
if existing_content
|
||||
else inlined
|
||||
)
|
||||
elif isinstance(existing_content, list):
|
||||
msg["content"] = [
|
||||
*existing_content,
|
||||
self.format_text_content(inlined),
|
||||
]
|
||||
|
||||
if non_text_files:
|
||||
msg["files"] = non_text_files
|
||||
else:
|
||||
msg.pop("files", None)
|
||||
|
||||
if has_non_text:
|
||||
raise ValueError(
|
||||
f"Model '{self.model}' does not support multimodal input, "
|
||||
"but files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the file inputs."
|
||||
"but non-text files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the non-text file inputs."
|
||||
)
|
||||
return messages
|
||||
|
||||
@@ -2090,6 +2141,10 @@ class LLM(BaseLLM):
|
||||
For each message with a `files` field, formats the files into
|
||||
provider-specific content blocks and updates the message content.
|
||||
|
||||
Text files (TextFile instances or files with text/* / application/json /
|
||||
application/xml / application/x-yaml content types) are always inlined
|
||||
as text content, even when the model does not support multimodal input.
|
||||
|
||||
Args:
|
||||
messages: List of messages that may contain file attachments.
|
||||
|
||||
@@ -2100,11 +2155,55 @@ class LLM(BaseLLM):
|
||||
return messages
|
||||
|
||||
if not self.supports_multimodal():
|
||||
if any(msg.get("files") for msg in messages):
|
||||
# Inline text files as text; reject non-text files
|
||||
has_non_text = False
|
||||
for msg in messages:
|
||||
files = msg.get("files")
|
||||
if not files:
|
||||
continue
|
||||
|
||||
text_parts: list[str] = []
|
||||
non_text_files: dict[str, Any] = {}
|
||||
for name, file_input in files.items():
|
||||
if self._is_text_file(file_input):
|
||||
try:
|
||||
content = file_input.read_text()
|
||||
text_parts.append(
|
||||
f"--- Content of file '{name}' ---\n{content}"
|
||||
)
|
||||
except Exception:
|
||||
non_text_files[name] = file_input
|
||||
else:
|
||||
non_text_files[name] = file_input
|
||||
|
||||
if non_text_files:
|
||||
has_non_text = True
|
||||
|
||||
if text_parts:
|
||||
existing_content = msg.get("content", "")
|
||||
inlined = "\n\n".join(text_parts)
|
||||
if isinstance(existing_content, str):
|
||||
msg["content"] = (
|
||||
f"{existing_content}\n\n{inlined}"
|
||||
if existing_content
|
||||
else inlined
|
||||
)
|
||||
elif isinstance(existing_content, list):
|
||||
msg["content"] = [
|
||||
*existing_content,
|
||||
self.format_text_content(inlined),
|
||||
]
|
||||
|
||||
if non_text_files:
|
||||
msg["files"] = non_text_files
|
||||
else:
|
||||
msg.pop("files", None)
|
||||
|
||||
if has_non_text:
|
||||
raise ValueError(
|
||||
f"Model '{self.model}' does not support multimodal input, "
|
||||
"but files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the file inputs."
|
||||
"but non-text files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the non-text file inputs."
|
||||
)
|
||||
return messages
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ from crewai.types.usage_metrics import UsageMetrics
|
||||
|
||||
|
||||
try:
|
||||
from crewai_files import format_multimodal_content
|
||||
from crewai_files import TextFile, format_multimodal_content
|
||||
|
||||
HAS_CREWAI_FILES = True
|
||||
except ImportError:
|
||||
@@ -635,6 +635,10 @@ class BaseLLM(ABC):
|
||||
For each message with a `files` field, formats the files into
|
||||
provider-specific content blocks and updates the message content.
|
||||
|
||||
Text files (TextFile instances or files with text/* / application/json /
|
||||
application/xml / application/x-yaml content types) are always inlined
|
||||
as text content, even when the model does not support multimodal input.
|
||||
|
||||
Args:
|
||||
messages: List of messages that may contain file attachments.
|
||||
|
||||
@@ -644,12 +648,61 @@ class BaseLLM(ABC):
|
||||
if not HAS_CREWAI_FILES:
|
||||
return messages
|
||||
|
||||
if not self.supports_multimodal():
|
||||
if any(msg.get("files") for msg in messages):
|
||||
is_multimodal = self.supports_multimodal()
|
||||
|
||||
if not is_multimodal:
|
||||
# Inline text files as text; reject non-text files
|
||||
has_non_text = False
|
||||
for msg in messages:
|
||||
files = msg.get("files")
|
||||
if not files:
|
||||
continue
|
||||
|
||||
text_parts: list[str] = []
|
||||
non_text_files: dict[str, Any] = {}
|
||||
for name, file_input in files.items():
|
||||
if self._is_text_file(file_input):
|
||||
try:
|
||||
content = file_input.read_text()
|
||||
text_parts.append(
|
||||
f"--- Content of file '{name}' ---\n{content}"
|
||||
)
|
||||
except Exception:
|
||||
# If reading fails, fall back to tool-based access
|
||||
non_text_files[name] = file_input
|
||||
else:
|
||||
non_text_files[name] = file_input
|
||||
|
||||
if non_text_files:
|
||||
has_non_text = True
|
||||
|
||||
# Append inlined text content to the message
|
||||
if text_parts:
|
||||
existing_content = msg.get("content", "")
|
||||
inlined = "\n\n".join(text_parts)
|
||||
if isinstance(existing_content, str):
|
||||
msg["content"] = (
|
||||
f"{existing_content}\n\n{inlined}"
|
||||
if existing_content
|
||||
else inlined
|
||||
)
|
||||
elif isinstance(existing_content, list):
|
||||
msg["content"] = [
|
||||
*existing_content,
|
||||
self.format_text_content(inlined),
|
||||
]
|
||||
|
||||
# Keep only non-text files (for tool-based access)
|
||||
if non_text_files:
|
||||
msg["files"] = non_text_files
|
||||
else:
|
||||
msg.pop("files", None)
|
||||
|
||||
if has_non_text:
|
||||
raise ValueError(
|
||||
f"Model '{self.model}' does not support multimodal input, "
|
||||
"but files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the file inputs."
|
||||
"but non-text files were provided via 'input_files'. "
|
||||
"Use a vision-capable model or remove the non-text file inputs."
|
||||
)
|
||||
return messages
|
||||
|
||||
@@ -680,6 +733,25 @@ class BaseLLM(ABC):
|
||||
|
||||
return messages
|
||||
|
||||
@staticmethod
def _is_text_file(file_input: Any) -> bool:
    """Check whether a file input is a text file.

    Returns True for TextFile instances, or for objects whose
    ``content_type`` has a ``text/*`` media type or is one of the common
    text-based application types (application/json, application/xml,
    application/x-yaml). MIME parameters (e.g. ``; charset=utf-8``),
    surrounding whitespace and letter case are ignored when matching.

    Args:
        file_input: The file input to classify. Only its type and its
            optional ``content_type`` attribute are inspected.

    Returns:
        True if the input should be treated as text, False otherwise
        (including when ``content_type`` is missing or not a string).
    """
    if HAS_CREWAI_FILES and isinstance(file_input, TextFile):
        return True

    content_type = getattr(file_input, "content_type", None)
    if not isinstance(content_type, str):
        # An attribute that exists but is None (or any non-string) cannot
        # be classified as text; report False instead of failing on a
        # str-only method call.
        return False

    # Keep only the "type/subtype" part: media types are matched
    # case-insensitively and may carry parameters after a ';'.
    media_type = content_type.partition(";")[0].strip().lower()
    if media_type.startswith("text/"):
        return True
    return media_type in (
        "application/json",
        "application/xml",
        "application/x-yaml",
    )
|
||||
|
||||
@staticmethod
|
||||
def _validate_structured_output(
|
||||
response: str,
|
||||
|
||||
@@ -824,7 +824,13 @@ class Task(BaseModel):
|
||||
api: str | None = getattr(self.agent.llm, "api", None)
|
||||
supported_types = get_supported_content_types(provider, api)
|
||||
|
||||
# Text files are always auto-injected (inlined as text), even
|
||||
# when the model does not support multimodal input.
|
||||
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
|
||||
|
||||
def is_auto_injected(content_type: str) -> bool:
    """Tell whether a file of ``content_type`` is injected automatically.

    Text-like types (those starting with an entry of ``text_prefixes``)
    always qualify, since they are inlined as text even when the model
    does not support multimodal input. Any other type qualifies only
    when it starts with one of the provider's ``supported_types``.
    """
    # str.startswith accepts a tuple, so a single call covers every
    # text prefix.
    if content_type.startswith(text_prefixes):
        return True
    for supported in supported_types:
        if content_type.startswith(supported):
            return True
    return False
|
||||
|
||||
auto_injected_files = {
|
||||
|
||||
@@ -338,6 +338,280 @@ class TestBaseLLMMultimodal:
|
||||
assert result == {"type": "text", "text": "Hello"}
|
||||
|
||||
|
||||
class TestTextFileInliningNonMultimodal:
    """Tests for text file inlining on non-multimodal models (issue #5137).

    When a model does not support multimodal input, text files should be
    inlined as plain text in the message content rather than raising a
    ValueError.
    """

    # --- shared helpers (underscore-prefixed, so not collected as tests) ---

    @staticmethod
    def _non_multimodal_llm():
        """Build a minimal concrete BaseLLM stub named "test-model".

        Only ``call`` is overridden; everything else is inherited from
        BaseLLM, so the default multimodal behaviour is what gets tested.
        """
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        return NonMultimodalLLM(model="test-model")

    @staticmethod
    def _user_message(content, files):
        """Return a one-element message list carrying ``files``."""
        return [{"role": "user", "content": content, "files": files}]

    @staticmethod
    def _stub_file(content_type):
        """Return a bare object exposing only a ``content_type`` attribute."""

        class MockFile:
            pass

        MockFile.content_type = content_type
        return MockFile()

    @staticmethod
    def _is_text(file_input):
        """Call BaseLLM._is_text_file, importing BaseLLM lazily."""
        from crewai.llms.base_llm import BaseLLM

        return BaseLLM._is_text_file(file_input)

    # --- BaseLLM (native provider path) ---

    def test_base_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when model is not multimodal (BaseLLM)."""
        llm = self._non_multimodal_llm()
        assert llm.supports_multimodal() is False

        text_content = b"Hello from a text file!"
        messages = self._user_message(
            "Analyse this file",
            {"readme": TextFile(source=text_content)},
        )

        result = llm._process_message_files(messages)

        assert "files" not in result[0]
        assert "Hello from a text file!" in result[0]["content"]
        assert "readme" in result[0]["content"]

    def test_base_multiple_text_files_inlined(self) -> None:
        """Multiple text files are all inlined on non-multimodal model."""
        llm = self._non_multimodal_llm()

        messages = self._user_message(
            "Analyse these files",
            {
                "file1": TextFile(source=b"Content of file 1"),
                "file2": TextFile(source=b"Content of file 2"),
            },
        )

        result = llm._process_message_files(messages)

        assert "files" not in result[0]
        assert "Content of file 1" in result[0]["content"]
        assert "Content of file 2" in result[0]["content"]

    def test_base_image_file_still_rejected_on_non_multimodal(self) -> None:
        """ImageFile still raises ValueError on non-multimodal model."""
        llm = self._non_multimodal_llm()

        messages = self._user_message(
            "Describe this image",
            {"photo": ImageFile(source=MINIMAL_PNG)},
        )

        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_base_mixed_text_and_image_rejects_but_inlines_text(self) -> None:
        """Mixed text+image: text is inlined, but error is raised for image."""
        llm = self._non_multimodal_llm()

        messages = self._user_message(
            "Process these",
            {
                "readme": TextFile(source=b"Some text content"),
                "photo": ImageFile(source=MINIMAL_PNG),
            },
        )

        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

        # The message dict is mutated in place, so the inlined text must
        # already be present even though the call raised.
        assert "Some text content" in messages[0]["content"]

    def test_base_no_files_no_error(self) -> None:
        """Messages without files pass through unchanged."""
        llm = self._non_multimodal_llm()

        messages = [
            {"role": "user", "content": "No files here"},
        ]

        result = llm._process_message_files(messages)
        assert result[0]["content"] == "No files here"

    def test_base_text_file_with_empty_existing_content(self) -> None:
        """TextFile inlined when existing content is empty string."""
        llm = self._non_multimodal_llm()

        messages = self._user_message(
            "",
            {"doc": TextFile(source=b"File content here")},
        )

        result = llm._process_message_files(messages)

        assert "files" not in result[0]
        assert "File content here" in result[0]["content"]
        # Should not start with newlines when existing content is empty
        assert not result[0]["content"].startswith("\n")

    # --- LiteLLM LLM class ---

    def test_litellm_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when litellm model is not multimodal."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False

        messages = self._user_message(
            "Analyse this file",
            {"readme": TextFile(source=b"Hello from litellm test")},
        )

        result = llm._process_message_files(messages)

        assert "files" not in result[0]
        assert "Hello from litellm test" in result[0]["content"]

    def test_litellm_image_file_rejected_on_non_multimodal(self) -> None:
        """ImageFile raises ValueError on non-multimodal litellm model."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False

        messages = self._user_message(
            "Describe this",
            {"photo": ImageFile(source=MINIMAL_PNG)},
        )

        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_litellm_json_file_inlined_on_non_multimodal(self) -> None:
        """JSON file (application/json) is treated as text and inlined."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False

        # NOTE(review): the payload is wrapped in a TextFile, so this
        # presumably passes through the TextFile isinstance branch of
        # _is_text_file rather than the application/json content-type
        # check -- confirm the detected content_type if that path is the
        # one this test is meant to cover.
        json_content = b'{"key": "value"}'
        messages = self._user_message(
            "Parse this JSON",
            {"data": TextFile(source=json_content)},
        )

        result = llm._process_message_files(messages)

        assert "files" not in result[0]
        assert '{"key": "value"}' in result[0]["content"]

    # --- _is_text_file helper ---

    def test_is_text_file_with_text_file_instance(self) -> None:
        """_is_text_file returns True for TextFile instances."""
        assert self._is_text(TextFile(source=b"hello")) is True

    def test_is_text_file_with_image_file_instance(self) -> None:
        """_is_text_file returns False for ImageFile instances."""
        assert self._is_text(ImageFile(source=MINIMAL_PNG)) is False

    def test_is_text_file_with_pdf_file_instance(self) -> None:
        """_is_text_file returns False for PDFFile instances."""
        assert self._is_text(PDFFile(source=MINIMAL_PDF)) is False

    def test_is_text_file_with_text_content_type(self) -> None:
        """_is_text_file returns True for objects with text/* content_type."""
        assert self._is_text(self._stub_file("text/plain")) is True

    def test_is_text_file_with_json_content_type(self) -> None:
        """_is_text_file returns True for application/json content_type."""
        assert self._is_text(self._stub_file("application/json")) is True

    def test_is_text_file_with_xml_content_type(self) -> None:
        """_is_text_file returns True for application/xml content_type."""
        assert self._is_text(self._stub_file("application/xml")) is True

    def test_is_text_file_with_yaml_content_type(self) -> None:
        """_is_text_file returns True for application/x-yaml content_type."""
        assert self._is_text(self._stub_file("application/x-yaml")) is True

    def test_is_text_file_with_image_content_type(self) -> None:
        """_is_text_file returns False for image/* content_type."""
        assert self._is_text(self._stub_file("image/png")) is False
|
||||
|
||||
|
||||
class TestMultipleFilesFormatting:
|
||||
"""Tests for formatting multiple files at once."""
|
||||
|
||||
@@ -372,4 +646,4 @@ class TestMultipleFilesFormatting:
|
||||
|
||||
result = format_multimodal_content({}, llm.model)
|
||||
|
||||
assert result == []
|
||||
assert result == []
|
||||
|
||||
Reference in New Issue
Block a user