fix: allow text files with non-multimodal models (#5137)

TextFiles passed via input_files incorrectly triggered a 'Model does not
support multimodal input' error for non-vision-capable models. Text files
are now inlined as text content in the message instead of being rejected.

Changes:
- base_llm.py: Rewrite _process_message_files to distinguish text files
  from binary files; add _is_text_file helper
- llm.py: Apply same text-file inlining logic in both sync and async
  _process_message_files methods
- crew.py: Recognize text file MIME types as auto-injectable so they
  don't require the read_file tool
- task.py: Same text-file auto-injection logic in prompt method
- tests: Add 17 tests covering text file inlining, image rejection,
  mixed files, _is_text_file helper, and edge cases

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
Devin AI
2026-03-27 08:41:27 +00:00
parent 9fe0c15549
commit d0793e5ec3
5 changed files with 470 additions and 13 deletions

View File

@@ -1338,7 +1338,13 @@ class Crew(FlowTrackable, BaseModel):
api = getattr(agent.llm, "api", None)
supported_types = get_supported_content_types(provider, api)
# Text files are always auto-injected (inlined as text), even
# when the model does not support multimodal input.
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
def is_auto_injected(content_type: str) -> bool:
if any(content_type.startswith(t) for t in text_prefixes):
return True
return any(content_type.startswith(t) for t in supported_types)
# Only add read_file tool if there are files that need it

View File

@@ -54,7 +54,10 @@ from crewai.utilities.string_utils import sanitize_tool_name
try:
from crewai_files import aformat_multimodal_content, format_multimodal_content
from crewai_files import (
aformat_multimodal_content,
format_multimodal_content,
)
HAS_CREWAI_FILES = True
except ImportError:
@@ -2039,6 +2042,10 @@ class LLM(BaseLLM):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -2049,11 +2056,55 @@ class LLM(BaseLLM):
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages
@@ -2090,6 +2141,10 @@ class LLM(BaseLLM):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -2100,11 +2155,55 @@ class LLM(BaseLLM):
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages

View File

@@ -37,7 +37,7 @@ from crewai.types.usage_metrics import UsageMetrics
try:
from crewai_files import format_multimodal_content
from crewai_files import TextFile, format_multimodal_content
HAS_CREWAI_FILES = True
except ImportError:
@@ -635,6 +635,10 @@ class BaseLLM(ABC):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -644,12 +648,61 @@ class BaseLLM(ABC):
if not HAS_CREWAI_FILES:
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
is_multimodal = self.supports_multimodal()
if not is_multimodal:
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
# If reading fails, fall back to tool-based access
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
# Append inlined text content to the message
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
# Keep only non-text files (for tool-based access)
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages
@@ -680,6 +733,25 @@ class BaseLLM(ABC):
return messages
@staticmethod
def _is_text_file(file_input: Any) -> bool:
    """Check whether a file input is a text file.

    Returns True for TextFile instances or files whose content_type
    starts with ``text/`` or matches common text-based MIME types
    (application/json, application/xml, application/x-yaml).

    Args:
        file_input: A file object; may expose a ``content_type`` attribute.

    Returns:
        True if the file should be treated as inlineable text content,
        False otherwise (including objects with no usable content_type).
    """
    if HAS_CREWAI_FILES and isinstance(file_input, TextFile):
        return True
    # Normalize to "" when content_type is missing OR explicitly None:
    # getattr's default only covers the missing-attribute case, and
    # None.startswith(...) would raise AttributeError.
    content_type = getattr(file_input, "content_type", None) or ""
    if content_type.startswith("text/"):
        return True
    return content_type in (
        "application/json",
        "application/xml",
        "application/x-yaml",
    )
@staticmethod
def _validate_structured_output(
response: str,

View File

@@ -824,7 +824,13 @@ class Task(BaseModel):
api: str | None = getattr(self.agent.llm, "api", None)
supported_types = get_supported_content_types(provider, api)
# Text files are always auto-injected (inlined as text), even
# when the model does not support multimodal input.
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
def is_auto_injected(content_type: str) -> bool:
if any(content_type.startswith(t) for t in text_prefixes):
return True
return any(content_type.startswith(t) for t in supported_types)
auto_injected_files = {

View File

@@ -338,6 +338,280 @@ class TestBaseLLMMultimodal:
assert result == {"type": "text", "text": "Hello"}
class TestTextFileInliningNonMultimodal:
    """Tests for text file inlining on non-multimodal models (issue #5137).

    When a model does not support multimodal input, text files should be
    inlined as plain text in the message content rather than raising a
    ValueError.
    """

    # --- BaseLLM (native provider path) ---

    def test_base_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when model is not multimodal (BaseLLM)."""
        from crewai.llms.base_llm import BaseLLM

        # Minimal concrete subclass: BaseLLM is abstract, so `call` is stubbed.
        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        assert llm.supports_multimodal() is False
        text_content = b"Hello from a text file!"
        messages = [
            {
                "role": "user",
                "content": "Analyse this file",
                "files": {"readme": TextFile(source=text_content)},
            }
        ]
        result = llm._process_message_files(messages)
        # The files field is consumed and both the file body and its name
        # end up in the message content.
        assert "files" not in result[0]
        assert "Hello from a text file!" in result[0]["content"]
        assert "readme" in result[0]["content"]

    def test_base_multiple_text_files_inlined(self) -> None:
        """Multiple text files are all inlined on non-multimodal model."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Analyse these files",
                "files": {
                    "file1": TextFile(source=b"Content of file 1"),
                    "file2": TextFile(source=b"Content of file 2"),
                },
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "Content of file 1" in result[0]["content"]
        assert "Content of file 2" in result[0]["content"]

    def test_base_image_file_still_rejected_on_non_multimodal(self) -> None:
        """ImageFile still raises ValueError on non-multimodal model."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Describe this image",
                "files": {"photo": ImageFile(source=MINIMAL_PNG)},
            }
        ]
        # The error message now says "non-text files" rather than "files".
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_base_mixed_text_and_image_rejects_but_inlines_text(self) -> None:
        """Mixed text+image: text is inlined, but error is raised for image."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Process these",
                "files": {
                    "readme": TextFile(source=b"Some text content"),
                    "photo": ImageFile(source=MINIMAL_PNG),
                },
            }
        ]
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)
        # Text file should have been inlined before the error
        # (messages is mutated in place even though the call raised).
        assert "Some text content" in messages[0]["content"]

    def test_base_no_files_no_error(self) -> None:
        """Messages without files pass through unchanged."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {"role": "user", "content": "No files here"},
        ]
        result = llm._process_message_files(messages)
        assert result[0]["content"] == "No files here"

    def test_base_text_file_with_empty_existing_content(self) -> None:
        """TextFile inlined when existing content is empty string."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "",
                "files": {"doc": TextFile(source=b"File content here")},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "File content here" in result[0]["content"]
        # Should not start with newlines when existing content is empty
        assert not result[0]["content"].startswith("\n")

    # --- LiteLLM LLM class ---

    def test_litellm_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when litellm model is not multimodal."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        messages = [
            {
                "role": "user",
                "content": "Analyse this file",
                "files": {"readme": TextFile(source=b"Hello from litellm test")},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "Hello from litellm test" in result[0]["content"]

    def test_litellm_image_file_rejected_on_non_multimodal(self) -> None:
        """ImageFile raises ValueError on non-multimodal litellm model."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        messages = [
            {
                "role": "user",
                "content": "Describe this",
                "files": {"photo": ImageFile(source=MINIMAL_PNG)},
            }
        ]
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_litellm_json_file_inlined_on_non_multimodal(self) -> None:
        """JSON file (application/json) is treated as text and inlined."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        json_content = b'{"key": "value"}'
        messages = [
            {
                "role": "user",
                "content": "Parse this JSON",
                "files": {"data": TextFile(source=json_content)},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert '{"key": "value"}' in result[0]["content"]

    # --- _is_text_file helper ---

    def test_is_text_file_with_text_file_instance(self) -> None:
        """_is_text_file returns True for TextFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(TextFile(source=b"hello")) is True

    def test_is_text_file_with_image_file_instance(self) -> None:
        """_is_text_file returns False for ImageFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(ImageFile(source=MINIMAL_PNG)) is False

    def test_is_text_file_with_pdf_file_instance(self) -> None:
        """_is_text_file returns False for PDFFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(PDFFile(source=MINIMAL_PDF)) is False

    def test_is_text_file_with_text_content_type(self) -> None:
        """_is_text_file returns True for objects with text/* content_type."""
        from crewai.llms.base_llm import BaseLLM

        # Duck-typed stand-in: only the content_type attribute matters here.
        class MockFile:
            content_type = "text/plain"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_json_content_type(self) -> None:
        """_is_text_file returns True for application/json content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/json"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_xml_content_type(self) -> None:
        """_is_text_file returns True for application/xml content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/xml"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_yaml_content_type(self) -> None:
        """_is_text_file returns True for application/x-yaml content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/x-yaml"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_image_content_type(self) -> None:
        """_is_text_file returns False for image/* content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "image/png"

        assert BaseLLM._is_text_file(MockFile()) is False
class TestMultipleFilesFormatting:
"""Tests for formatting multiple files at once."""
@@ -372,4 +646,4 @@ class TestMultipleFilesFormatting:
result = format_multimodal_content({}, llm.model)
assert result == []
assert result == []