fix: allow text files with non-multimodal models (#5137)

TextFiles passed via input_files incorrectly triggered a 'Model does not
support multimodal input' error for non-vision-capable models. Text files
are now inlined as text content in the message instead of being rejected.

Changes:
- base_llm.py: Rewrite _process_message_files to distinguish text files
  from binary files; add _is_text_file helper
- llm.py: Apply same text-file inlining logic in both sync and async
  _process_message_files methods
- crew.py: Recognize text file MIME types as auto-injectable so they
  don't require the read_file tool
- task.py: Same text-file auto-injection logic in prompt method
- tests: Add 17 tests covering text file inlining, image rejection,
  mixed files, _is_text_file helper, and edge cases

Co-Authored-By: João <joao@crewai.com>
This commit is contained in:
Devin AI
2026-03-27 08:41:27 +00:00
parent 9fe0c15549
commit d0793e5ec3
5 changed files with 470 additions and 13 deletions

View File

@@ -1338,7 +1338,13 @@ class Crew(FlowTrackable, BaseModel):
api = getattr(agent.llm, "api", None)
supported_types = get_supported_content_types(provider, api)
# Text files are always auto-injected (inlined as text), even
# when the model does not support multimodal input.
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
def is_auto_injected(content_type: str) -> bool:
if any(content_type.startswith(t) for t in text_prefixes):
return True
return any(content_type.startswith(t) for t in supported_types)
# Only add read_file tool if there are files that need it

View File

@@ -54,7 +54,10 @@ from crewai.utilities.string_utils import sanitize_tool_name
try:
from crewai_files import aformat_multimodal_content, format_multimodal_content
from crewai_files import (
aformat_multimodal_content,
format_multimodal_content,
)
HAS_CREWAI_FILES = True
except ImportError:
@@ -2039,6 +2042,10 @@ class LLM(BaseLLM):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -2049,11 +2056,55 @@ class LLM(BaseLLM):
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages
@@ -2090,6 +2141,10 @@ class LLM(BaseLLM):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -2100,11 +2155,55 @@ class LLM(BaseLLM):
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages

View File

@@ -37,7 +37,7 @@ from crewai.types.usage_metrics import UsageMetrics
try:
from crewai_files import format_multimodal_content
from crewai_files import TextFile, format_multimodal_content
HAS_CREWAI_FILES = True
except ImportError:
@@ -635,6 +635,10 @@ class BaseLLM(ABC):
For each message with a `files` field, formats the files into
provider-specific content blocks and updates the message content.
Text files (TextFile instances or files with text/* / application/json /
application/xml / application/x-yaml content types) are always inlined
as text content, even when the model does not support multimodal input.
Args:
messages: List of messages that may contain file attachments.
@@ -644,12 +648,61 @@ class BaseLLM(ABC):
if not HAS_CREWAI_FILES:
return messages
if not self.supports_multimodal():
if any(msg.get("files") for msg in messages):
is_multimodal = self.supports_multimodal()
if not is_multimodal:
# Inline text files as text; reject non-text files
has_non_text = False
for msg in messages:
files = msg.get("files")
if not files:
continue
text_parts: list[str] = []
non_text_files: dict[str, Any] = {}
for name, file_input in files.items():
if self._is_text_file(file_input):
try:
content = file_input.read_text()
text_parts.append(
f"--- Content of file '{name}' ---\n{content}"
)
except Exception:
# If reading fails, fall back to tool-based access
non_text_files[name] = file_input
else:
non_text_files[name] = file_input
if non_text_files:
has_non_text = True
# Append inlined text content to the message
if text_parts:
existing_content = msg.get("content", "")
inlined = "\n\n".join(text_parts)
if isinstance(existing_content, str):
msg["content"] = (
f"{existing_content}\n\n{inlined}"
if existing_content
else inlined
)
elif isinstance(existing_content, list):
msg["content"] = [
*existing_content,
self.format_text_content(inlined),
]
# Keep only non-text files (for tool-based access)
if non_text_files:
msg["files"] = non_text_files
else:
msg.pop("files", None)
if has_non_text:
raise ValueError(
f"Model '{self.model}' does not support multimodal input, "
"but files were provided via 'input_files'. "
"Use a vision-capable model or remove the file inputs."
"but non-text files were provided via 'input_files'. "
"Use a vision-capable model or remove the non-text file inputs."
)
return messages
@@ -680,6 +733,25 @@ class BaseLLM(ABC):
return messages
@staticmethod
def _is_text_file(file_input: Any) -> bool:
    """Check whether a file input is a text file.

    Returns True for TextFile instances or files whose content_type
    starts with ``text/`` or matches common text-based MIME types
    (application/json, application/xml, application/x-yaml).

    Args:
        file_input: A file object; may expose a ``content_type`` attribute.

    Returns:
        True if the file should be treated as inlineable text content,
        False otherwise (including objects with no usable content_type).
    """
    if HAS_CREWAI_FILES and isinstance(file_input, TextFile):
        return True
    # Normalize to "" when content_type is missing OR explicitly None:
    # getattr's default only covers the missing-attribute case, and
    # None.startswith(...) would raise AttributeError.
    content_type = getattr(file_input, "content_type", None) or ""
    if content_type.startswith("text/"):
        return True
    return content_type in (
        "application/json",
        "application/xml",
        "application/x-yaml",
    )
@staticmethod
def _validate_structured_output(
response: str,

View File

@@ -824,7 +824,13 @@ class Task(BaseModel):
api: str | None = getattr(self.agent.llm, "api", None)
supported_types = get_supported_content_types(provider, api)
# Text files are always auto-injected (inlined as text), even
# when the model does not support multimodal input.
text_prefixes = ("text/", "application/json", "application/xml", "application/x-yaml")
def is_auto_injected(content_type: str) -> bool:
if any(content_type.startswith(t) for t in text_prefixes):
return True
return any(content_type.startswith(t) for t in supported_types)
auto_injected_files = {

View File

@@ -338,6 +338,280 @@ class TestBaseLLMMultimodal:
assert result == {"type": "text", "text": "Hello"}
class TestTextFileInliningNonMultimodal:
    """Tests for text file inlining on non-multimodal models (issue #5137).

    When a model does not support multimodal input, text files should be
    inlined as plain text in the message content rather than raising a
    ValueError.
    """

    # --- BaseLLM (native provider path) ---

    def test_base_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when model is not multimodal (BaseLLM)."""
        from crewai.llms.base_llm import BaseLLM

        # Minimal concrete subclass: BaseLLM is abstract, so `call` is stubbed.
        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        assert llm.supports_multimodal() is False
        text_content = b"Hello from a text file!"
        messages = [
            {
                "role": "user",
                "content": "Analyse this file",
                "files": {"readme": TextFile(source=text_content)},
            }
        ]
        result = llm._process_message_files(messages)
        # The files field is consumed and both the file body and its name
        # end up in the message content.
        assert "files" not in result[0]
        assert "Hello from a text file!" in result[0]["content"]
        assert "readme" in result[0]["content"]

    def test_base_multiple_text_files_inlined(self) -> None:
        """Multiple text files are all inlined on non-multimodal model."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Analyse these files",
                "files": {
                    "file1": TextFile(source=b"Content of file 1"),
                    "file2": TextFile(source=b"Content of file 2"),
                },
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "Content of file 1" in result[0]["content"]
        assert "Content of file 2" in result[0]["content"]

    def test_base_image_file_still_rejected_on_non_multimodal(self) -> None:
        """ImageFile still raises ValueError on non-multimodal model."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Describe this image",
                "files": {"photo": ImageFile(source=MINIMAL_PNG)},
            }
        ]
        # The error message now says "non-text files" rather than "files".
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_base_mixed_text_and_image_rejects_but_inlines_text(self) -> None:
        """Mixed text+image: text is inlined, but error is raised for image."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "Process these",
                "files": {
                    "readme": TextFile(source=b"Some text content"),
                    "photo": ImageFile(source=MINIMAL_PNG),
                },
            }
        ]
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)
        # Text file should have been inlined before the error
        # (messages is mutated in place even though the call raised).
        assert "Some text content" in messages[0]["content"]

    def test_base_no_files_no_error(self) -> None:
        """Messages without files pass through unchanged."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {"role": "user", "content": "No files here"},
        ]
        result = llm._process_message_files(messages)
        assert result[0]["content"] == "No files here"

    def test_base_text_file_with_empty_existing_content(self) -> None:
        """TextFile inlined when existing content is empty string."""
        from crewai.llms.base_llm import BaseLLM

        class NonMultimodalLLM(BaseLLM):
            def call(self, messages, tools=None, callbacks=None):
                return "test"

        llm = NonMultimodalLLM(model="test-model")
        messages = [
            {
                "role": "user",
                "content": "",
                "files": {"doc": TextFile(source=b"File content here")},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "File content here" in result[0]["content"]
        # Should not start with newlines when existing content is empty
        assert not result[0]["content"].startswith("\n")

    # --- LiteLLM LLM class ---

    def test_litellm_text_file_inlined_on_non_multimodal(self) -> None:
        """TextFile content is inlined when litellm model is not multimodal."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        messages = [
            {
                "role": "user",
                "content": "Analyse this file",
                "files": {"readme": TextFile(source=b"Hello from litellm test")},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert "Hello from litellm test" in result[0]["content"]

    def test_litellm_image_file_rejected_on_non_multimodal(self) -> None:
        """ImageFile raises ValueError on non-multimodal litellm model."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        messages = [
            {
                "role": "user",
                "content": "Describe this",
                "files": {"photo": ImageFile(source=MINIMAL_PNG)},
            }
        ]
        with pytest.raises(ValueError, match="non-text files"):
            llm._process_message_files(messages)

    def test_litellm_json_file_inlined_on_non_multimodal(self) -> None:
        """JSON file (application/json) is treated as text and inlined."""
        llm = LLM(model="gpt-3.5-turbo", is_litellm=True)
        assert llm.supports_multimodal() is False
        json_content = b'{"key": "value"}'
        messages = [
            {
                "role": "user",
                "content": "Parse this JSON",
                "files": {"data": TextFile(source=json_content)},
            }
        ]
        result = llm._process_message_files(messages)
        assert "files" not in result[0]
        assert '{"key": "value"}' in result[0]["content"]

    # --- _is_text_file helper ---

    def test_is_text_file_with_text_file_instance(self) -> None:
        """_is_text_file returns True for TextFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(TextFile(source=b"hello")) is True

    def test_is_text_file_with_image_file_instance(self) -> None:
        """_is_text_file returns False for ImageFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(ImageFile(source=MINIMAL_PNG)) is False

    def test_is_text_file_with_pdf_file_instance(self) -> None:
        """_is_text_file returns False for PDFFile instances."""
        from crewai.llms.base_llm import BaseLLM

        assert BaseLLM._is_text_file(PDFFile(source=MINIMAL_PDF)) is False

    def test_is_text_file_with_text_content_type(self) -> None:
        """_is_text_file returns True for objects with text/* content_type."""
        from crewai.llms.base_llm import BaseLLM

        # Duck-typed stand-in: only the content_type attribute matters here.
        class MockFile:
            content_type = "text/plain"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_json_content_type(self) -> None:
        """_is_text_file returns True for application/json content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/json"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_xml_content_type(self) -> None:
        """_is_text_file returns True for application/xml content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/xml"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_yaml_content_type(self) -> None:
        """_is_text_file returns True for application/x-yaml content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "application/x-yaml"

        assert BaseLLM._is_text_file(MockFile()) is True

    def test_is_text_file_with_image_content_type(self) -> None:
        """_is_text_file returns False for image/* content_type."""
        from crewai.llms.base_llm import BaseLLM

        class MockFile:
            content_type = "image/png"

        assert BaseLLM._is_text_file(MockFile()) is False
class TestMultipleFilesFormatting:
"""Tests for formatting multiple files at once."""
@@ -372,4 +646,4 @@ class TestMultipleFilesFormatting:
result = format_multimodal_content({}, llm.model)
assert result == []
assert result == []