From 3d667a75ddcdef1349b08379b36ad29b787c9055 Mon Sep 17 00:00:00 2001
From: Alex
Date: Tue, 31 Mar 2026 02:07:22 -0700
Subject: [PATCH] fix: add GPT-5, o3, o4-mini to multimodal vision prefixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added verified vision-capable models:
- gpt-5 (entire GPT-5 family; confirmed multimodal via openai.com)
- o3, o3-pro (full multimodal; openai.com/index/thinking-with-images)
- o4-mini, o4 (full multimodal)

Added a text-only exclusion list to prevent false positives:
- o3-mini (text-only; superseded by o4-mini)
- o1-mini (text-only)
- o1-preview (text-only)

Existing prefixes are unchanged (Claude 3+, Gemini, GPT-4).
---
 lib/crewai/src/crewai/llm.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/lib/crewai/src/crewai/llm.py b/lib/crewai/src/crewai/llm.py
index 352bec16d..1355dbf59 100644
--- a/lib/crewai/src/crewai/llm.py
+++ b/lib/crewai/src/crewai/llm.py
@@ -2521,18 +2521,37 @@ class LLM(BaseLLM):
             True if the model likely supports images.
         """
         vision_prefixes = (
+            # OpenAI - GPT-4 vision models
             "gpt-4o",
             "gpt-4-turbo",
             "gpt-4-vision",
             "gpt-4.1",
+            # OpenAI - GPT-5 family (all multimodal, see openai.com/index/introducing-gpt-5)
+            "gpt-5",
+            # Anthropic - all Claude 3+ models support vision (platform.claude.com/docs/en/build-with-claude/vision)
             "claude-3",
             "claude-4",
             "claude-sonnet-4",
             "claude-opus-4",
             "claude-haiku-4",
+            # Google - all Gemini models are multimodal
             "gemini",
+            # OpenAI - o-series reasoning models with vision
+            # o3 and o4-mini are multimodal (openai.com/index/thinking-with-images)
+            # o3-mini is text-only and is handled via the exclusion list below
+            "o3",
+            "o4-mini",
+            "o4",
         )
+        # Text-only models that would otherwise match a vision prefix
+        text_only_models = ("o3-mini", "o1-mini", "o1-preview")
+
         model_lower = self.model.lower()
+
+        # Check exclusions first: "o3-mini" starts with the "o3" vision prefix
+        if any(model_lower.startswith(m) or f"/{m}" in model_lower for m in text_only_models):
+            return False
+
         return any(
             model_lower.startswith(p) or f"/{p}" in model_lower for p in vision_prefixes
         )
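
Reviewer note: a minimal, self-contained sketch of why the exclusion check
must run before the prefix check ("o3-mini" starts with the "o3" prefix).
The helper name supports_vision and the trimmed tuples are illustrative
only, not the actual LLM method or the full prefix list:

    # Sketch only: mirrors the matching order added in this patch.
    TEXT_ONLY = ("o3-mini", "o1-mini", "o1-preview")
    VISION_PREFIXES = ("gpt-4o", "gpt-5", "o3", "o4-mini", "o4")

    def supports_vision(model: str) -> bool:
        m = model.lower()
        # Exclusions first: a pure prefix scan would classify "o3-mini"
        # as vision-capable because it starts with "o3".
        if any(m.startswith(t) or f"/{t}" in m for t in TEXT_ONLY):
            return False
        return any(m.startswith(p) or f"/{p}" in m for p in VISION_PREFIXES)

    assert supports_vision("o3") is True
    assert supports_vision("o3-mini") is False         # excluded despite the "o3" prefix
    assert supports_vision("openai/o3-mini") is False  # provider-qualified form
    assert supports_vision("gpt-5-mini") is True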