fix: add GPT-5 and o-series to multimodal vision prefixes (#5183)

* fix: add GPT-5, o3, o4-mini to multimodal vision prefixes

Added verified vision-capable models:
- gpt-5 (all GPT-5 family — confirmed multimodal via openai.com)
- o3, o3-pro (full multimodal — openai.com/index/thinking-with-images)
- o4-mini, o4 (full multimodal)

Added text-only exclusion list to prevent false positives:
- o3-mini (text-only, replaced by o4-mini)
- o1-mini (text-only)
- o1-preview (text-only)

Existing prefixes unchanged (Claude 3+, Gemini, GPT-4).

* fix: add o1 to vision prefixes + ruff format

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: guard _sync_executor access in test utils for lazy-init event bus

* fix: expand vision model coverage — Claude 5, Grok, Pixtral, Qwen VL, LLaVA

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* ci: retrigger — flaky test_hierarchical_verbose_false_manager_agent (ConnectionError)

* fix: remove hallucinated claude-5 models from vision prefixes — verified against official docs

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: João Moura <joaomdmoura@gmail.com>
alex-clawd
2026-04-01 14:08:37 -07:00
committed by GitHub
parent f10d320ddb
commit c14abf1758
2 changed files with 40 additions and 5 deletions


@@ -2513,18 +2513,51 @@ class LLM(BaseLLM):
True if the model likely supports images.
"""
vision_prefixes = (
# OpenAI — GPT-4 vision models
"gpt-4o",
"gpt-4-turbo",
"gpt-4-vision",
"gpt-4.1",
# OpenAI — GPT-5 family (all variants support multimodal)
"gpt-5",
# OpenAI — o-series reasoning models with vision
# o1, o3, o4, o4-mini support multimodal
# o1-mini, o1-preview, o3-mini are text-only — handled via exclusion below
"o1",
"o3",
"o4-mini",
"o4",
# Anthropic — Claude 3+ models support vision
"claude-3",
"claude-4",
"claude-sonnet-4",
"claude-opus-4",
"claude-haiku-4",
# Google — all Gemini models support multimodal
"gemini",
# xAI — Grok models support vision
"grok",
# Mistral — Pixtral vision model
"pixtral",
# Open-source vision models
"llava",
# Alibaba — Qwen vision-language models
"qwen-vl",
"qwen2-vl",
"qwen3-vl",
)
# Text-only models that would otherwise match vision prefixes
text_only_models = ("o3-mini", "o1-mini", "o1-preview")
model_lower = self.model.lower()
# Check exclusion first
if any(
model_lower.startswith(m) or f"/{m}" in model_lower
for m in text_only_models
):
return False
return any(
model_lower.startswith(p) or f"/{p}" in model_lower for p in vision_prefixes
)
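The prefix check above can be read as a standalone sketch. The condensed list below is illustrative, not the full tuple from the diff; `supports_vision` is a hypothetical name for the method body shown. The key ordering detail is that the text-only exclusion must run before the prefix match, because `"o3-mini"` would otherwise match the `"o3"` prefix, and the `f"/{p}"` check catches provider-prefixed ids such as `"openai/gpt-4o"`:

```python
# Condensed sketch of the vision check above (list abbreviated, name hypothetical).
VISION_PREFIXES = ("gpt-4o", "gpt-5", "o1", "o3", "o4", "claude-3", "gemini", "grok", "pixtral", "llava", "qwen-vl")
TEXT_ONLY_MODELS = ("o3-mini", "o1-mini", "o1-preview")


def supports_vision(model: str) -> bool:
    m = model.lower()
    # Exclusion first: "o3-mini" starts with "o3" and would otherwise match.
    if any(m.startswith(t) or f"/{t}" in m for t in TEXT_ONLY_MODELS):
        return False
    # The "/{p}" variant matches provider-routed ids like "openai/o3".
    return any(m.startswith(p) or f"/{p}" in m for p in VISION_PREFIXES)


assert supports_vision("openai/o3")      # provider-prefixed, vision-capable
assert not supports_vision("o3-mini")    # text-only, despite the "o3" prefix
```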


@@ -32,8 +32,10 @@ def wait_for_event_handlers(timeout: float = 5.0) -> None:
     except Exception:  # noqa: S110
         pass
-    crewai_event_bus._sync_executor.shutdown(wait=True)
-    crewai_event_bus._sync_executor = ThreadPoolExecutor(
-        max_workers=10,
-        thread_name_prefix="CrewAISyncHandler",
-    )
+    # Guard against lazy-initialized executor (may not exist if no events were emitted)
+    if getattr(crewai_event_bus, "_executor_initialized", False):
+        crewai_event_bus._sync_executor.shutdown(wait=True)
+        crewai_event_bus._sync_executor = ThreadPoolExecutor(
+            max_workers=10,
+            thread_name_prefix="CrewAISyncHandler",
+        )
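The guard pattern in this hunk can be sketched in isolation. The class below is a hypothetical stand-in for the real event bus (only the `_executor_initialized` flag and `_sync_executor` attribute are taken from the diff); it shows why the unconditional `shutdown` call fails when the executor is lazily created:

```python
# Minimal sketch of the lazy-init guard above (class and helper are hypothetical).
from concurrent.futures import ThreadPoolExecutor


class EventBus:
    def __init__(self) -> None:
        # The executor is created only when the first event is emitted.
        self._executor_initialized = False

    def _ensure_executor(self) -> None:
        if not self._executor_initialized:
            self._sync_executor = ThreadPoolExecutor(max_workers=10)
            self._executor_initialized = True


def reset_executor(bus: EventBus) -> None:
    # Touching bus._sync_executor unconditionally raises AttributeError
    # when no event was ever emitted; the getattr guard avoids that.
    if getattr(bus, "_executor_initialized", False):
        bus._sync_executor.shutdown(wait=True)
        bus._sync_executor = ThreadPoolExecutor(max_workers=10)


bus = EventBus()
reset_executor(bus)  # no-op on a bus that never emitted events
```

`getattr` with a `False` default keeps the helper safe against both a missing flag attribute and a flag that was never set, which is why the test util uses it rather than `hasattr` on `_sync_executor` directly.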