supporting image tool

2026-07-07 16:09:30 +00:00 · 2024-12-26 23:24:41 -03:00
parent 93bee87324
commit e61f2f50c9
5 changed files with 634 additions and 13 deletions
--- a/src/crewai/agents/crew_agent_executor.py
+++ b/src/crewai/agents/crew_agent_executor.py
@@ -144,9 +144,13 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                            formatted_answer
                        )

+                        # The "Add image to content" tool (used by multimodal
+                        # agents) yields a chat-message dict, so append it to
+                        # the messages as-is and continue with the next iteration.
                        if formatted_answer.tool == "Add image to content":
                            self.messages.append(tool_result.result)
                            continue
+
                        else:
                            if self.step_callback:
                                self.step_callback(tool_result)
--- a/src/crewai/tools/agent_tools/add_image_tool.py
+++ b/src/crewai/tools/agent_tools/add_image_tool.py
@@ -4,31 +4,37 @@ from crewai.tools.base_tool import BaseTool

 class AddImageToolSchema(BaseModel):
    image_url: str = Field(..., description="The URL or path of the image to add")
-    action: str = Field(..., description="The context or purpose of why this image is being added and how it should be used")
+    action: str = Field(
+        default="Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe.",
+        description="Optional context or question about the image"
+    )


 class AddImageTool(BaseTool):
    """Tool for adding images to the content"""

    name: str = "Add image to content"
-    description: str = "See image to understand it's content"
+    description: str = "See image to understand it's content, you can optionally ask a question about the image"
    args_schema: type[BaseModel] = AddImageToolSchema

    def _run(
        self,
        image_url: str,
-        action: str,
+        action: str = None,
        **kwargs,
    ) -> dict:
+        action = action or "Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe."
+        content = [
+            {"type": "text", "text": action},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url,
+                },
+            }
+        ]
+
        return {
            "role": "user",
-            "content": [
-                {"type": "text", "text": action},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
-                    },
-                },
-            ],
+            "content": content
        }
--- a/src/crewai/tools/tool_usage.py
+++ b/src/crewai/tools/tool_usage.py
@@ -5,6 +5,7 @@ from difflib import SequenceMatcher
 from textwrap import dedent
 from typing import Any, List, Union

+from crewai.tools.structured_tool import CrewStructuredTool
 import crewai.utilities.events as events
 from crewai.agents.tools_handler import ToolsHandler
 from crewai.task import Task
@@ -103,6 +104,19 @@ class ToolUsage:
            if self.agent.verbose:
                self._printer.print(content=f"\n\n{error}\n", color="red")
            return error
+
+        if isinstance(tool, CrewStructuredTool) and tool.name == 'Add image to content':
+            try:
+                result = self._use(tool_string=tool_string, tool=tool, calling=calling)
+                return result
+
+            except Exception as e:
+                error = getattr(e, "message", str(e))
+                self.task.increment_tools_errors()
+                if self.agent.verbose:
+                    self._printer.print(content=f"\n\n{error}\n", color="red")
+                return error
+
        return f"{self._use(tool_string=tool_string, tool=tool, calling=calling)}"  # type: ignore # BUG?: "_use" of "ToolUsage" does not return a value (it only ever returns None)

    def _use(
--- a/tests/cassettes/test_multimodal_agent_live_image_analysis.yaml
+++ b/tests/cassettes/test_multimodal_agent_live_image_analysis.yaml
@@ -0,0 +1,481 @@
+interactions:
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
+      an expert at visual analysis, trained to notice and describe details in images.\nYour
+      personal goal is: Analyze images with high attention to detail\nYou ONLY have
+      access to the following tools, and should NEVER make up tools that are not listed
+      here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
+      ''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
+      ''Optional context or question about the image'', ''type'': ''str''}}\nTool
+      Description: See image to understand it''s content, you can optionally ask a
+      question about the image\n\nUse the following format:\n\nThought: you should
+      always think about what to do\nAction: the action to take, only one name of
+      [Add image to content], just the name, exactly as it''s written.\nAction Input:
+      the input to the action, just a simple python dictionary, enclosed in curly
+      braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
+      all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
+      Answer: the final answer to the original input question"}, {"role": "user",
+      "content": "\nCurrent Task: \n        Analyze the provided image and describe
+      what you see in detail.\n        Focus on main elements, colors, composition,
+      and any notable details.\n        Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n        \n\nThis
+      is the expect criteria for your final answer: A comprehensive description of
+      the image contents.\nyou MUST return the actual complete content as the final
+      answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
+      available and give your best Final Answer, your job depends on it!\n\nThought:"}],
+      "model": "gpt-4o", "stop": ["\nObservation:"], "stream": false}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      connection:
+      - keep-alive
+      content-length:
+      - '1948'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.52.1
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.52.1
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.7
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    content: "{\n  \"id\": \"chatcmpl-AiuIfzzcje5KdvKIG5CkFeORroiKk\",\n  \"object\":
+      \"chat.completion\",\n  \"created\": 1735266213,\n  \"model\": \"gpt-4o-2024-08-06\",\n
+      \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+      \"assistant\",\n        \"content\": \"Action: Add image to content\\nAction
+      Input: {\\\"image_url\\\": \\\"https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\\\",
+      \\\"action\\\": \\\"Analyze the provided image and describe what you see in
+      detail.\\\"}\",\n        \"refusal\": null\n      },\n      \"logprobs\": null,\n
+      \     \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\":
+      417,\n    \"completion_tokens\": 103,\n    \"total_tokens\": 520,\n    \"prompt_tokens_details\":
+      {\n      \"cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\":
+      {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+      0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"system_fingerprint\":
+      \"fp_5f20662549\"\n}\n"
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 8f85d96b280df217-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Fri, 27 Dec 2024 02:23:35 GMT
+      Server:
+      - cloudflare
+      Set-Cookie:
+      - __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
+        path=/; expires=Fri, 27-Dec-24 02:53:35 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '1212'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-ratelimit-limit-requests:
+      - '10000'
+      x-ratelimit-limit-tokens:
+      - '30000000'
+      x-ratelimit-remaining-requests:
+      - '9999'
+      x-ratelimit-remaining-tokens:
+      - '29999539'
+      x-ratelimit-reset-requests:
+      - 6ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_663a2b18099a18361d6b02befc175289
+    http_version: HTTP/1.1
+    status_code: 200
+- request:
+    body: !!binary |
+      Co4LCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS5QoKEgoQY3Jld2FpLnRl
+      bGVtZXRyeRKjBwoQHmzzumMNXHOgpJ4zCIxJSxII72WnLlLfRyYqDENyZXcgQ3JlYXRlZDABOQjB
+      gFxt5xQYQYhMiVxt5xQYShoKDmNyZXdhaV92ZXJzaW9uEggKBjAuODYuMEoaCg5weXRob25fdmVy
+      c2lvbhIICgYzLjExLjdKLgoIY3Jld19rZXkSIgogZTM5NTY3YjUwNTI5MDljYTMzNDA5ODRiODM4
+      OTgwZWFKMQoHY3Jld19pZBImCiQ4MDA0YTA1NC0zYjNkLTQ4OGEtYTlkNC1kZWQzMDVhMDIxY2FK
+      HAoMY3Jld19wcm9jZXNzEgwKCnNlcXVlbnRpYWxKEQoLY3Jld19tZW1vcnkSAhAAShoKFGNyZXdf
+      bnVtYmVyX29mX3Rhc2tzEgIYAUobChVjcmV3X251bWJlcl9vZl9hZ2VudHMSAhgBSs4CCgtjcmV3
+      X2FnZW50cxK+Agq7Alt7ImtleSI6ICI5ZGM4Y2NlMDMwNDY4MTk2MDQxYjRjMzgwYjYxN2NiMCIs
+      ICJpZCI6ICJjNTZhZGI2Mi1lMGIwLTQzYzAtYmQ4OC0xYzEwYTNhNmU5NDQiLCAicm9sZSI6ICJJ
+      bWFnZSBBbmFseXN0IiwgInZlcmJvc2U/IjogdHJ1ZSwgIm1heF9pdGVyIjogMjAsICJtYXhfcnBt
+      IjogbnVsbCwgImZ1bmN0aW9uX2NhbGxpbmdfbGxtIjogIiIsICJsbG0iOiAiZ3B0LTRvIiwgImRl
+      bGVnYXRpb25fZW5hYmxlZD8iOiBmYWxzZSwgImFsbG93X2NvZGVfZXhlY3V0aW9uPyI6IGZhbHNl
+      LCAibWF4X3JldHJ5X2xpbWl0IjogMiwgInRvb2xzX25hbWVzIjogW119XUqCAgoKY3Jld190YXNr
+      cxLzAQrwAVt7ImtleSI6ICJhOWE3NmNhNjk1N2QwYmZmYTY5ZWFiMjBiNjY0ODIyYiIsICJpZCI6
+      ICJhNzFiZDllNC0wNzdkLTRmMTQtODg0MS03MGMwZWM4MGZkMmMiLCAiYXN5bmNfZXhlY3V0aW9u
+      PyI6IGZhbHNlLCAiaHVtYW5faW5wdXQ/IjogZmFsc2UsICJhZ2VudF9yb2xlIjogIkltYWdlIEFu
+      YWx5c3QiLCAiYWdlbnRfa2V5IjogIjlkYzhjY2UwMzA0NjgxOTYwNDFiNGMzODBiNjE3Y2IwIiwg
+      InRvb2xzX25hbWVzIjogW119XXoCGAGFAQABAAASjgIKEOZ5pMdq9ep85DrP1Vv8Y8MSCE7ahOkm
+      2IDHKgxUYXNrIENyZWF0ZWQwATlIg85cbecUGEGQ9M5cbecUGEouCghjcmV3X2tleRIiCiBlMzk1
+      NjdiNTA1MjkwOWNhMzM0MDk4NGI4Mzg5ODBlYUoxCgdjcmV3X2lkEiYKJDgwMDRhMDU0LTNiM2Qt
+      NDg4YS1hOWQ0LWRlZDMwNWEwMjFjYUouCgh0YXNrX2tleRIiCiBhOWE3NmNhNjk1N2QwYmZmYTY5
+      ZWFiMjBiNjY0ODIyYkoxCgd0YXNrX2lkEiYKJGE3MWJkOWU0LTA3N2QtNGYxNC04ODQxLTcwYzBl
+      YzgwZmQyY3oCGAGFAQABAAASlwEKECyaQQK8JkKLh6S2mWHTeDgSCPWCpr7v9CQZKgpUb29sIFVz
+      YWdlMAE5MLyst23nFBhBOJy/t23nFBhKGgoOY3Jld2FpX3ZlcnNpb24SCAoGMC44Ni4wSiMKCXRv
+      b2xfbmFtZRIWChRBZGQgaW1hZ2UgdG8gY29udGVudEoOCghhdHRlbXB0cxICGAF6AhgBhQEAAQAA
+    headers:
+      Accept:
+      - '*/*'
+      Accept-Encoding:
+      - gzip, deflate
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '1425'
+      Content-Type:
+      - application/x-protobuf
+      User-Agent:
+      - OTel-OTLP-Exporter-Python/1.27.0
+    method: POST
+    uri: https://telemetry.crewai.com:4319/v1/traces
+  response:
+    body:
+      string: "\n\0"
+    headers:
+      Content-Length:
+      - '2'
+      Content-Type:
+      - application/x-protobuf
+      Date:
+      - Fri, 27 Dec 2024 02:23:39 GMT
+    status:
+      code: 200
+      message: OK
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
+      an expert at visual analysis, trained to notice and describe details in images.\nYour
+      personal goal is: Analyze images with high attention to detail\nYou ONLY have
+      access to the following tools, and should NEVER make up tools that are not listed
+      here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
+      ''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
+      ''Optional context or question about the image'', ''type'': ''str''}}\nTool
+      Description: See image to understand it''s content, you can optionally ask a
+      question about the image\n\nUse the following format:\n\nThought: you should
+      always think about what to do\nAction: the action to take, only one name of
+      [Add image to content], just the name, exactly as it''s written.\nAction Input:
+      the input to the action, just a simple python dictionary, enclosed in curly
+      braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
+      all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
+      Answer: the final answer to the original input question"}, {"role": "user",
+      "content": "\nCurrent Task: \n        Analyze the provided image and describe
+      what you see in detail.\n        Focus on main elements, colors, composition,
+      and any notable details.\n        Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n        \n\nThis
+      is the expect criteria for your final answer: A comprehensive description of
+      the image contents.\nyou MUST return the actual complete content as the final
+      answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
+      available and give your best Final Answer, your job depends on it!\n\nThought:"},
+      {"role": "user", "content": [{"type": "text", "text": "Analyze the provided
+      image and describe what you see in detail."}, {"type": "image_url", "image_url":
+      {"url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="}}]}],
+      "model": "gpt-4o", "stop": ["\nObservation:"], "stream": false}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      connection:
+      - keep-alive
+      content-length:
+      - '2279'
+      content-type:
+      - application/json
+      cookie:
+      - __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
+        _cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.52.1
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.52.1
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.7
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    content: "{\n  \"id\": \"chatcmpl-AiuIiqT33ROFMdw1gNmqH9jiw6PfF\",\n  \"object\":
+      \"chat.completion\",\n  \"created\": 1735266216,\n  \"model\": \"gpt-4o-2024-08-06\",\n
+      \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+      \"assistant\",\n        \"content\": \"The image is an aerial view of Lower
+      Manhattan in New York City. \\n\\nMain Elements:\\n- The One World Trade Center
+      tower stands prominently, distinguishable by its sleek, tapering structure reaching
+      into the sky, surrounded by other skyscrapers.\\n- Skyscrapers in varying heights
+      and architectural styles, fill the densely packed urban landscape.\\n- A waterfront
+      is visible at the edges, with docks and piers extending into the water.\\n\\nColors:\\n-
+      The buildings exhibit a mix of colors, predominantly grays, whites, and browns,
+      against the blues of the sky and water.\\n- There's a section of greenery visible,
+      likely a park or recreational space, offering a contrast with its vibrant green
+      hues.\\n\\nComposition:\\n- The angle of the photograph showcases the expanse
+      of the city, highlighting the density and scale of the buildings.\\n- Water
+      borders the city on two prominent sides, creating a natural boundary and enhancing
+      the island's urban island feel.\\n\\nNotable Details:\\n- The image captures
+      the iconic layout of Manhattan, with the surrounding Hudson River and New York
+      Harbor visible in the background.\\n- Beyond Lower Manhattan, more of the cityscape
+      stretches into the distance, illustrating the vastness of New York City.\\n-
+      The day appears clear and sunny, with shadows casting from the buildings, indicating
+      time in the morning or late afternoon.\\n\\nOverall, the image is a striking
+      depiction of the dynamic and bustling environment of New York's Lower Manhattan,
+      encapsulating its urban character and proximity to the water.\",\n        \"refusal\":
+      null\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n
+      \   }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 858,\n    \"completion_tokens\":
+      295,\n    \"total_tokens\": 1153,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+      0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\": {\n
+      \     \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+      0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"system_fingerprint\":
+      \"fp_5f20662549\"\n}\n"
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 8f85d9741d0cf217-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Fri, 27 Dec 2024 02:23:40 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '5136'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-ratelimit-limit-input-images:
+      - '50000'
+      x-ratelimit-limit-requests:
+      - '10000'
+      x-ratelimit-limit-tokens:
+      - '30000000'
+      x-ratelimit-remaining-input-images:
+      - '49999'
+      x-ratelimit-remaining-requests:
+      - '9999'
+      x-ratelimit-remaining-tokens:
+      - '29998756'
+      x-ratelimit-reset-input-images:
+      - 1ms
+      x-ratelimit-reset-requests:
+      - 6ms
+      x-ratelimit-reset-tokens:
+      - 2ms
+      x-request-id:
+      - req_57a7430712d4ff4a81f600ffb94d3b6e
+    http_version: HTTP/1.1
+    status_code: 200
+- request:
+    body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
+      an expert at visual analysis, trained to notice and describe details in images.\nYour
+      personal goal is: Analyze images with high attention to detail\nYou ONLY have
+      access to the following tools, and should NEVER make up tools that are not listed
+      here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
+      ''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
+      ''Optional context or question about the image'', ''type'': ''str''}}\nTool
+      Description: See image to understand it''s content, you can optionally ask a
+      question about the image\n\nUse the following format:\n\nThought: you should
+      always think about what to do\nAction: the action to take, only one name of
+      [Add image to content], just the name, exactly as it''s written.\nAction Input:
+      the input to the action, just a simple python dictionary, enclosed in curly
+      braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
+      all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
+      Answer: the final answer to the original input question"}, {"role": "user",
+      "content": "\nCurrent Task: \n        Analyze the provided image and describe
+      what you see in detail.\n        Focus on main elements, colors, composition,
+      and any notable details.\n        Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n        \n\nThis
+      is the expect criteria for your final answer: A comprehensive description of
+      the image contents.\nyou MUST return the actual complete content as the final
+      answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
+      available and give your best Final Answer, your job depends on it!\n\nThought:"},
+      {"role": "user", "content": [{"type": "text", "text": "Analyze the provided
+      image and describe what you see in detail."}, {"type": "image_url", "image_url":
+      {"url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="}}]},
+      {"role": "user", "content": "I did it wrong. Invalid Format: I missed the ''Action:''
+      after ''Thought:''. I will do right next, and don''t use a tool I have already
+      used.\n\nIf you don''t need to use any more tools, you must give your best complete
+      final answer, make sure it satisfies the expected criteria, use the EXACT format
+      below:\n\nThought: I now can give a great answer\nFinal Answer: my best complete
+      final answer to the task.\n\n"}], "model": "gpt-4o", "stop": ["\nObservation:"],
+      "stream": false}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      connection:
+      - keep-alive
+      content-length:
+      - '2717'
+      content-type:
+      - application/json
+      cookie:
+      - __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
+        _cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.52.1
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.52.1
+      x-stainless-raw-response:
+      - 'true'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.11.7
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    content: "{\n  \"id\": \"chatcmpl-AiuInuYNldaQVo6B1EsEquT1VFMN7\",\n  \"object\":
+      \"chat.completion\",\n  \"created\": 1735266221,\n  \"model\": \"gpt-4o-2024-08-06\",\n
+      \ \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\":
+      \"assistant\",\n        \"content\": \"Thought: I now can give a great answer\\nFinal
+      Answer: The image is an aerial view of Lower Manhattan in New York City. The
+      photograph prominently features the cluster of skyscrapers that characterizes
+      the area, with One World Trade Center standing out as a particularly tall and
+      iconic structure. The buildings vary in color, with shades of glassy blue, grey,
+      and natural stone dominating the skyline. In the bottom part of the image, there
+      is a green space, likely Battery Park, providing a stark contrast to the dense
+      urban environment, with trees and pathways visible. The water surrounding Manhattan
+      is a deep blue, and several piers jut into the harbor. The Hudson River is visible
+      on the left, and the East River can be seen on the right, framing the island.
+      The overall composition captures the bustling and vibrant nature of New York\u2019s
+      financial hub, with bright sunlight illuminating the buildings, casting sharp
+      shadows and enhancing the depth of the cityscape. The sky is clear, suggesting
+      a sunny day with good visibility.\",\n        \"refusal\": null\n      },\n
+      \     \"logprobs\": null,\n      \"finish_reason\": \"stop\"\n    }\n  ],\n
+      \ \"usage\": {\n    \"prompt_tokens\": 952,\n    \"completion_tokens\": 203,\n
+      \   \"total_tokens\": 1155,\n    \"prompt_tokens_details\": {\n      \"cached_tokens\":
+      0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\": {\n
+      \     \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\":
+      0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"system_fingerprint\":
+      \"fp_5f20662549\"\n}\n"
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 8f85d995ad1ef217-GRU
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Fri, 27 Dec 2024 02:23:43 GMT
+      Server:
+      - cloudflare
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      openai-organization:
+      - crewai-iuxna1
+      openai-processing-ms:
+      - '3108'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-ratelimit-limit-input-images:
+      - '50000'
+      x-ratelimit-limit-requests:
+      - '10000'
+      x-ratelimit-limit-tokens:
+      - '30000000'
+      x-ratelimit-remaining-input-images:
+      - '49999'
+      x-ratelimit-remaining-requests:
+      - '9999'
+      x-ratelimit-remaining-tokens:
+      - '29998656'
+      x-ratelimit-reset-input-images:
+      - 1ms
+      x-ratelimit-reset-requests:
+      - 6ms
+      x-ratelimit-reset-tokens:
+      - 2ms
+      x-request-id:
+      - req_45f0e3d457a18f973a59074d16f137b6
+    http_version: HTTP/1.1
+    status_code: 200
+version: 1
--- a/tests/crew_test.py
+++ b/tests/crew_test.py
@@ -2999,4 +2999,120 @@ def test_multimodal_flag_adds_multimodal_tools():
        )

        # Verify we have exactly one tool (just the AddImageTool)
-        assert len(used_tools) == 1, "Should only have the AddImageTool"
+        assert len(used_tools) == 1, "Should only have the AddImageTool"
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_multimodal_agent_image_tool_handling():
+    """
+    Test that a multimodal agent is handed the AddImageTool and that the tool returns a chat-message payload
+    """
+    # Create a multimodal agent
+    multimodal_agent = Agent(
+        role="Image Analyst",
+        goal="Analyze images and provide descriptions",
+        backstory="You're an expert at analyzing and describing images.",
+        allow_delegation=False,
+        multimodal=True,
+    )
+
+    # Create a task that involves image analysis
+    task = Task(
+        description="Analyze this image and describe what you see.",
+        expected_output="A detailed description of the image.",
+        agent=multimodal_agent,
+    )
+
+    crew = Crew(agents=[multimodal_agent], tasks=[task])
+
+    # Expected payload from the image tool (compared against the real _run output below)
+    mock_image_tool_result = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Please analyze this image"},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/test-image.jpg",
+                },
+            },
+        ],
+    }
+
+    # Create a mock task output for the final result
+    mock_task_output = TaskOutput(
+        description="Mock description",
+        raw="A detailed analysis of the image",
+        agent="Image Analyst"
+    )
+
+    with patch.object(Task, 'execute_sync') as mock_execute_sync:
+        # Set up the mock to return our task output
+        mock_execute_sync.return_value = mock_task_output
+
+        # Execute the crew
+        crew.kickoff()
+
+        # Get the tools that were passed to execute_sync
+        _, kwargs = mock_execute_sync.call_args
+        tools = kwargs['tools']
+
+        # Verify the AddImageTool is present and properly configured
+        image_tools = [tool for tool in tools if tool.name == "Add image to content"]
+        assert len(image_tools) == 1, "Should have exactly one AddImageTool"
+
+        # Test the tool's execution
+        image_tool = image_tools[0]
+        result = image_tool._run(
+            image_url="https://example.com/test-image.jpg",
+            action="Please analyze this image"
+        )
+
+        # Verify the tool returns the expected format
+        assert result == mock_image_tool_result
+        assert result["role"] == "user"
+        assert len(result["content"]) == 2
+        assert result["content"][0]["type"] == "text"
+        assert result["content"][1]["type"] == "image_url"
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_multimodal_agent_live_image_analysis():
+    """
+    Test that multimodal agents can analyze images end to end (LLM calls are recorded/replayed by VCR)
+    """
+    # Create a multimodal agent
+    image_analyst = Agent(
+        role="Image Analyst",
+        goal="Analyze images with high attention to detail",
+        backstory="You're an expert at visual analysis, trained to notice and describe details in images.",
+        allow_delegation=False,
+        multimodal=True,
+        verbose=True,
+        llm="gpt-4o"
+    )
+
+    # Create a task for image analysis
+    analyze_image = Task(
+        description="""
+        Analyze the provided image and describe what you see in detail.
+        Focus on main elements, colors, composition, and any notable details.
+        Image: {image_url}
+        """,
+        expected_output="A comprehensive description of the image contents.",
+        agent=image_analyst
+    )
+
+    # Create and run the crew
+    crew = Crew(
+        agents=[image_analyst],
+        tasks=[analyze_image]
+    )
+
+    # Execute with an image URL
+    result = crew.kickoff(inputs={
+        "image_url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="
+    })
+
+    # Verify we got a meaningful response
+    assert isinstance(result.raw, str)
+    assert len(result.raw) > 100  # Expecting a detailed analysis
+    assert "error" not in result.raw.lower()  # No error messages in response