supporting image tool

2026-01-11 00:58:30 +00:00 · 2024-12-26 23:24:41 -03:00
parent 93bee87324
commit e61f2f50c9
5 changed files with 634 additions and 13 deletions
--- a/src/crewai/agents/crew_agent_executor.py
+++ b/src/crewai/agents/crew_agent_executor.py
@@ -144,9 +144,13 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
                            formatted_answer
                        )

+                        # For multimodal agents: when the tool is "Add image to
+                        # content", append its result directly to the messages
+                        # and skip the regular tool-result handling below.
                        if formatted_answer.tool == "Add image to content":
                            self.messages.append(tool_result.result)
                            continue
+
                        else:
                            if self.step_callback:
                                self.step_callback(tool_result)
--- a/src/crewai/tools/agent_tools/add_image_tool.py
+++ b/src/crewai/tools/agent_tools/add_image_tool.py
@@ -4,31 +4,37 @@ from crewai.tools.base_tool import BaseTool

 class AddImageToolSchema(BaseModel):
    image_url: str = Field(..., description="The URL or path of the image to add")
-    action: str = Field(..., description="The context or purpose of why this image is being added and how it should be used")
+    action: str = Field(
+        default="Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe.",
+        description="Optional context or question about the image"
+    )


 class AddImageTool(BaseTool):
    """Tool for adding images to the content"""

    name: str = "Add image to content"
-    description: str = "See image to understand it's content"
+    description: str = "See image to understand it's content, you can optionally ask a question about the image"
    args_schema: type[BaseModel] = AddImageToolSchema

    def _run(
        self,
        image_url: str,
-        action: str,
+        action: str = None,
        **kwargs,
    ) -> dict:
+        action = action or "Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe."
+        content = [
+            {"type": "text", "text": action},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url,
+                },
+            }
+        ]
+
        return {
            "role": "user",
-            "content": [
-                {"type": "text", "text": action},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url,
-                    },
-                },
-            ],
+            "content": content
        }
--- a/src/crewai/tools/tool_usage.py
+++ b/src/crewai/tools/tool_usage.py
@@ -5,6 +5,7 @@ from difflib import SequenceMatcher
 from textwrap import dedent
 from typing import Any, List, Union

+from crewai.tools.structured_tool import CrewStructuredTool
 import crewai.utilities.events as events
 from crewai.agents.tools_handler import ToolsHandler
 from crewai.task import Task
@@ -103,6 +104,19 @@ class ToolUsage:
            if self.agent.verbose:
                self._printer.print(content=f"\n\n{error}\n", color="red")
            return error
+
+        if isinstance(tool, CrewStructuredTool) and tool.name == 'Add image to content':
+            try:
+                result = self._use(tool_string=tool_string, tool=tool, calling=calling)
+                return result
+
+            except Exception as e:
+                error = getattr(e, "message", str(e))
+                self.task.increment_tools_errors()
+                if self.agent.verbose:
+                    self._printer.print(content=f"\n\n{error}\n", color="red")
+                return error
+
        return f"{self._use(tool_string=tool_string, tool=tool, calling=calling)}"  # type: ignore # BUG?: "_use" of "ToolUsage" does not return a value (it only ever returns None)

    def _use(