Refactor tool preparation and add initial add-image logic

2026-01-08 15:48:29 +00:00 · 2024-12-26 13:30:59 -03:00
parent e6be4ed66d
commit 93bee87324
5 changed files with 133 additions and 34 deletions
--- a/src/crewai/agent.py
+++ b/src/crewai/agent.py
@@ -410,6 +410,10 @@ class Agent(BaseAgent):
        tools = agent_tools.tools()
        return tools

+    def get_multimodal_tools(self):
+        from crewai.tools.agent_tools.add_image_tool import AddImageTool
+        return [AddImageTool()]
+
    def get_code_execution_tools(self):
        try:
            from crewai_tools import CodeInterpreterTool
--- a/src/crewai/crew.py
+++ b/src/crewai/crew.py
@@ -726,13 +726,11 @@ class Crew(BaseModel):

            # Determine which tools to use - task tools take precedence over agent tools
            tools_for_task = task.tools if task.tools else agent_to_use.tools or []
-            # Add delegation tools if agent allows delegation
-            if agent_to_use.allow_delegation:
-                tools_for_task = self._prepare_tools(task, tools_for_task)
-
-            # Add code execution tools if agent allows code execution
-            if agent_to_use.allow_code_execution:
-                tools_for_task += agent_to_use.get_code_execution_tools()
+            tools_for_task = self._prepare_tools(
+                agent_to_use,
+                task,
+                tools_for_task
+            )

            self._log_task_start(task, agent_to_use.role)

@@ -799,18 +797,24 @@ class Crew(BaseModel):
            return skipped_task_output
        return None

-    def _prepare_tools(self, task: Task, tools: List[Tool]):
-        if self.process == Process.hierarchical:
-            if self.manager_agent:
-                tools = self._update_manager_tools(task, tools)
-            else:
-                raise ValueError("Manager agent is required for hierarchical process.")
+    def _prepare_tools(self, agent: BaseAgent, task: Task, tools: List[Tool]):
+        # Add delegation tools if agent allows delegation
+        if agent.allow_delegation:
+            if self.process == Process.hierarchical:
+                if self.manager_agent:
+                    tools = self._update_manager_tools(task, tools)
+                else:
+                    raise ValueError("Manager agent is required for hierarchical process.")

-        elif task.agent and task.agent.allow_delegation:
-            tools = self._add_delegation_tools(task, tools)
+            elif agent and agent.allow_delegation:
+                tools = self._add_delegation_tools(task, tools)

-        if task.agent and task.agent.multimodal:
-            tools = self._add_multimodal_tools(task, tools)
+        # Add code execution tools if agent allows code execution
+        if agent.allow_code_execution:
+            tools = self._add_code_execution_tools(agent, tools)
+
+        if agent and agent.multimodal:
+            tools = self._add_multimodal_tools(agent, tools)

        return tools

@@ -819,10 +823,34 @@ class Crew(BaseModel):
            return self.manager_agent
        return task.agent

-    def _add_multimodal_tools(self, task: Task, tools: List[Tool]):
-        tools.extend(task.agent.get_multimodal_tools())
+    def _merge_tools(self, existing_tools: List[Tool], new_tools: List[Tool]) -> List[Tool]:
+        """Merge new tools into existing tools list, avoiding duplicates by tool name."""
+        if not new_tools:
+            return existing_tools
+
+        # Create mapping of tool names to new tools
+        new_tool_map = {tool.name: tool for tool in new_tools}
+
+        # Remove any existing tools that will be replaced
+        tools = [tool for tool in existing_tools if tool.name not in new_tool_map]
+
+        # Add all new tools
+        tools.extend(new_tools)
+
        return tools

+    def _inject_delegation_tools(self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]):
+        delegation_tools = task_agent.get_delegation_tools(agents)
+        return self._merge_tools(tools, delegation_tools)
+
+    def _add_multimodal_tools(self, agent: BaseAgent, tools: List[Tool]):
+        multimodal_tools = agent.get_multimodal_tools()
+        return self._merge_tools(tools, multimodal_tools)
+
+    def _add_code_execution_tools(self, agent: BaseAgent, tools: List[Tool]):
+        code_tools = agent.get_code_execution_tools()
+        return self._merge_tools(tools, code_tools)
+
    def _add_delegation_tools(self, task: Task, tools: List[Tool]):
        agents_for_delegation = [agent for agent in self.agents if agent != task.agent]
        if len(self.agents) > 1 and len(agents_for_delegation) > 0 and task.agent:
@@ -846,19 +874,6 @@ class Crew(BaseModel):
        # self.manager_agent.tools = tools
        return tools

-    def _inject_delegation_tools(self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]):
-        delegation_tools = task_agent.get_delegation_tools(agents)
-        # Create mapping of tool names to new delegation tools
-        delegation_tool_map = {tool.name: tool for tool in delegation_tools}
-
-        # Remove any existing tools that will be replaced
-        tools = [tool for tool in tools if tool.name not in delegation_tool_map]
-
-        # Add all delegation tools
-        tools.extend(delegation_tools)
-
-        return tools
-
    def _get_context(self, task: Task, task_outputs: List[TaskOutput]):
        context = (
            aggregate_raw_outputs_from_tasks(task.context)
--- a/src/crewai/tools/agent_tools/add_image_tool.py
+++ b/src/crewai/tools/agent_tools/add_image_tool.py
@@ -0,0 +1,34 @@
+from pydantic import BaseModel, Field
+from crewai.tools.base_tool import BaseTool
+
+
+class AddImageToolSchema(BaseModel):
+    image_url: str = Field(..., description="The URL or path of the image to add")
+    action: str = Field(..., description="The context or purpose of why this image is being added and how it should be used")
+
+
+class AddImageTool(BaseTool):
+    """Tool for adding images to the content"""
+
+    name: str = "Add image to content"
+    description: str = "See image to understand it's content"
+    args_schema: type[BaseModel] = AddImageToolSchema
+
+    def _run(
+        self,
+        image_url: str,
+        action: str,
+        **kwargs,
+    ) -> dict:
+        return {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": action},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                },
+            ],
+        }
--- a/src/crewai/tools/agent_tools/delegate_work_tool.py
+++ b/src/crewai/tools/agent_tools/delegate_work_tool.py
@@ -1,7 +1,5 @@
 from typing import Optional
-
 from pydantic import BaseModel, Field
-
 from crewai.tools.agent_tools.base_agent_tools import BaseAgentTool


--- a/tests/crew_test.py
+++ b/tests/crew_test.py
@@ -2952,3 +2952,51 @@ def test_task_tools_preserve_code_execution_tools():

        # Verify the total number of tools (TestTool + CodeInterpreter + 2 delegation tools)
        assert len(used_tools) == 4, "Should have TestTool, CodeInterpreter, and 2 delegation tools"
+
+@pytest.mark.vcr(filter_headers=["authorization"])
+def test_multimodal_flag_adds_multimodal_tools():
+    """
+    Test that an agent with multimodal=True automatically has multimodal tools added to the task execution.
+    """
+    from crewai.tools.agent_tools.add_image_tool import AddImageTool
+
+    # Create an agent that supports multimodal
+    multimodal_agent = Agent(
+        role="Multimodal Analyst",
+        goal="Handle multiple media types (text, images, etc.).",
+        backstory="You're an agent specialized in analyzing text, images, and other media.",
+        allow_delegation=False,
+        multimodal=True,  # crucial for adding the multimodal tool
+    )
+
+    # Create a dummy task
+    task = Task(
+        description="Describe what's in this image and generate relevant metadata.",
+        expected_output="An image description plus any relevant metadata.",
+        agent=multimodal_agent,
+    )
+
+    # Define a crew with the multimodal agent
+    crew = Crew(agents=[multimodal_agent], tasks=[task], process=Process.sequential)
+
+    mock_task_output = TaskOutput(
+        description="Mock description",
+        raw="mocked output",
+        agent="mocked agent"
+    )
+
+    # Mock execute_sync to verify the tools passed at runtime
+    with patch.object(Task, "execute_sync", return_value=mock_task_output) as mock_execute_sync:
+        crew.kickoff()
+
+        # Get the tools that were actually used in execution
+        _, kwargs = mock_execute_sync.call_args
+        used_tools = kwargs["tools"]
+
+        # Check that the multimodal tool was added
+        assert any(isinstance(tool, AddImageTool) for tool in used_tools), (
+            "AddImageTool should be present when agent is multimodal"
+        )
+
+        # Verify we have exactly one tool (just the AddImageTool)
+        assert len(used_tools) == 1, "Should only have the AddImageTool"