Adding Multimodal Abilities to Crew (#1805)

* initial fix on delegation tools

* fixing tests for delegations and coding

* Refactor prepare tool and adding initial add images logic

* supporting image tool

* fixing linter

* fix linter

* Making sure multimodal feature support i18n

* fix linter and types

* mixxing translations

* fix types and linter

* Revert "fixing linter"

This reverts commit ef323e3487e62ee4f5bce7f86378068a5ac77e16.

* fix linters

* test

* fix

* fix

* fix linter

* fix

* ignore

* type improvements
This commit is contained in:
João Moura
2024-12-27 17:03:35 -03:00
committed by GitHub
parent 1aa6cfdf2d
commit 0006fdb655
15 changed files with 2992 additions and 546 deletions

View File

@@ -17,6 +17,7 @@ from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.task import Task
from crewai.tools import BaseTool
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools.base_tool import Tool
from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.converter import generate_model_description
@@ -114,6 +115,10 @@ class Agent(BaseAgent):
default=2,
description="Maximum number of retries for an agent to execute a task when an error occurs.",
)
multimodal: bool = Field(
default=False,
description="Whether the agent is multimodal.",
)
code_execution_mode: Literal["safe", "unsafe"] = Field(
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
@@ -406,6 +411,10 @@ class Agent(BaseAgent):
tools = agent_tools.tools()
return tools
def get_multimodal_tools(self) -> List[Tool]:
from crewai.tools.agent_tools.add_image_tool import AddImageTool
return [AddImageTool()]
def get_code_execution_tools(self):
try:
from crewai_tools import CodeInterpreterTool

View File

@@ -143,10 +143,20 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
tool_result = self._execute_tool_and_check_finality(
formatted_answer
)
if self.step_callback:
self.step_callback(tool_result)
formatted_answer.text += f"\nObservation: {tool_result.result}"
# Directly append the result to the messages if the
# tool is "Add image to content" in case of multimodal
# agents
if formatted_answer.tool == self._i18n.tools("add_image")["name"]:
self.messages.append(tool_result.result)
continue
else:
if self.step_callback:
self.step_callback(tool_result)
formatted_answer.text += f"\nObservation: {tool_result.result}"
formatted_answer.result = tool_result.result
if tool_result.result_as_answer:
return AgentFinish(

View File

@@ -35,6 +35,7 @@ from crewai.tasks.conditional_task import ConditionalTask
from crewai.tasks.task_output import TaskOutput
from crewai.telemetry import Telemetry
from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.tools.base_tool import Tool
from crewai.types.usage_metrics import UsageMetrics
from crewai.utilities import I18N, FileHandler, Logger, RPMController
from crewai.utilities.constants import TRAINING_DATA_FILE
@@ -533,9 +534,6 @@ class Crew(BaseModel):
if not agent.function_calling_llm: # type: ignore # "BaseAgent" has no attribute "function_calling_llm"
agent.function_calling_llm = self.function_calling_llm # type: ignore # "BaseAgent" has no attribute "function_calling_llm"
if agent.allow_code_execution: # type: ignore # BaseAgent" has no attribute "allow_code_execution"
agent.tools += agent.get_code_execution_tools() # type: ignore # "BaseAgent" has no attribute "get_code_execution_tools"; maybe "get_delegation_tools"?
if not agent.step_callback: # type: ignore # "BaseAgent" has no attribute "step_callback"
agent.step_callback = self.step_callback # type: ignore # "BaseAgent" has no attribute "step_callback"
@@ -672,7 +670,6 @@ class Crew(BaseModel):
)
manager.tools = []
raise Exception("Manager agent should not have tools")
manager.tools = self.manager_agent.get_delegation_tools(self.agents)
else:
self.manager_llm = (
getattr(self.manager_llm, "model_name", None)
@@ -684,6 +681,7 @@ class Crew(BaseModel):
goal=i18n.retrieve("hierarchical_manager_agent", "goal"),
backstory=i18n.retrieve("hierarchical_manager_agent", "backstory"),
tools=AgentTools(agents=self.agents).tools(),
allow_delegation=True,
llm=self.manager_llm,
verbose=self.verbose,
)
@@ -726,7 +724,14 @@ class Crew(BaseModel):
f"No agent available for task: {task.description}. Ensure that either the task has an assigned agent or a manager agent is provided."
)
self._prepare_agent_tools(task)
# Determine which tools to use - task tools take precedence over agent tools
tools_for_task = task.tools or agent_to_use.tools or []
tools_for_task = self._prepare_tools(
agent_to_use,
task,
tools_for_task
)
self._log_task_start(task, agent_to_use.role)
if isinstance(task, ConditionalTask):
@@ -743,7 +748,7 @@ class Crew(BaseModel):
future = task.execute_async(
agent=agent_to_use,
context=context,
tools=agent_to_use.tools,
tools=tools_for_task,
)
futures.append((task, future, task_index))
else:
@@ -755,7 +760,7 @@ class Crew(BaseModel):
task_output = task.execute_sync(
agent=agent_to_use,
context=context,
tools=agent_to_use.tools,
tools=tools_for_task,
)
task_outputs = [task_output]
self._process_task_result(task, task_output)
@@ -792,45 +797,67 @@ class Crew(BaseModel):
return skipped_task_output
return None
def _prepare_agent_tools(self, task: Task):
if self.process == Process.hierarchical:
if self.manager_agent:
self._update_manager_tools(task)
else:
raise ValueError("Manager agent is required for hierarchical process.")
elif task.agent and task.agent.allow_delegation:
self._add_delegation_tools(task)
def _prepare_tools(self, agent: BaseAgent, task: Task, tools: List[Tool]) -> List[Tool]:
# Add delegation tools if agent allows delegation
if agent.allow_delegation:
if self.process == Process.hierarchical:
if self.manager_agent:
tools = self._update_manager_tools(task, tools)
else:
raise ValueError("Manager agent is required for hierarchical process.")
elif agent and agent.allow_delegation:
tools = self._add_delegation_tools(task, tools)
# Add code execution tools if agent allows code execution
if agent.allow_code_execution:
tools = self._add_code_execution_tools(agent, tools)
if agent and agent.multimodal:
tools = self._add_multimodal_tools(agent, tools)
return tools
def _get_agent_to_use(self, task: Task) -> Optional[BaseAgent]:
if self.process == Process.hierarchical:
return self.manager_agent
return task.agent
def _add_delegation_tools(self, task: Task):
def _merge_tools(self, existing_tools: List[Tool], new_tools: List[Tool]) -> List[Tool]:
"""Merge new tools into existing tools list, avoiding duplicates by tool name."""
if not new_tools:
return existing_tools
# Create mapping of tool names to new tools
new_tool_map = {tool.name: tool for tool in new_tools}
# Remove any existing tools that will be replaced
tools = [tool for tool in existing_tools if tool.name not in new_tool_map]
# Add all new tools
tools.extend(new_tools)
return tools
def _inject_delegation_tools(self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]):
delegation_tools = task_agent.get_delegation_tools(agents)
return self._merge_tools(tools, delegation_tools)
def _add_multimodal_tools(self, agent: BaseAgent, tools: List[Tool]):
multimodal_tools = agent.get_multimodal_tools()
return self._merge_tools(tools, multimodal_tools)
def _add_code_execution_tools(self, agent: BaseAgent, tools: List[Tool]):
code_tools = agent.get_code_execution_tools()
return self._merge_tools(tools, code_tools)
def _add_delegation_tools(self, task: Task, tools: List[Tool]):
agents_for_delegation = [agent for agent in self.agents if agent != task.agent]
if len(self.agents) > 1 and len(agents_for_delegation) > 0 and task.agent:
delegation_tools = task.agent.get_delegation_tools(agents_for_delegation)
# Add tools if they are not already in task.tools
for new_tool in delegation_tools:
# Find the index of the tool with the same name
existing_tool_index = next(
(
index
for index, tool in enumerate(task.tools or [])
if tool.name == new_tool.name
),
None,
)
if not task.tools:
task.tools = []
if existing_tool_index is not None:
# Replace the existing tool
task.tools[existing_tool_index] = new_tool
else:
# Add the new tool
task.tools.append(new_tool)
if not tools:
tools = []
tools = self._inject_delegation_tools(tools, task.agent, agents_for_delegation)
return tools
def _log_task_start(self, task: Task, role: str = "None"):
if self.output_log_file:
@@ -838,14 +865,13 @@ class Crew(BaseModel):
task_name=task.name, task=task.description, agent=role, status="started"
)
def _update_manager_tools(self, task: Task):
def _update_manager_tools(self, task: Task, tools: List[Tool]):
if self.manager_agent:
if task.agent:
self.manager_agent.tools = task.agent.get_delegation_tools([task.agent])
tools = self._inject_delegation_tools(tools, task.agent, [task.agent])
else:
self.manager_agent.tools = self.manager_agent.get_delegation_tools(
self.agents
)
tools = self._inject_delegation_tools(tools, self.manager_agent, self.agents)
return tools
def _get_context(self, task: Task, task_outputs: List[TaskOutput]):
context = (

View File

@@ -64,6 +64,8 @@ LLM_CONTEXT_WINDOW_SIZES = {
"llama3-70b-8192": 8192,
"llama3-8b-8192": 8192,
"mixtral-8x7b-32768": 32768,
"llama-3.3-70b-versatile": 128000,
"llama-3.3-70b-instruct": 128000,
}
DEFAULT_CONTEXT_WINDOW_SIZE = 8192

View File

@@ -0,0 +1,45 @@
from typing import Dict, Optional, Union
from pydantic import BaseModel, Field
from crewai.tools.base_tool import BaseTool
from crewai.utilities import I18N
i18n = I18N()
class AddImageToolSchema(BaseModel):
image_url: str = Field(..., description="The URL or path of the image to add")
action: Optional[str] = Field(
default=None,
description="Optional context or question about the image"
)
class AddImageTool(BaseTool):
"""Tool for adding images to the content"""
name: str = Field(default_factory=lambda: i18n.tools("add_image")["name"]) # type: ignore
description: str = Field(default_factory=lambda: i18n.tools("add_image")["description"]) # type: ignore
args_schema: type[BaseModel] = AddImageToolSchema
def _run(
self,
image_url: str,
action: Optional[str] = None,
**kwargs,
) -> dict:
action = action or i18n.tools("add_image")["default_action"] # type: ignore
content = [
{"type": "text", "text": action},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
}
]
return {
"role": "user",
"content": content
}

View File

@@ -20,13 +20,13 @@ class AgentTools:
delegate_tool = DelegateWorkTool(
agents=self.agents,
i18n=self.i18n,
description=self.i18n.tools("delegate_work").format(coworkers=coworkers),
description=self.i18n.tools("delegate_work").format(coworkers=coworkers), # type: ignore
)
ask_tool = AskQuestionTool(
agents=self.agents,
i18n=self.i18n,
description=self.i18n.tools("ask_question").format(coworkers=coworkers),
description=self.i18n.tools("ask_question").format(coworkers=coworkers), # type: ignore
)
return [delegate_tool, ask_tool]

View File

@@ -10,6 +10,7 @@ from crewai.agents.tools_handler import ToolsHandler
from crewai.task import Task
from crewai.telemetry import Telemetry
from crewai.tools import BaseTool
from crewai.tools.structured_tool import CrewStructuredTool
from crewai.tools.tool_calling import InstructorToolCalling, ToolCalling
from crewai.tools.tool_usage_events import ToolUsageError, ToolUsageFinished
from crewai.utilities import I18N, Converter, ConverterError, Printer
@@ -18,8 +19,7 @@ try:
import agentops # type: ignore
except ImportError:
agentops = None
OPENAI_BIGGER_MODELS = ["gpt-4", "gpt-4o", "o1-preview", "o1-mini"]
OPENAI_BIGGER_MODELS = ["gpt-4", "gpt-4o", "o1-preview", "o1-mini", "o1", "o3", "o3-mini"]
class ToolUsageErrorException(Exception):
@@ -103,6 +103,19 @@ class ToolUsage:
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error
if isinstance(tool, CrewStructuredTool) and tool.name == self._i18n.tools("add_image")["name"]: # type: ignore
try:
result = self._use(tool_string=tool_string, tool=tool, calling=calling)
return result
except Exception as e:
error = getattr(e, "message", str(e))
self.task.increment_tools_errors()
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error
return f"{self._use(tool_string=tool_string, tool=tool, calling=calling)}" # type: ignore # BUG?: "_use" of "ToolUsage" does not return a value (it only ever returns None)
def _use(

View File

@@ -37,6 +37,11 @@
},
"tools": {
"delegate_work": "Delegate a specific task to one of the following coworkers: {coworkers}\nThe input to this tool should be the coworker, the task you want them to do, and ALL necessary context to execute the task, they know nothing about the task, so share absolute everything you know, don't reference things but instead explain them.",
"ask_question": "Ask a specific question to one of the following coworkers: {coworkers}\nThe input to this tool should be the coworker, the question you have for them, and ALL necessary context to ask the question properly, they know nothing about the question, so share absolute everything you know, don't reference things but instead explain them."
"ask_question": "Ask a specific question to one of the following coworkers: {coworkers}\nThe input to this tool should be the coworker, the question you have for them, and ALL necessary context to ask the question properly, they know nothing about the question, so share absolute everything you know, don't reference things but instead explain them.",
"add_image": {
"name": "Add image to content",
"description": "See image to understand it's content, you can optionally ask a question about the image",
"default_action": "Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe."
}
}
}

View File

@@ -1,6 +1,6 @@
import json
import os
from typing import Dict, Optional
from typing import Dict, Optional, Union
from pydantic import BaseModel, Field, PrivateAttr, model_validator
@@ -41,8 +41,8 @@ class I18N(BaseModel):
def errors(self, error: str) -> str:
return self.retrieve("errors", error)
def tools(self, error: str) -> str:
return self.retrieve("tools", error)
def tools(self, tool: str) -> Union[str, Dict[str, str]]:
return self.retrieve("tools", tool)
def retrieve(self, kind, key) -> str:
try: