supporting image tool

This commit is contained in:
João Moura
2024-12-26 23:24:41 -03:00
parent 93bee87324
commit e61f2f50c9
5 changed files with 634 additions and 13 deletions

View File

@@ -144,9 +144,13 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
formatted_answer
)
# Directly append the result to the messages if the
# tool is "Add image to content" in case of multimodal
# agents
if formatted_answer.tool == "Add image to content":
self.messages.append(tool_result.result)
continue
else:
if self.step_callback:
self.step_callback(tool_result)

View File

@@ -4,31 +4,37 @@ from crewai.tools.base_tool import BaseTool
class AddImageToolSchema(BaseModel):
image_url: str = Field(..., description="The URL or path of the image to add")
action: str = Field(..., description="The context or purpose of why this image is being added and how it should be used")
action: str = Field(
default="Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe.",
description="Optional context or question about the image"
)
class AddImageTool(BaseTool):
"""Tool for adding images to the content"""
name: str = "Add image to content"
description: str = "See image to understand it's content"
description: str = "See image to understand it's content, you can optionally ask a question about the image"
args_schema: type[BaseModel] = AddImageToolSchema
def _run(
self,
image_url: str,
action: str,
action: str = None,
**kwargs,
) -> dict:
action = action or "Please provide a detailed description of this image, including all visual elements, context, and any notable details you can observe."
content = [
{"type": "text", "text": action},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
}
]
return {
"role": "user",
"content": [
{"type": "text", "text": action},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
},
],
"content": content
}

View File

@@ -5,6 +5,7 @@ from difflib import SequenceMatcher
from textwrap import dedent
from typing import Any, List, Union
from crewai.tools.structured_tool import CrewStructuredTool
import crewai.utilities.events as events
from crewai.agents.tools_handler import ToolsHandler
from crewai.task import Task
@@ -103,6 +104,19 @@ class ToolUsage:
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error
if isinstance(tool, CrewStructuredTool) and tool.name == 'Add image to content':
try:
result = self._use(tool_string=tool_string, tool=tool, calling=calling)
return result
except Exception as e:
error = getattr(e, "message", str(e))
self.task.increment_tools_errors()
if self.agent.verbose:
self._printer.print(content=f"\n\n{error}\n", color="red")
return error
return f"{self._use(tool_string=tool_string, tool=tool, calling=calling)}" # type: ignore # BUG?: "_use" of "ToolUsage" does not return a value (it only ever returns None)
def _use(

View File

@@ -0,0 +1,481 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
an expert at visual analysis, trained to notice and describe details in images.\nYour
personal goal is: Analyze images with high attention to detail\nYou ONLY have
access to the following tools, and should NEVER make up tools that are not listed
here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
''Optional context or question about the image'', ''type'': ''str''}}\nTool
Description: See image to understand it''s content, you can optionally ask a
question about the image\n\nUse the following format:\n\nThought: you should
always think about what to do\nAction: the action to take, only one name of
[Add image to content], just the name, exactly as it''s written.\nAction Input:
the input to the action, just a simple python dictionary, enclosed in curly
braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
Answer: the final answer to the original input question"}, {"role": "user",
"content": "\nCurrent Task: \n Analyze the provided image and describe
what you see in detail.\n Focus on main elements, colors, composition,
and any notable details.\n Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n \n\nThis
is the expect criteria for your final answer: A comprehensive description of
the image contents.\nyou MUST return the actual complete content as the final
answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
available and give your best Final Answer, your job depends on it!\n\nThought:"}],
"model": "gpt-4o", "stop": ["\nObservation:"], "stream": false}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1948'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.52.1
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.52.1
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AiuIfzzcje5KdvKIG5CkFeORroiKk\",\n \"object\":
\"chat.completion\",\n \"created\": 1735266213,\n \"model\": \"gpt-4o-2024-08-06\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"Action: Add image to content\\nAction
Input: {\\\"image_url\\\": \\\"https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\\\",
\\\"action\\\": \\\"Analyze the provided image and describe what you see in
detail.\\\"}\",\n \"refusal\": null\n },\n \"logprobs\": null,\n
\ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
417,\n \"completion_tokens\": 103,\n \"total_tokens\": 520,\n \"prompt_tokens_details\":
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"system_fingerprint\":
\"fp_5f20662549\"\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f85d96b280df217-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Fri, 27 Dec 2024 02:23:35 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
path=/; expires=Fri, 27-Dec-24 02:53:35 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '1212'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '30000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '29999539'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_663a2b18099a18361d6b02befc175289
http_version: HTTP/1.1
status_code: 200
- request:
body: !!binary |
Co4LCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS5QoKEgoQY3Jld2FpLnRl
bGVtZXRyeRKjBwoQHmzzumMNXHOgpJ4zCIxJSxII72WnLlLfRyYqDENyZXcgQ3JlYXRlZDABOQjB
gFxt5xQYQYhMiVxt5xQYShoKDmNyZXdhaV92ZXJzaW9uEggKBjAuODYuMEoaCg5weXRob25fdmVy
c2lvbhIICgYzLjExLjdKLgoIY3Jld19rZXkSIgogZTM5NTY3YjUwNTI5MDljYTMzNDA5ODRiODM4
OTgwZWFKMQoHY3Jld19pZBImCiQ4MDA0YTA1NC0zYjNkLTQ4OGEtYTlkNC1kZWQzMDVhMDIxY2FK
HAoMY3Jld19wcm9jZXNzEgwKCnNlcXVlbnRpYWxKEQoLY3Jld19tZW1vcnkSAhAAShoKFGNyZXdf
bnVtYmVyX29mX3Rhc2tzEgIYAUobChVjcmV3X251bWJlcl9vZl9hZ2VudHMSAhgBSs4CCgtjcmV3
X2FnZW50cxK+Agq7Alt7ImtleSI6ICI5ZGM4Y2NlMDMwNDY4MTk2MDQxYjRjMzgwYjYxN2NiMCIs
ICJpZCI6ICJjNTZhZGI2Mi1lMGIwLTQzYzAtYmQ4OC0xYzEwYTNhNmU5NDQiLCAicm9sZSI6ICJJ
bWFnZSBBbmFseXN0IiwgInZlcmJvc2U/IjogdHJ1ZSwgIm1heF9pdGVyIjogMjAsICJtYXhfcnBt
IjogbnVsbCwgImZ1bmN0aW9uX2NhbGxpbmdfbGxtIjogIiIsICJsbG0iOiAiZ3B0LTRvIiwgImRl
bGVnYXRpb25fZW5hYmxlZD8iOiBmYWxzZSwgImFsbG93X2NvZGVfZXhlY3V0aW9uPyI6IGZhbHNl
LCAibWF4X3JldHJ5X2xpbWl0IjogMiwgInRvb2xzX25hbWVzIjogW119XUqCAgoKY3Jld190YXNr
cxLzAQrwAVt7ImtleSI6ICJhOWE3NmNhNjk1N2QwYmZmYTY5ZWFiMjBiNjY0ODIyYiIsICJpZCI6
ICJhNzFiZDllNC0wNzdkLTRmMTQtODg0MS03MGMwZWM4MGZkMmMiLCAiYXN5bmNfZXhlY3V0aW9u
PyI6IGZhbHNlLCAiaHVtYW5faW5wdXQ/IjogZmFsc2UsICJhZ2VudF9yb2xlIjogIkltYWdlIEFu
YWx5c3QiLCAiYWdlbnRfa2V5IjogIjlkYzhjY2UwMzA0NjgxOTYwNDFiNGMzODBiNjE3Y2IwIiwg
InRvb2xzX25hbWVzIjogW119XXoCGAGFAQABAAASjgIKEOZ5pMdq9ep85DrP1Vv8Y8MSCE7ahOkm
2IDHKgxUYXNrIENyZWF0ZWQwATlIg85cbecUGEGQ9M5cbecUGEouCghjcmV3X2tleRIiCiBlMzk1
NjdiNTA1MjkwOWNhMzM0MDk4NGI4Mzg5ODBlYUoxCgdjcmV3X2lkEiYKJDgwMDRhMDU0LTNiM2Qt
NDg4YS1hOWQ0LWRlZDMwNWEwMjFjYUouCgh0YXNrX2tleRIiCiBhOWE3NmNhNjk1N2QwYmZmYTY5
ZWFiMjBiNjY0ODIyYkoxCgd0YXNrX2lkEiYKJGE3MWJkOWU0LTA3N2QtNGYxNC04ODQxLTcwYzBl
YzgwZmQyY3oCGAGFAQABAAASlwEKECyaQQK8JkKLh6S2mWHTeDgSCPWCpr7v9CQZKgpUb29sIFVz
YWdlMAE5MLyst23nFBhBOJy/t23nFBhKGgoOY3Jld2FpX3ZlcnNpb24SCAoGMC44Ni4wSiMKCXRv
b2xfbmFtZRIWChRBZGQgaW1hZ2UgdG8gY29udGVudEoOCghhdHRlbXB0cxICGAF6AhgBhQEAAQAA
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '1425'
Content-Type:
- application/x-protobuf
User-Agent:
- OTel-OTLP-Exporter-Python/1.27.0
method: POST
uri: https://telemetry.crewai.com:4319/v1/traces
response:
body:
string: "\n\0"
headers:
Content-Length:
- '2'
Content-Type:
- application/x-protobuf
Date:
- Fri, 27 Dec 2024 02:23:39 GMT
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
an expert at visual analysis, trained to notice and describe details in images.\nYour
personal goal is: Analyze images with high attention to detail\nYou ONLY have
access to the following tools, and should NEVER make up tools that are not listed
here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
''Optional context or question about the image'', ''type'': ''str''}}\nTool
Description: See image to understand it''s content, you can optionally ask a
question about the image\n\nUse the following format:\n\nThought: you should
always think about what to do\nAction: the action to take, only one name of
[Add image to content], just the name, exactly as it''s written.\nAction Input:
the input to the action, just a simple python dictionary, enclosed in curly
braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
Answer: the final answer to the original input question"}, {"role": "user",
"content": "\nCurrent Task: \n Analyze the provided image and describe
what you see in detail.\n Focus on main elements, colors, composition,
and any notable details.\n Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n \n\nThis
is the expect criteria for your final answer: A comprehensive description of
the image contents.\nyou MUST return the actual complete content as the final
answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
available and give your best Final Answer, your job depends on it!\n\nThought:"},
{"role": "user", "content": [{"type": "text", "text": "Analyze the provided
image and describe what you see in detail."}, {"type": "image_url", "image_url":
{"url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="}}]}],
"model": "gpt-4o", "stop": ["\nObservation:"], "stream": false}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '2279'
content-type:
- application/json
cookie:
- __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
_cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.52.1
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.52.1
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AiuIiqT33ROFMdw1gNmqH9jiw6PfF\",\n \"object\":
\"chat.completion\",\n \"created\": 1735266216,\n \"model\": \"gpt-4o-2024-08-06\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"The image is an aerial view of Lower
Manhattan in New York City. \\n\\nMain Elements:\\n- The One World Trade Center
tower stands prominently, distinguishable by its sleek, tapering structure reaching
into the sky, surrounded by other skyscrapers.\\n- Skyscrapers in varying heights
and architectural styles, fill the densely packed urban landscape.\\n- A waterfront
is visible at the edges, with docks and piers extending into the water.\\n\\nColors:\\n-
The buildings exhibit a mix of colors, predominantly grays, whites, and browns,
against the blues of the sky and water.\\n- There's a section of greenery visible,
likely a park or recreational space, offering a contrast with its vibrant green
hues.\\n\\nComposition:\\n- The angle of the photograph showcases the expanse
of the city, highlighting the density and scale of the buildings.\\n- Water
borders the city on two prominent sides, creating a natural boundary and enhancing
the island's urban island feel.\\n\\nNotable Details:\\n- The image captures
the iconic layout of Manhattan, with the surrounding Hudson River and New York
Harbor visible in the background.\\n- Beyond Lower Manhattan, more of the cityscape
stretches into the distance, illustrating the vastness of New York City.\\n-
The day appears clear and sunny, with shadows casting from the buildings, indicating
time in the morning or late afternoon.\\n\\nOverall, the image is a striking
depiction of the dynamic and bustling environment of New York's Lower Manhattan,
encapsulating its urban character and proximity to the water.\",\n \"refusal\":
null\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
\ }\n ],\n \"usage\": {\n \"prompt_tokens\": 858,\n \"completion_tokens\":
295,\n \"total_tokens\": 1153,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"system_fingerprint\":
\"fp_5f20662549\"\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f85d9741d0cf217-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Fri, 27 Dec 2024 02:23:40 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '5136'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-input-images:
- '50000'
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '30000000'
x-ratelimit-remaining-input-images:
- '49999'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '29998756'
x-ratelimit-reset-input-images:
- 1ms
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 2ms
x-request-id:
- req_57a7430712d4ff4a81f600ffb94d3b6e
http_version: HTTP/1.1
status_code: 200
- request:
body: '{"messages": [{"role": "system", "content": "You are Image Analyst. You''re
an expert at visual analysis, trained to notice and describe details in images.\nYour
personal goal is: Analyze images with high attention to detail\nYou ONLY have
access to the following tools, and should NEVER make up tools that are not listed
here:\n\nTool Name: Add image to content\nTool Arguments: {''image_url'': {''description'':
''The URL or path of the image to add'', ''type'': ''str''}, ''action'': {''description'':
''Optional context or question about the image'', ''type'': ''str''}}\nTool
Description: See image to understand it''s content, you can optionally ask a
question about the image\n\nUse the following format:\n\nThought: you should
always think about what to do\nAction: the action to take, only one name of
[Add image to content], just the name, exactly as it''s written.\nAction Input:
the input to the action, just a simple python dictionary, enclosed in curly
braces, using \" to wrap keys and values.\nObservation: the result of the action\n\nOnce
all necessary information is gathered:\n\nThought: I now know the final answer\nFinal
Answer: the final answer to the original input question"}, {"role": "user",
"content": "\nCurrent Task: \n Analyze the provided image and describe
what you see in detail.\n Focus on main elements, colors, composition,
and any notable details.\n Image: https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k=\n \n\nThis
is the expect criteria for your final answer: A comprehensive description of
the image contents.\nyou MUST return the actual complete content as the final
answer, not a summary.\n\nBegin! This is VERY important to you, use the tools
available and give your best Final Answer, your job depends on it!\n\nThought:"},
{"role": "user", "content": [{"type": "text", "text": "Analyze the provided
image and describe what you see in detail."}, {"type": "image_url", "image_url":
{"url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="}}]},
{"role": "user", "content": "I did it wrong. Invalid Format: I missed the ''Action:''
after ''Thought:''. I will do right next, and don''t use a tool I have already
used.\n\nIf you don''t need to use any more tools, you must give your best complete
final answer, make sure it satisfies the expected criteria, use the EXACT format
below:\n\nThought: I now can give a great answer\nFinal Answer: my best complete
final answer to the task.\n\n"}], "model": "gpt-4o", "stop": ["\nObservation:"],
"stream": false}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '2717'
content-type:
- application/json
cookie:
- __cf_bm=kJ1pw1xjCMSxjHSS8iJC5z_j2PZxl.i387KCpj9xNZU-1735266215-1.0.1.1-Ybg0wVTsrBlpVZmtQyA1ullY8m3v2Ix0N_SYlhr9z7zKfbLeqGZEVL37YSY.dvIiLVY3XPZzMtG8Xwo6UucW6A;
_cfuvid=v_wJZ5m7qCjrnRfks0gT2GAk9yR14BdIDAQiQR7xxI8-1735266215000-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.52.1
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.52.1
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AiuInuYNldaQVo6B1EsEquT1VFMN7\",\n \"object\":
\"chat.completion\",\n \"created\": 1735266221,\n \"model\": \"gpt-4o-2024-08-06\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"Thought: I now can give a great answer\\nFinal
Answer: The image is an aerial view of Lower Manhattan in New York City. The
photograph prominently features the cluster of skyscrapers that characterizes
the area, with One World Trade Center standing out as a particularly tall and
iconic structure. The buildings vary in color, with shades of glassy blue, grey,
and natural stone dominating the skyline. In the bottom part of the image, there
is a green space, likely Battery Park, providing a stark contrast to the dense
urban environment, with trees and pathways visible. The water surrounding Manhattan
is a deep blue, and several piers jut into the harbor. The Hudson River is visible
on the left, and the East River can be seen on the right, framing the island.
The overall composition captures the bustling and vibrant nature of New York\u2019s
financial hub, with bright sunlight illuminating the buildings, casting sharp
shadows and enhancing the depth of the cityscape. The sky is clear, suggesting
a sunny day with good visibility.\",\n \"refusal\": null\n },\n
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
\ \"usage\": {\n \"prompt_tokens\": 952,\n \"completion_tokens\": 203,\n
\ \"total_tokens\": 1155,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"system_fingerprint\":
\"fp_5f20662549\"\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f85d995ad1ef217-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Fri, 27 Dec 2024 02:23:43 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '3108'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-input-images:
- '50000'
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '30000000'
x-ratelimit-remaining-input-images:
- '49999'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '29998656'
x-ratelimit-reset-input-images:
- 1ms
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 2ms
x-request-id:
- req_45f0e3d457a18f973a59074d16f137b6
http_version: HTTP/1.1
status_code: 200
version: 1

View File

@@ -2999,4 +2999,120 @@ def test_multimodal_flag_adds_multimodal_tools():
)
# Verify we have exactly one tool (just the AddImageTool)
assert len(used_tools) == 1, "Should only have the AddImageTool"
assert len(used_tools) == 1, "Should only have the AddImageTool"
@pytest.mark.vcr(filter_headers=["authorization"])
def test_multimodal_agent_image_tool_handling():
"""
Test that multimodal agents properly handle image tools in the CrewAgentExecutor
"""
# Create a multimodal agent
multimodal_agent = Agent(
role="Image Analyst",
goal="Analyze images and provide descriptions",
backstory="You're an expert at analyzing and describing images.",
allow_delegation=False,
multimodal=True,
)
# Create a task that involves image analysis
task = Task(
description="Analyze this image and describe what you see.",
expected_output="A detailed description of the image.",
agent=multimodal_agent,
)
crew = Crew(agents=[multimodal_agent], tasks=[task])
# Mock the image tool response
mock_image_tool_result = {
"role": "user",
"content": [
{"type": "text", "text": "Please analyze this image"},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/test-image.jpg",
},
},
],
}
# Create a mock task output for the final result
mock_task_output = TaskOutput(
description="Mock description",
raw="A detailed analysis of the image",
agent="Image Analyst"
)
with patch.object(Task, 'execute_sync') as mock_execute_sync:
# Set up the mock to return our task output
mock_execute_sync.return_value = mock_task_output
# Execute the crew
crew.kickoff()
# Get the tools that were passed to execute_sync
_, kwargs = mock_execute_sync.call_args
tools = kwargs['tools']
# Verify the AddImageTool is present and properly configured
image_tools = [tool for tool in tools if tool.name == "Add image to content"]
assert len(image_tools) == 1, "Should have exactly one AddImageTool"
# Test the tool's execution
image_tool = image_tools[0]
result = image_tool._run(
image_url="https://example.com/test-image.jpg",
action="Please analyze this image"
)
# Verify the tool returns the expected format
assert result == mock_image_tool_result
assert result["role"] == "user"
assert len(result["content"]) == 2
assert result["content"][0]["type"] == "text"
assert result["content"][1]["type"] == "image_url"
@pytest.mark.vcr(filter_headers=["authorization"])
def test_multimodal_agent_live_image_analysis():
"""
Test that multimodal agents can analyze images through a real API call
"""
# Create a multimodal agent
image_analyst = Agent(
role="Image Analyst",
goal="Analyze images with high attention to detail",
backstory="You're an expert at visual analysis, trained to notice and describe details in images.",
allow_delegation=False,
multimodal=True,
verbose=True,
llm="gpt-4o"
)
# Create a task for image analysis
analyze_image = Task(
description="""
Analyze the provided image and describe what you see in detail.
Focus on main elements, colors, composition, and any notable details.
Image: {image_url}
""",
expected_output="A comprehensive description of the image contents.",
agent=image_analyst
)
# Create and run the crew
crew = Crew(
agents=[image_analyst],
tasks=[analyze_image]
)
# Execute with an image URL
result = crew.kickoff(inputs={
"image_url": "https://media.istockphoto.com/id/946087016/photo/aerial-view-of-lower-manhattan-new-york.jpg?s=612x612&w=0&k=20&c=viLiMRznQ8v5LzKTt_LvtfPFUVl1oiyiemVdSlm29_k="
})
# Verify we got a meaningful response
assert isinstance(result.raw, str)
assert len(result.raw) > 100 # Expecting a detailed analysis
assert "error" not in result.raw.lower() # No error messages in response