feat: Add Vision tool to the CrewAI tool
src/crewai_tools/tools/vision_tool/README.md (new file, 30 lines)
@@ -0,0 +1,30 @@
# Vision Tool

## Description

This tool uses OpenAI's Vision API to extract text from images and describe their contents. When passed to an agent, it analyzes the image, and the agent can use the result to generate a response, a report, or any other output. The URL or path of the image should be passed to the agent.

## Installation

Install the crewai_tools package:

```shell
pip install 'crewai[tools]'
```
## Usage

To use the VisionTool, set your OpenAI API key in the `OPENAI_API_KEY` environment variable.
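For example, the key can be set in-process before the tool is constructed (a minimal sketch; the placeholder value is hypothetical):

```python
import os

# Hypothetical placeholder; substitute your real OpenAI API key.
os.environ["OPENAI_API_KEY"] = "sk-..."
```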
```python
from crewai import Agent
from crewai.project import agent

from crewai_tools import VisionTool

vision_tool = VisionTool()

# Inside a class decorated with @CrewBase:
@agent
def researcher(self) -> Agent:
    return Agent(
        config=self.agents_config["researcher"],
        allow_delegation=False,
        tools=[vision_tool],
    )
```
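The image location is then supplied at run time through the tool's `image_path_url` argument, which accepts either a web URL or a local file path.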
src/crewai_tools/tools/vision_tool/vision_tool.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import base64
from typing import Type

import requests
from crewai_tools.tools.base_tool import BaseTool
from openai import OpenAI
from pydantic.v1 import BaseModel, Field


class ImagePromptSchema(BaseModel):
    """Input for Vision Tool."""

    image_path_url: str = Field(..., description="The image path or URL.")


class VisionTool(BaseTool):
    name: str = "Vision Tool"
    description: str = (
        "This tool uses OpenAI's Vision API to describe the contents of an image."
    )
    args_schema: Type[BaseModel] = ImagePromptSchema

    def _run_web_hosted_images(self, client: OpenAI, image_path_url: str) -> str:
        # Web-hosted images can be passed to the API by URL directly.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {
                            "type": "image_url",
                            "image_url": {"url": image_path_url},
                        },
                    ],
                }
            ],
            max_tokens=300,
        )

        return response.choices[0].message.content

    def _run_local_images(self, client: OpenAI, image_path_url: str) -> str:
        # Local images must be base64-encoded and sent as a data URL.
        base64_image = self._encode_image(image_path_url)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {client.api_key}",
        }

        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 300,
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
        )

        return response.json()["choices"][0]["message"]["content"]

    def _run(self, **kwargs) -> str:
        client = OpenAI()

        image_path_url = kwargs.get("image_path_url")

        if not image_path_url:
            return "Image Path or URL is required."

        # Route web URLs to the SDK path and everything else to the base64
        # upload path. Checking the scheme prefix avoids misrouting local
        # paths that merely contain the substring "http".
        if image_path_url.startswith(("http://", "https://")):
            image_description = self._run_web_hosted_images(client, image_path_url)
        else:
            image_description = self._run_local_images(client, image_path_url)

        return image_description

    def _encode_image(self, image_path: str) -> str:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
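Outside a crew, the tool can also be exercised directly for a quick smoke test (a minimal sketch; assumes `OPENAI_API_KEY` is set, and the example URL is a hypothetical placeholder for a real, reachable image):

```python
from crewai_tools import VisionTool

vision_tool = VisionTool()

# run() forwards keyword arguments to _run(); a URL takes the SDK branch,
# while a local path would take the base64 upload branch.
result = vision_tool.run(image_path_url="https://example.com/photo.jpg")
print(result)
```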