Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-09 08:08:32 +00:00

feat: Add Vision tool to the CrewAI tool
@@ -17,23 +17,25 @@ from .tools import (
     LlamaIndexTool,
     MDXSearchTool,
     MultiOnTool,
+    NL2SQLTool,
     PDFSearchTool,
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
-    ScrapflyScrapeWebsiteTool,
     ScrapeWebsiteTool,
+    ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
     SerperDevTool,
-    SerplyWebSearchTool,
+    SerplyJobSearchTool,
     SerplyNewsSearchTool,
     SerplyScholarSearchTool,
     SerplyWebpageToMarkdownTool,
-    SerplyJobSearchTool,
+    SerplyWebSearchTool,
     TXTSearchTool,
+    VisionTool,
     WebsiteSearchTool,
     XMLSearchTool,
     YoutubeChannelSearchTool,
-    YoutubeVideoSearchTool
+    YoutubeVideoSearchTool,
 )
 from .tools.base_tool import BaseTool, Tool, tool
@@ -8,14 +8,19 @@ from .directory_search_tool.directory_search_tool import DirectorySearchTool
 from .docx_search_tool.docx_search_tool import DOCXSearchTool
 from .exa_tools.exa_search_tool import EXASearchTool
 from .file_read_tool.file_read_tool import FileReadTool
-from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import FirecrawlCrawlWebsiteTool
-from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import FirecrawlScrapeWebsiteTool
+from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
+    FirecrawlCrawlWebsiteTool,
+)
+from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
+    FirecrawlScrapeWebsiteTool,
+)
 from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
 from .github_search_tool.github_search_tool import GithubSearchTool
 from .json_search_tool.json_search_tool import JSONSearchTool
 from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
 from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
 from .multion_tool.multion_tool import MultiOnTool
+from .nl2sql.nl2sql_tool import NL2SQLTool
 from .pdf_search_tool.pdf_search_tool import PDFSearchTool
 from .pg_seach_tool.pg_search_tool import PGSearchTool
 from .rag.rag_tool import RagTool
@@ -23,17 +28,22 @@ from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
-from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
+from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
+    ScrapflyScrapeWebsiteTool,
+)
 from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .serper_dev_tool.serper_dev_tool import SerperDevTool
-from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
+from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
 from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
 from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool
+from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
 from .serply_api_tool.serply_webpage_to_markdown_tool import SerplyWebpageToMarkdownTool
-from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
+from .spider_tool.spider_tool import SpiderTool
 from .txt_search_tool.txt_search_tool import TXTSearchTool
+from .vision_tool.vision_tool import VisionTool
 from .website_search.website_search_tool import WebsiteSearchTool
 from .xml_search_tool.xml_search_tool import XMLSearchTool
-from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
+from .youtube_channel_search_tool.youtube_channel_search_tool import (
+    YoutubeChannelSearchTool,
+)
 from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
-from .spider_tool.spider_tool import SpiderTool
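Taken together, the two hunks above only widen the package's public import surface: they re-export the new tools and reorder or reflow existing imports. A minimal sketch (not part of the commit) of the top-level imports this enables, assuming a crewai_tools build that includes this change:

```python
# Sketch only: checks that the names re-exported by the updated __init__ modules resolve.
from crewai_tools import NL2SQLTool, VisionTool

vision_tool = VisionTool()
print(vision_tool.name)  # "Vision Tool", as defined in vision_tool.py below
```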
src/crewai_tools/tools/vision_tool/README.md (new file, 30 lines)

@@ -0,0 +1,30 @@
# Vision Tool

## Description

This tool is used to extract text from images. When passed to the agent it will extract the text from the image and then use it to generate a response, report or any other output. The URL or the PATH of the image should be passed to the Agent.

## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```

## Usage

In order to use the VisionTool, the OpenAI API key should be set in the environment variable `OPENAI_API_KEY`.

```python
from crewai_tools import VisionTool

vision_tool = VisionTool()

@agent
def researcher(self) -> Agent:
    return Agent(
        config=self.agents_config["researcher"],
        allow_delegation=False,
        tools=[vision_tool]
    )
```
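The committed README stops at wiring the tool into an agent. As a complement, here is a minimal end-to-end sketch (an assumption, not part of the commit) of what "The URL or the PATH of the image should be passed to the Agent" looks like in practice: the image reference is surfaced through the task description. The `Agent`/`Task`/`Crew` field values and the example URL are illustrative only.

```python
# Sketch only (not from the committed README): the image URL is handed to the
# agent via the task description, and the agent invokes VisionTool with it.
from crewai import Agent, Crew, Task
from crewai_tools import VisionTool

vision_tool = VisionTool()

analyst = Agent(
    role="Image analyst",
    goal="Describe and summarize images referenced in tasks",
    backstory="You turn screenshots and photos into concise written findings.",
    tools=[vision_tool],
    allow_delegation=False,
)

report = Task(
    description=(
        "Extract the text from the image at https://example.com/receipt.jpg "  # placeholder URL
        "and summarize what it shows."
    ),
    expected_output="A short written summary of the image contents.",
    agent=analyst,
)

Crew(agents=[analyst], tasks=[report]).kickoff()
```

As with the README example, `OPENAI_API_KEY` must be set in the environment before running this.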
src/crewai_tools/tools/vision_tool/vision_tool.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import base64
from typing import Type

import requests
from crewai_tools.tools.base_tool import BaseTool
from openai import OpenAI
from pydantic.v1 import BaseModel


class ImagePromptSchema(BaseModel):
    """Input for Vision Tool."""

    image_path_url: str = "The image path or URL."


class VisionTool(BaseTool):
    name: str = "Vision Tool"
    description: str = (
        "This tool uses OpenAI's Vision API to describe the contents of an image."
    )
    args_schema: Type[BaseModel] = ImagePromptSchema

    def _run_web_hosted_images(self, client, image_path_url: str) -> str:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {
                            "type": "image_url",
                            "image_url": {"url": image_path_url},
                        },
                    ],
                }
            ],
            max_tokens=300,
        )

        return response.choices[0].message.content

    def _run_local_images(self, client, image_path_url: str) -> str:
        base64_image = self._encode_image(image_path_url)

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {client.api_key}",
        }

        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": 300,
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
        )

        return response.json()["choices"][0]["message"]["content"]

    def _run(self, **kwargs) -> str:
        client = OpenAI()

        image_path_url = kwargs.get("image_path_url")

        if not image_path_url:
            return "Image Path or URL is required."

        if "http" in image_path_url:
            image_description = self._run_web_hosted_images(client, image_path_url)
        else:
            image_description = self._run_local_images(client, image_path_url)

        return image_description

    def _encode_image(self, image_path: str):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
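For quick manual testing outside of a crew, `_run` can be exercised directly, since it only needs the `image_path_url` keyword shown above. A minimal sketch, assuming `OPENAI_API_KEY` is set; both image references below are placeholders:

```python
# Sketch only: drives VisionTool._run directly to show the two dispatch branches.
from crewai_tools import VisionTool

tool = VisionTool()

# Contains "http", so the URL is passed straight through via the OpenAI client.
print(tool._run(image_path_url="https://example.com/diagram.png"))

# No "http": the file is read, base64-encoded, and posted to
# api.openai.com/v1/chat/completions as a data URL.
print(tool._run(image_path_url="./diagram.png"))
```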