From 4835c2bf68bc98bd28e29bee22a4d7d5b6b0cd49 Mon Sep 17 00:00:00 2001 From: Eduardo Chiarotti Date: Wed, 31 Jul 2024 17:10:26 -0300 Subject: [PATCH] feat: Add Vision tool to the CrewAI tool --- src/crewai_tools/__init__.py | 10 +- src/crewai_tools/tools/__init__.py | 24 +++-- src/crewai_tools/tools/vision_tool/README.md | 30 ++++++ .../tools/vision_tool/vision_tool.py | 93 +++++++++++++++++++ 4 files changed, 146 insertions(+), 11 deletions(-) create mode 100644 src/crewai_tools/tools/vision_tool/README.md create mode 100644 src/crewai_tools/tools/vision_tool/vision_tool.py diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py index a049cdc5b..b049d630d 100644 --- a/src/crewai_tools/__init__.py +++ b/src/crewai_tools/__init__.py @@ -17,23 +17,25 @@ from .tools import ( LlamaIndexTool, MDXSearchTool, MultiOnTool, + NL2SQLTool, PDFSearchTool, PGSearchTool, RagTool, ScrapeElementFromWebsiteTool, - ScrapflyScrapeWebsiteTool, ScrapeWebsiteTool, + ScrapflyScrapeWebsiteTool, SeleniumScrapingTool, SerperDevTool, - SerplyWebSearchTool, + SerplyJobSearchTool, SerplyNewsSearchTool, SerplyScholarSearchTool, SerplyWebpageToMarkdownTool, - SerplyJobSearchTool, + SerplyWebSearchTool, TXTSearchTool, + VisionTool, WebsiteSearchTool, XMLSearchTool, YoutubeChannelSearchTool, - YoutubeVideoSearchTool + YoutubeVideoSearchTool, ) from .tools.base_tool import BaseTool, Tool, tool diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py index a72fda277..483ebda21 100644 --- a/src/crewai_tools/tools/__init__.py +++ b/src/crewai_tools/tools/__init__.py @@ -8,14 +8,19 @@ from .directory_search_tool.directory_search_tool import DirectorySearchTool from .docx_search_tool.docx_search_tool import DOCXSearchTool from .exa_tools.exa_search_tool import EXASearchTool from .file_read_tool.file_read_tool import FileReadTool -from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import FirecrawlCrawlWebsiteTool -from 
.firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import FirecrawlScrapeWebsiteTool +from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import ( + FirecrawlCrawlWebsiteTool, +) +from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import ( + FirecrawlScrapeWebsiteTool, +) from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool from .github_search_tool.github_search_tool import GithubSearchTool from .json_search_tool.json_search_tool import JSONSearchTool from .llamaindex_tool.llamaindex_tool import LlamaIndexTool from .mdx_seach_tool.mdx_search_tool import MDXSearchTool from .multion_tool.multion_tool import MultiOnTool +from .nl2sql.nl2sql_tool import NL2SQLTool from .pdf_search_tool.pdf_search_tool import PDFSearchTool from .pg_seach_tool.pg_search_tool import PGSearchTool from .rag.rag_tool import RagTool @@ -23,17 +28,22 @@ from .scrape_element_from_website.scrape_element_from_website import ( ScrapeElementFromWebsiteTool, ) from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool -from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool +from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ( + ScrapflyScrapeWebsiteTool, +) from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool from .serper_dev_tool.serper_dev_tool import SerperDevTool -from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool +from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool +from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool from .serply_api_tool.serply_webpage_to_markdown_tool import SerplyWebpageToMarkdownTool -from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool +from .spider_tool.spider_tool import SpiderTool 
from .txt_search_tool.txt_search_tool import TXTSearchTool +from .vision_tool.vision_tool import VisionTool from .website_search.website_search_tool import WebsiteSearchTool from .xml_search_tool.xml_search_tool import XMLSearchTool -from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool +from .youtube_channel_search_tool.youtube_channel_search_tool import ( + YoutubeChannelSearchTool, +) from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool -from .spider_tool.spider_tool import SpiderTool diff --git a/src/crewai_tools/tools/vision_tool/README.md b/src/crewai_tools/tools/vision_tool/README.md new file mode 100644 index 000000000..bf7ab7486 --- /dev/null +++ b/src/crewai_tools/tools/vision_tool/README.md @@ -0,0 +1,30 @@ +# Vision Tool + +## Description + +This tool is used to describe the contents of images. When passed to an agent, it sends the image to OpenAI's vision model, and the agent can then use the returned description to generate a response, report, or any other output. The URL or the local path of the image should be passed to the agent. + + +## Installation +Install the crewai_tools package +```shell +pip install 'crewai[tools]' +``` + +## Usage + +In order to use the VisionTool, the OpenAI API key should be set in the environment variable `OPENAI_API_KEY`. 
# src/crewai_tools/tools/vision_tool/vision_tool.py
"""Vision tool: describe the contents of an image with an OpenAI chat model.

The image may be given as an ``http(s)`` URL or as a path to a local file.
The accompanying README states that the OpenAI API key should be set in the
``OPENAI_API_KEY`` environment variable.

Usage example (from the accompanying README)::

    from crewai_tools import VisionTool

    vision_tool = VisionTool()

    @agent
    def researcher(self) -> Agent:
        return Agent(
            config=self.agents_config["researcher"],
            allow_delegation=False,
            tools=[vision_tool]
        )
"""

import base64
import mimetypes
from typing import Type

import requests  # noqa: F401 -- no longer used below; kept so the module's import surface is unchanged
from crewai_tools.tools.base_tool import BaseTool
from openai import OpenAI
from pydantic.v1 import BaseModel, Field

# Request parameters shared by the URL and local-file code paths.
_VISION_MODEL = "gpt-4o-mini"
_VISION_PROMPT = "What's in this image?"
_MAX_RESPONSE_TOKENS = 300

# Used when the MIME type of a local file cannot be guessed from its name.
_DEFAULT_IMAGE_MIME_TYPE = "image/jpeg"

# Only inputs that *start* with one of these schemes are treated as URLs.
_REMOTE_URL_PREFIXES = ("http://", "https://")


def _is_remote_url(image_path_url: str) -> bool:
    """Return True if *image_path_url* is an http(s) URL rather than a path."""
    # A prefix test, not a substring test: a local path that merely contains
    # "http" (e.g. "/data/http_dump/cat.png") must not be sent as a URL.
    return image_path_url.lower().startswith(_REMOTE_URL_PREFIXES)


def _guess_image_mime_type(image_path: str) -> str:
    """Guess the image MIME type from the file name, defaulting to JPEG."""
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type and mime_type.startswith("image/"):
        return mime_type
    return _DEFAULT_IMAGE_MIME_TYPE


class ImagePromptSchema(BaseModel):
    """Input for Vision Tool."""

    # Required field. The human-readable text is the field *description*;
    # it must not be the default value, or an omitted argument would be
    # treated as a file literally named "The image path or URL.".
    image_path_url: str = Field(..., description="The image path or URL.")


class VisionTool(BaseTool):
    """Tool that asks an OpenAI vision-capable model to describe an image."""

    name: str = "Vision Tool"
    description: str = (
        "This tool uses OpenAI's Vision API to describe the contents of an image."
    )
    args_schema: Type[BaseModel] = ImagePromptSchema

    def _describe_image(self, client: OpenAI, image_url: str) -> str:
        """Send one image to the chat completions API and return the reply.

        Args:
            client: OpenAI client used to issue the request.
            image_url: An http(s) URL or a ``data:<mime>;base64,...`` URL.

        Returns:
            The text of the first choice, or an empty string if the model
            returned no text content.
        """
        response = client.chat.completions.create(
            model=_VISION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _VISION_PROMPT},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=_MAX_RESPONSE_TOKENS,
        )
        return response.choices[0].message.content or ""

    def _run_web_hosted_images(self, client: OpenAI, image_path_url: str) -> str:
        """Describe an image that is reachable at an http(s) URL."""
        return self._describe_image(client, image_path_url)

    def _run_local_images(self, client: OpenAI, image_path_url: str) -> str:
        """Describe an image stored on the local filesystem.

        The file is read, base64-encoded and sent inline as a data URL
        through the same client as web-hosted images, so both paths share
        the client's configuration and error reporting.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        mime_type = _guess_image_mime_type(image_path_url)
        base64_image = self._encode_image(image_path_url)
        return self._describe_image(
            client, f"data:{mime_type};base64,{base64_image}"
        )

    def _run(self, **kwargs) -> str:
        """Describe the image named by the ``image_path_url`` keyword argument.

        Returns:
            The model's description, or a short message if no image was given.
        """
        image_path_url = kwargs.get("image_path_url")

        if not image_path_url:
            return "Image Path or URL is required."

        # Create the client only once there is something to send, so a
        # missing argument is reported as such rather than as a client error.
        client = OpenAI()

        if _is_remote_url(image_path_url):
            return self._run_web_hosted_images(client, image_path_url)
        return self._run_local_images(client, image_path_url)

    def _encode_image(self, image_path: str) -> str:
        """Return the contents of *image_path* as a base64-encoded string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")