feat: Add Vision tool to the CrewAI tool

This commit is contained in:
Eduardo Chiarotti
2024-07-31 17:10:26 -03:00
parent d28dba453e
commit 4835c2bf68
4 changed files with 146 additions and 11 deletions

View File

@@ -17,23 +17,25 @@ from .tools import (
LlamaIndexTool,
MDXSearchTool,
MultiOnTool,
NL2SQLTool,
PDFSearchTool,
PGSearchTool,
RagTool,
ScrapeElementFromWebsiteTool,
ScrapflyScrapeWebsiteTool,
ScrapeWebsiteTool,
ScrapflyScrapeWebsiteTool,
SeleniumScrapingTool,
SerperDevTool,
SerplyWebSearchTool,
SerplyJobSearchTool,
SerplyNewsSearchTool,
SerplyScholarSearchTool,
SerplyWebpageToMarkdownTool,
SerplyJobSearchTool,
SerplyWebSearchTool,
TXTSearchTool,
VisionTool,
WebsiteSearchTool,
XMLSearchTool,
YoutubeChannelSearchTool,
YoutubeVideoSearchTool
YoutubeVideoSearchTool,
)
from .tools.base_tool import BaseTool, Tool, tool

View File

@@ -8,14 +8,19 @@ from .directory_search_tool.directory_search_tool import DirectorySearchTool
from .docx_search_tool.docx_search_tool import DOCXSearchTool
from .exa_tools.exa_search_tool import EXASearchTool
from .file_read_tool.file_read_tool import FileReadTool
from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import FirecrawlCrawlWebsiteTool
from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import FirecrawlScrapeWebsiteTool
from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
FirecrawlCrawlWebsiteTool,
)
from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
FirecrawlScrapeWebsiteTool,
)
from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
from .github_search_tool.github_search_tool import GithubSearchTool
from .json_search_tool.json_search_tool import JSONSearchTool
from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
from .multion_tool.multion_tool import MultiOnTool
from .nl2sql.nl2sql_tool import NL2SQLTool
from .pdf_search_tool.pdf_search_tool import PDFSearchTool
from .pg_seach_tool.pg_search_tool import PGSearchTool
from .rag.rag_tool import RagTool
@@ -23,17 +28,22 @@ from .scrape_element_from_website.scrape_element_from_website import (
ScrapeElementFromWebsiteTool,
)
from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
ScrapflyScrapeWebsiteTool,
)
from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
from .serper_dev_tool.serper_dev_tool import SerperDevTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool
from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
from .serply_api_tool.serply_webpage_to_markdown_tool import SerplyWebpageToMarkdownTool
from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
from .spider_tool.spider_tool import SpiderTool
from .txt_search_tool.txt_search_tool import TXTSearchTool
from .vision_tool.vision_tool import VisionTool
from .website_search.website_search_tool import WebsiteSearchTool
from .xml_search_tool.xml_search_tool import XMLSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
from .youtube_channel_search_tool.youtube_channel_search_tool import (
YoutubeChannelSearchTool,
)
from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
from .spider_tool.spider_tool import SpiderTool

View File

@@ -0,0 +1,30 @@
# Vision Tool
## Description
This tool uses OpenAI's Vision API to describe the contents of an image, including any text it contains. When passed to the agent, it will extract that information from the image and then use it to generate a response, report, or any other output. The URL or the local path of the image should be passed to the agent.
## Installation
Install the crewai_tools package
```shell
pip install 'crewai[tools]'
```
## Usage
In order to use the VisionTool, the OpenAI API key should be set in the environment variable `OPENAI_API_KEY`.
```python
from crewai_tools import VisionTool
vision_tool = VisionTool()
@agent
def researcher(self) -> Agent:
return Agent(
config=self.agents_config["researcher"],
allow_delegation=False,
tools=[vision_tool]
)
```

View File

@@ -0,0 +1,93 @@
import base64
import mimetypes
from typing import Type

import requests
from crewai_tools.tools.base_tool import BaseTool
from openai import OpenAI
from pydantic.v1 import BaseModel, Field
class ImagePromptSchema(BaseModel):
    """Input for Vision Tool."""

    # Required field. The help text lives in `description` rather than in the
    # default value, so an omitted argument fails validation instead of being
    # replaced by a sentence that would later be treated as a file path.
    image_path_url: str = Field(..., description="The image path or URL.")
class VisionTool(BaseTool):
    """Tool that asks an OpenAI chat model what an image contains.

    The image may be given either as an ``http(s)`` URL, which is forwarded
    to the API as-is, or as a local file path, which is read, base64-encoded
    and sent inline as a ``data:`` URL. Both paths go through the same
    OpenAI client call.
    """

    name: str = "Vision Tool"
    description: str = (
        "This tool uses OpenAI's Vision API to describe the contents of an image."
    )
    args_schema: Type[BaseModel] = ImagePromptSchema

    def _describe_image(self, client, image_url: str) -> str:
        """Send a single image reference to the chat completions API.

        Args:
            client: An ``OpenAI`` client instance.
            image_url: Either a remote ``http(s)`` URL or a ``data:`` URL
                carrying the base64-encoded image.

        Returns:
            The text content of the first choice in the model's reply.
        """
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=300,
        )
        return response.choices[0].message.content

    def _run_web_hosted_images(self, client, image_path_url: str) -> str:
        """Describe an image that is reachable at a remote URL."""
        return self._describe_image(client, image_path_url)

    def _run_local_images(self, client, image_path_url: str) -> str:
        """Describe an image stored on the local filesystem.

        The file is embedded in the request as a base64 ``data:`` URL. The
        MIME type is guessed from the file name; when it cannot be determined
        or is not an image type, ``image/jpeg`` is used as the fallback.
        """
        base64_image = self._encode_image(image_path_url)
        mime_type, _ = mimetypes.guess_type(image_path_url)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        # Go through the SDK client (not a hand-built HTTP request) so local
        # and remote images share the client's configuration and error
        # handling.
        return self._describe_image(
            client, f"data:{mime_type};base64,{base64_image}"
        )

    def _run(self, **kwargs) -> str:
        """Describe the image referenced by the ``image_path_url`` keyword.

        Returns:
            The model's description of the image, or the message
            ``"Image Path or URL is required."`` when no value was supplied.
        """
        image_path_url = kwargs.get("image_path_url")
        if not image_path_url:
            return "Image Path or URL is required."

        # Create the client only after validation so a missing argument is
        # reported even when no API key is configured.
        client = OpenAI()

        # Match on the URL scheme prefix; a plain substring test would also
        # match local paths that merely contain "http" somewhere.
        if image_path_url.lower().startswith(("http://", "https://")):
            return self._run_web_hosted_images(client, image_path_url)
        return self._run_local_images(client, image_path_url)

    def _encode_image(self, image_path: str) -> str:
        """Return the contents of ``image_path`` as a base64 ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")