From 4835c2bf68bc98bd28e29bee22a4d7d5b6b0cd49 Mon Sep 17 00:00:00 2001
From: Eduardo Chiarotti <dudumelgaco@hotmail.com>
Date: Wed, 31 Jul 2024 17:10:26 -0300
Subject: [PATCH] feat: Add Vision tool to the CrewAI tool

---
 src/crewai_tools/__init__.py                  | 10 +-
 src/crewai_tools/tools/__init__.py            | 24 +++--
 src/crewai_tools/tools/vision_tool/README.md  | 30 ++++++
 .../tools/vision_tool/vision_tool.py          | 93 +++++++++++++++++++
 4 files changed, 146 insertions(+), 11 deletions(-)
 create mode 100644 src/crewai_tools/tools/vision_tool/README.md
 create mode 100644 src/crewai_tools/tools/vision_tool/vision_tool.py

diff --git a/src/crewai_tools/__init__.py b/src/crewai_tools/__init__.py
index a049cdc5b..b049d630d 100644
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -17,23 +17,25 @@ from .tools import (
     LlamaIndexTool,
     MDXSearchTool,
     MultiOnTool,
+    NL2SQLTool,
     PDFSearchTool,
     PGSearchTool,
     RagTool,
     ScrapeElementFromWebsiteTool,
-    ScrapflyScrapeWebsiteTool,
     ScrapeWebsiteTool,
+    ScrapflyScrapeWebsiteTool,
     SeleniumScrapingTool,
     SerperDevTool,
-    SerplyWebSearchTool,
+    SerplyJobSearchTool,
     SerplyNewsSearchTool,
     SerplyScholarSearchTool,
     SerplyWebpageToMarkdownTool,
-    SerplyJobSearchTool,
+    SerplyWebSearchTool,
     TXTSearchTool,
+    VisionTool,
     WebsiteSearchTool,
     XMLSearchTool,
     YoutubeChannelSearchTool,
-    YoutubeVideoSearchTool
+    YoutubeVideoSearchTool,
 )
 from .tools.base_tool import BaseTool, Tool, tool
diff --git a/src/crewai_tools/tools/__init__.py b/src/crewai_tools/tools/__init__.py
index a72fda277..483ebda21 100644
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -8,14 +8,19 @@ from .directory_search_tool.directory_search_tool import DirectorySearchTool
 from .docx_search_tool.docx_search_tool import DOCXSearchTool
 from .exa_tools.exa_search_tool import EXASearchTool
 from .file_read_tool.file_read_tool import FileReadTool
-from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import FirecrawlCrawlWebsiteTool
-from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import FirecrawlScrapeWebsiteTool
+from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
+    FirecrawlCrawlWebsiteTool,
+)
+from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
+    FirecrawlScrapeWebsiteTool,
+)
 from .firecrawl_search_tool.firecrawl_search_tool import FirecrawlSearchTool
 from .github_search_tool.github_search_tool import GithubSearchTool
 from .json_search_tool.json_search_tool import JSONSearchTool
 from .llamaindex_tool.llamaindex_tool import LlamaIndexTool
 from .mdx_seach_tool.mdx_search_tool import MDXSearchTool
 from .multion_tool.multion_tool import MultiOnTool
+from .nl2sql.nl2sql_tool import NL2SQLTool
 from .pdf_search_tool.pdf_search_tool import PDFSearchTool
 from .pg_seach_tool.pg_search_tool import PGSearchTool
 from .rag.rag_tool import RagTool
@@ -23,17 +28,22 @@ from .scrape_element_from_website.scrape_element_from_website import (
     ScrapeElementFromWebsiteTool,
 )
 from .scrape_website_tool.scrape_website_tool import ScrapeWebsiteTool
-from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import ScrapflyScrapeWebsiteTool
+from .scrapfly_scrape_website_tool.scrapfly_scrape_website_tool import (
+    ScrapflyScrapeWebsiteTool,
+)
 from .selenium_scraping_tool.selenium_scraping_tool import SeleniumScrapingTool
 from .serper_dev_tool.serper_dev_tool import SerperDevTool
-from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
+from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
 from .serply_api_tool.serply_news_search_tool import SerplyNewsSearchTool
 from .serply_api_tool.serply_scholar_search_tool import SerplyScholarSearchTool
+from .serply_api_tool.serply_web_search_tool import SerplyWebSearchTool
 from .serply_api_tool.serply_webpage_to_markdown_tool import SerplyWebpageToMarkdownTool
-from .serply_api_tool.serply_job_search_tool import SerplyJobSearchTool
+from .spider_tool.spider_tool import SpiderTool
 from .txt_search_tool.txt_search_tool import TXTSearchTool
+from .vision_tool.vision_tool import VisionTool
 from .website_search.website_search_tool import WebsiteSearchTool
 from .xml_search_tool.xml_search_tool import XMLSearchTool
-from .youtube_channel_search_tool.youtube_channel_search_tool import YoutubeChannelSearchTool
+from .youtube_channel_search_tool.youtube_channel_search_tool import (
+    YoutubeChannelSearchTool,
+)
 from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
-from .spider_tool.spider_tool import SpiderTool
diff --git a/src/crewai_tools/tools/vision_tool/README.md b/src/crewai_tools/tools/vision_tool/README.md
new file mode 100644
index 000000000..bf7ab7486
--- /dev/null
+++ b/src/crewai_tools/tools/vision_tool/README.md
@@ -0,0 +1,30 @@
+# Vision Tool
+
+## Description
+
+This tool uses OpenAI's vision model to describe the contents of an image. When passed to an agent, the agent can use that description to generate a response, a report, or any other output. The URL or the local file path of the image should be passed to the agent.
+
+
+## Installation
+Install the crewai_tools package:
+```shell
+pip install 'crewai[tools]'
+```
+
+## Usage
+
+In order to use the VisionTool, the OpenAI API key should be set in the environment variable `OPENAI_API_KEY`.
+
+```python
+from crewai_tools import VisionTool
+
+vision_tool = VisionTool()
+
+@agent
+def researcher(self) -> Agent:
+    return Agent(
+        config=self.agents_config["researcher"],
+        allow_delegation=False,
+        tools=[vision_tool]
+    )
+```
diff --git a/src/crewai_tools/tools/vision_tool/vision_tool.py b/src/crewai_tools/tools/vision_tool/vision_tool.py
new file mode 100644
index 000000000..a9abd5c43
--- /dev/null
+++ b/src/crewai_tools/tools/vision_tool/vision_tool.py
@@ -0,0 +1,93 @@
+import base64
+from typing import Type
+
+import requests
+from crewai_tools.tools.base_tool import BaseTool
+from openai import OpenAI
+from pydantic.v1 import BaseModel, Field
+
+
+class ImagePromptSchema(BaseModel):
+    """Input for Vision Tool: the one image reference the caller must supply."""
+    # Field(...) marks the value as required; the text is metadata, not a default.
+    image_path_url: str = Field(..., description="The image path or URL.")
+
+
+class VisionTool(BaseTool):
+    """Describe an image, given as a web URL or a local file path, via OpenAI."""
+
+    name: str = "Vision Tool"
+    description: str = (
+        "This tool uses OpenAI's Vision API to describe the contents of an image."
+    )
+    args_schema: Type[BaseModel] = ImagePromptSchema
+
+    def _describe(self, client, image_url: str) -> str:
+        """Ask the model what ``image_url`` (an http(s) or data URL) shows."""
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                    ],
+                }
+            ],
+            max_tokens=300,
+        )
+        return response.choices[0].message.content
+
+    def _run_web_hosted_images(self, client, image_path_url: str) -> str:
+        """Describe an image that is reachable at an http(s) URL."""
+        return self._describe(client, image_path_url)
+
+    def _run_local_images(self, client, image_path_url: str) -> str:
+        """Describe a local image file by sending it inline as a base64 data URL.
+
+        Raises:
+            OSError: If the file cannot be opened or read.
+        """
+        import mimetypes  # stdlib; only needed on the local-file path
+
+        # Label the payload with the file's real image type; keep the previous
+        # hard-coded JPEG label when the extension is unknown or not an image.
+        guessed = mimetypes.guess_type(image_path_url)[0] or ""
+        mime_type = guessed if guessed.startswith("image/") else "image/jpeg"
+        base64_image = self._encode_image(image_path_url)
+        # Go through the SDK client (it accepts data URLs) instead of a hand-built
+        # HTTP POST, so API errors raise rather than fail on a missing "choices".
+        return self._describe(client, f"data:{mime_type};base64,{base64_image}")
+
+    def _run(self, **kwargs) -> str:
+        """Return the model's description of the image named by ``image_path_url``.
+
+        Args:
+            **kwargs: Must contain ``image_path_url``, an http(s) URL or local path.
+
+        Returns:
+            The description text, or a fixed message when no image was given.
+        """
+        image_path_url = kwargs.get("image_path_url")
+
+        if not image_path_url:
+            return "Image Path or URL is required."
+
+        # Create the client only after validating the input.
+        client = OpenAI()
+
+        # Test the scheme prefix: a bare "http" substring check would send local
+        # paths such as "downloads/http_logo.png" down the URL branch.
+        if image_path_url.lower().startswith(("http://", "https://")):
+            return self._run_web_hosted_images(client, image_path_url)
+        return self._run_local_images(client, image_path_url)
+
+    def _encode_image(self, image_path: str) -> str:
+        """Return the contents of ``image_path`` as base64 text.
+
+        Raises:
+            OSError: If ``image_path`` cannot be opened or read.
+        """
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")