From 67be0c674d57c838b19deada46a797d4a2455ee6 Mon Sep 17 00:00:00 2001
From: Daniel Barreto
Date: Mon, 28 Apr 2025 19:53:00 -0300
Subject: [PATCH] Allow setting custom LLM for the vision tool (#294)

* Allow setting custom LLM for the vision tool

Defaults to gpt-4o-mini otherwise

* Enhance VisionTool with model management and improved initialization

- Added support for setting a custom model identifier, with a default of "gpt-4o-mini".
- Introduced properties for model management, allowing dynamic updates and resetting of the LLM instance.
- Updated the initialization method to accept optional llm and model parameters.
- Refactored the image processing logic for clarity and efficiency.

* docstrings

* Add stop config

---------

Co-authored-by: lorenzejay
---
 .../tools/vision_tool/vision_tool.py | 68 ++++++++++++++-----
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/src/crewai_tools/tools/vision_tool/vision_tool.py b/src/crewai_tools/tools/vision_tool/vision_tool.py
index a8daaabb9..cd4f5e74c 100644
--- a/src/crewai_tools/tools/vision_tool/vision_tool.py
+++ b/src/crewai_tools/tools/vision_tool/vision_tool.py
@@ -2,9 +2,9 @@
 import base64
 from pathlib import Path
 from typing import Optional, Type
 
+from crewai import LLM
 from crewai.tools import BaseTool
-from openai import OpenAI
-from pydantic import BaseModel, field_validator
+from pydantic import BaseModel, PrivateAttr, field_validator
 
 
 class ImagePromptSchema(BaseModel):
@@ -32,19 +32,52 @@ class ImagePromptSchema(BaseModel):
 
 
 class VisionTool(BaseTool):
+    """Tool for analyzing images using vision models.
+
+    Args:
+        llm: Optional LLM instance to use
+        model: Model identifier to use if no LLM is provided
+    """
+
     name: str = "Vision Tool"
     description: str = (
         "This tool uses OpenAI's Vision API to describe the contents of an image."
     )
     args_schema: Type[BaseModel] = ImagePromptSchema
-    _client: Optional[OpenAI] = None
+
+    _model: str = PrivateAttr(default="gpt-4o-mini")
+    _llm: Optional[LLM] = PrivateAttr(default=None)
+
+    def __init__(self, llm: Optional[LLM] = None, model: str = "gpt-4o-mini", **kwargs):
+        """Initialize the vision tool.
+
+        Args:
+            llm: Optional LLM instance to use
+            model: Model identifier to use if no LLM is provided
+            **kwargs: Additional arguments for the base tool
+        """
+        super().__init__(**kwargs)
+        self._model = model
+        self._llm = llm
 
     @property
-    def client(self) -> OpenAI:
-        """Cached OpenAI client instance."""
-        if self._client is None:
-            self._client = OpenAI()
-        return self._client
+    def model(self) -> str:
+        """Get the current model identifier."""
+        return self._model
+
+    @model.setter
+    def model(self, value: str) -> None:
+        """Set the model identifier, resetting the cached LLM if its model differs."""
+        self._model = value
+        if self._llm is not None and self._llm.model != value:
+            self._llm = None
+
+    @property
+    def llm(self) -> LLM:
+        """Get the LLM instance, creating one if needed."""
+        if self._llm is None:
+            self._llm = LLM(model=self._model, stop=["STOP", "END"])
+        return self._llm
 
     def _run(self, **kwargs) -> str:
         try:
@@ -52,7 +85,6 @@ class VisionTool(BaseTool):
             if not image_path_url:
                 return "Image Path or URL is required."
 
-            # Validate input using Pydantic
             ImagePromptSchema(image_path_url=image_path_url)
 
             if image_path_url.startswith("http"):
@@ -64,8 +96,7 @@ class VisionTool(BaseTool):
             except Exception as e:
                 return f"Error processing image: {str(e)}"
 
-            response = self.client.chat.completions.create(
-                model="gpt-4o-mini",
+            response = self.llm.call(
                 messages=[
                     {
                         "role": "user",
                         "content": [
                             {"type": "text", "text": "What's in this image?"},
                             {
                                 "type": "image_url",
                                 "image_url": {"url": image_data},
                             },
                         ],
-                    }
+                    },
                 ],
-                max_tokens=300,
             )
-
-            return response.choices[0].message.content
-
+            return response
         except Exception as e:
             return f"An error occurred: {str(e)}"
 
     def _encode_image(self, image_path: str) -> str:
+        """Encode an image file as base64.
+
+        Args:
+            image_path: Path to the image file
+
+        Returns:
+            Base64-encoded image data
+        """
         with open(image_path, "rb") as image_file:
             return base64.b64encode(image_file.read()).decode("utf-8")
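
For reference, a minimal usage sketch of the tool after this patch. This is illustrative and not part of the patch: it assumes a crewAI environment with credentials configured for the chosen provider, and the "gpt-4o" identifier and the image URL are placeholder values.

    from crewai import LLM
    from crewai_tools import VisionTool

    # Default: lazily builds an LLM for "gpt-4o-mini" on first use.
    vision = VisionTool()

    # Custom model identifier: the LLM is created on demand with this model.
    vision_by_model = VisionTool(model="gpt-4o")  # placeholder model name

    # Fully custom LLM instance, e.g. with provider-specific settings.
    custom_llm = LLM(model="gpt-4o", temperature=0.2)
    vision_by_llm = VisionTool(llm=custom_llm)

    # Reassigning `model` drops a cached LLM whose model differs,
    # so the next call rebuilds it with the new identifier.
    vision.model = "gpt-4o"

    # Normally invoked by an agent, but it can also be called directly:
    print(vision_by_llm.run(image_path_url="https://example.com/photo.jpg"))

Because the llm property is lazy, no client is constructed until the first call, so the model or LLM can be swapped at any point before the tool is first used.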