Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
2026-01-22 22:58:13 +00:00 · 2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions
--- a/crewai_tools/tools/ocr_tool/ocr_tool.py
+++ b/crewai_tools/tools/ocr_tool/ocr_tool.py
@@ -0,0 +1,126 @@
+"""
+Optical Character Recognition (OCR) Tool
+
+This tool provides functionality for extracting text from images using supported LLMs. Make sure your model supports the `vision` feature.
+"""
+
+import base64
+import mimetypes
+from typing import Optional, Type
+
+from openai import OpenAI
+from pydantic import BaseModel, Field, PrivateAttr
+
+from crewai.tools.base_tool import BaseTool
+from crewai import LLM
+
+
+class OCRToolSchema(BaseModel):
+    """Input schema for Optical Character Recognition Tool.
+    
+    Attributes:
+        image_path_url (str): Path to a local image file or URL of an image.
+            For local files, provide the absolute or relative path.
+            For remote images, provide the complete URL starting with 'http' or 'https'.
+    """
+
+    image_path_url: str = "The image path or URL."
+
+
+class OCRTool(BaseTool):
+    """A tool for performing Optical Character Recognition on images.
+
+    This tool leverages LLMs to extract text from images. It can process
+    both local image files and images available via URLs.
+
+    Attributes:
+        name (str): Name of the tool.
+        description (str): Description of the tool's functionality.
+        args_schema (Type[BaseModel]): Pydantic schema for input validation.
+
+    Private Attributes:
+        _llm (Optional[LLM]): Language model instance for making API calls.
+    """
+
+    name: str = "Optical Character Recognition Tool"
+    description: str = (
+        "This tool uses an LLM's API to extract text from an image file."
+    )
+    _llm: Optional[LLM] = PrivateAttr(default=None)
+
+    args_schema: Type[BaseModel] = OCRToolSchema
+
+    def __init__(self, llm: LLM = None, **kwargs):
+        """Initialize the OCR tool.
+
+        Args:
+            llm (LLM, optional): Language model instance to use for API calls.
+                If not provided, a default LLM with gpt-4o model will be used.
+            **kwargs: Additional arguments passed to the parent class.
+        """
+        super().__init__(**kwargs)
+
+        if llm is None:
+            # Use the default LLM
+            llm = LLM(
+                model="gpt-4o",
+                temperature=0.7,
+            )
+
+        self._llm = llm
+
+    def _run(self, **kwargs) -> str:
+        """Execute the OCR operation on the provided image.
+
+        Args:
+            **kwargs: Keyword arguments containing the image_path_url.
+
+        Returns:
+            str: Extracted text from the image.
+                If no image path/URL is provided, returns an error message.
+
+        Note:
+            The method handles both local image files and remote URLs:
+            - For local files: The image is read and encoded to base64
+            - For URLs: The URL is passed directly to the Vision API
+        """
+        image_path_url = kwargs.get("image_path_url")
+
+        if not image_path_url:
+            return "Image Path or URL is required."
+
+        if image_path_url.startswith("http"):
+            image_data = image_path_url
+        else:
+            base64_image = self._encode_image(image_path_url)
+            image_data = f"data:image/jpeg;base64,{base64_image}"
+        
+        messages=[
+            {
+                "role": "system",
+                "content": "You are an expert OCR specialist. Extract complete text from the provided image. Provide the result as a raw text."
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_data},
+                    }
+                ],
+            }
+        ]
+
+        response = self._llm.call(messages=messages)
+        return response
+
+    def _encode_image(self, image_path: str):
+        """Encode an image file to base64 format.
+
+        Args:
+            image_path (str): Path to the local image file.
+
+        Returns:
+            str: Base64-encoded image data as a UTF-8 string.
+        """
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")