mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-24 15:48:23 +00:00
Squashed 'packages/tools/' content from commit 78317b9c
git-subtree-dir: packages/tools git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
68
crewai_tools/tools/contextualai_parse_tool/README.md
Normal file
68
crewai_tools/tools/contextualai_parse_tool/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# ContextualAIParseTool
|
||||
|
||||
## Description
|
||||
This tool is designed to integrate Contextual AI's enterprise-grade document parsing capabilities with CrewAI, enabling you to leverage advanced AI-powered document understanding for complex layouts, tables, and figures. Use this tool to extract structured content from your documents using Contextual AI's powerful document parser.
|
||||
|
||||
## Installation
|
||||
To incorporate this tool into your project, follow the installation instructions below:
|
||||
|
||||
```shell
|
||||
pip install 'crewai[tools]' contextual-client
|
||||
```
|
||||
|
||||
**Note**: You'll need a Contextual AI API key. Sign up at [app.contextual.ai](https://app.contextual.ai) to get your free API key.
|
||||
|
||||
## Example
|
||||
|
||||
```python
|
||||
from crewai_tools import ContextualAIParseTool
|
||||
|
||||
tool = ContextualAIParseTool(api_key="your_api_key_here")
|
||||
|
||||
result = tool._run(
|
||||
file_path="/path/to/document.pdf",
|
||||
parse_mode="standard",
|
||||
page_range="0-5",
|
||||
output_types=["markdown-per-page"]
|
||||
)
|
||||
print(result)
|
||||
```
|
||||
|
||||
The result will show the parsed contents of your document. For example:
|
||||
```
|
||||
{
|
||||
"file_name": "attention_is_all_you_need.pdf",
|
||||
"status": "completed",
|
||||
"pages": [
|
||||
{
|
||||
"index": 0,
|
||||
"markdown": "Provided proper attribution ...
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"markdown": "## 1 Introduction ...
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
## Parameters
|
||||
- `api_key`: Your Contextual AI API key (passed when constructing the tool; the remaining parameters are passed when running it)
|
||||
- `file_path`: Path to document to parse
|
||||
- `parse_mode`: Parsing mode (default: "standard")
|
||||
- `figure_caption_mode`: Figure caption handling (default: "concise")
|
||||
- `enable_document_hierarchy`: Enable hierarchy detection (default: True)
|
||||
- `page_range`: Pages to parse, e.g. "0-5" (default: `None`, which parses all pages)
|
||||
- `output_types`: Output formats (default: ["markdown-per-page"])
|
||||
|
||||
## Key Features
|
||||
- **Advanced Document Understanding**: Handles complex PDF layouts, tables, and multi-column documents
|
||||
- **Figure and Table Extraction**: Intelligent extraction of figures, charts, and tabular data
|
||||
- **Page Range Selection**: Parse specific pages or entire documents
|
||||
|
||||
## Use Cases
|
||||
- Extract structured content from complex PDFs and research papers
|
||||
- Parse financial reports, legal documents, and technical manuals
|
||||
- Convert documents to markdown for further processing in RAG pipelines
|
||||
|
||||
For more detailed information about Contextual AI's capabilities, visit the [official documentation](https://docs.contextual.ai).
|
||||
@@ -0,0 +1,92 @@
|
||||
from typing import Any, Optional, Type, List
|
||||
from crewai.tools import BaseTool
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ContextualAIParseSchema(BaseModel):
    """Schema for contextual parse tool."""

    # The only required argument: location of the document on local disk.
    file_path: str = Field(
        ...,
        description="Path to the document to parse",
    )

    # Parser options forwarded to the parse request.
    parse_mode: str = Field(
        default="standard",
        description="Parsing mode",
    )
    figure_caption_mode: str = Field(
        default="concise",
        description="Figure caption mode",
    )
    enable_document_hierarchy: bool = Field(
        default=True,
        description="Enable document hierarchy",
    )

    # Optional page selection; None leaves the page range unspecified.
    page_range: Optional[str] = Field(
        default=None,
        description="Page range to parse (e.g., '0-5')",
    )

    # Output formats requested when the parse results are fetched.
    output_types: List[str] = Field(
        default=["markdown-per-page"],
        description="List of output types",
    )
|
||||
|
||||
|
||||
class ContextualAIParseTool(BaseTool):
    """Tool to parse documents using Contextual AI's parser."""

    # Tool metadata and the schema describing the arguments accepted by _run.
    name: str = "Contextual AI Document Parser"
    description: str = "Parse documents using Contextual AI's advanced document parser"
    args_schema: Type[BaseModel] = ContextualAIParseSchema

    # Sent as a Bearer token in the Authorization header of every request.
    api_key: str
    # NOTE(review): _run below talks to the REST API through `requests` and
    # never imports the contextual client package -- confirm this dependency
    # declaration is still intended.
    package_dependencies: List[str] = ["contextual-client"]

    def _run(
        self,
        file_path: str,
        parse_mode: str = "standard",
        figure_caption_mode: str = "concise",
        enable_document_hierarchy: bool = True,
        page_range: Optional[str] = None,
        output_types: Optional[List[str]] = None,
    ) -> str:
        """Parse a document using Contextual AI's parser.

        Uploads the file to the ``/parse`` endpoint, polls the job status
        every five seconds until it is ``"completed"`` (or ``"failed"``), and
        then fetches the job results.

        Args:
            file_path: Path to the document to upload.
            parse_mode: Value sent as the ``parse_mode`` form field.
            figure_caption_mode: Value sent as the ``figure_caption_mode``
                form field.
            enable_document_hierarchy: Value sent as the
                ``enable_document_hierarchy`` form field.
            page_range: Optional page range (e.g. ``"0-5"``); only sent when
                truthy.
            output_types: Output types requested from the results endpoint,
                joined with commas. ``None`` means ``["markdown-per-page"]``.

        Returns:
            The results payload re-serialised as JSON indented by two spaces,
            or a string starting with ``"Failed to parse document: "`` if any
            step raised (missing file, HTTP error, failed job, ...). This
            method never raises; errors are reported through the return value.
        """
        # Per-request timeout (seconds) so a stalled connection cannot hang
        # the tool indefinitely; `requests` has no timeout by default.
        request_timeout = 60
        # Delay (seconds) between job-status polls.
        poll_interval = 5

        # Resolve the default here rather than in the signature so the list
        # is not a single mutable object shared across calls.
        if output_types is None:
            output_types = ["markdown-per-page"]

        try:
            # Imported lazily inside the try block so that a missing
            # `requests` install is reported as a failure string too.
            import json
            import os
            from time import sleep

            import requests

            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Document not found: {file_path}")

            base_url = "https://api.contextual.ai/v1"
            headers = {
                "accept": "application/json",
                "authorization": f"Bearer {self.api_key}",
            }

            # Submit parse job
            config = {
                "parse_mode": parse_mode,
                "figure_caption_mode": figure_caption_mode,
                "enable_document_hierarchy": enable_document_hierarchy,
            }
            if page_range:
                config["page_range"] = page_range

            with open(file_path, "rb") as fp:
                submit_response = requests.post(
                    f"{base_url}/parse",
                    headers=headers,
                    data=config,
                    files={"raw_file": fp},
                    timeout=request_timeout,
                )
            # Surface HTTP errors (bad API key, rejected file, ...) directly
            # instead of as an opaque KeyError on the missing 'job_id'.
            submit_response.raise_for_status()
            job_id = submit_response.json()["job_id"]

            # Monitor job status
            # NOTE(review): there is no overall deadline on this loop; a job
            # that never reaches "completed" or "failed" is polled forever.
            status_url = f"{base_url}/parse/jobs/{job_id}/status"
            while True:
                status_response = requests.get(
                    status_url,
                    headers=headers,
                    timeout=request_timeout,
                )
                status_response.raise_for_status()
                job_status = status_response.json()["status"]

                if job_status == "completed":
                    break
                if job_status == "failed":
                    raise RuntimeError("Document parsing failed")

                sleep(poll_interval)

            # Get parse results
            results_response = requests.get(
                f"{base_url}/parse/jobs/{job_id}/results",
                headers=headers,
                params={"output_types": ",".join(output_types)},
                timeout=request_timeout,
            )
            results_response.raise_for_status()

            return json.dumps(results_response.json(), indent=2)

        except Exception as e:
            # Deliberate best-effort boundary: the failure is handed back as
            # text instead of propagating to the caller.
            return f"Failed to parse document: {str(e)}"
|
||||
Reference in New Issue
Block a user