mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
added tool for docling support
This commit is contained in:
82
src/crewai/knowledge/source/docling_source.py
Normal file
82
src/crewai/knowledge/source/docling_source.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Union, Iterator
|
||||||
|
from pydantic import Field
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||||
|
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
|
||||||
|
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from docling_core.types.doc.document import DoclingDocument
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
|
||||||
|
|
||||||
|
class DoclingSource(BaseFileKnowledgeSource):
    """Knowledge source that converts documents with Docling and chunks them.

    Each entry of ``file_paths`` is either an ``http://`` / ``https://`` URL or
    a path relative to ``KNOWLEDGE_DIRECTORY``. The entries are validated and
    converted to ``DoclingDocument`` objects when the model is initialised;
    ``add`` then splits every document into text chunks.

    The converter configured in ``model_post_init`` accepts Markdown, AsciiDoc,
    PDF, DOCX, HTML, image, XLSX and PPTX input.
    """

    # Raw user input: URLs or paths relative to KNOWLEDGE_DIRECTORY.
    file_paths: List[str] = Field(default_factory=list)
    # Replaced in model_post_init by a converter restricted to the formats above.
    document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
    # Validated entries: URLs stay strings, local files become Path objects.
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
    # Converted documents, or None when conversion failed or has not run.
    content: List[DoclingDocument] | None = Field(default=None)

    def model_post_init(self, _) -> None:
        """Validate the paths, configure the converter and load the documents."""
        self.safe_file_paths = self._process_file_paths()
        self.document_converter = DocumentConverter(
            allowed_formats=[
                InputFormat.MD,
                InputFormat.ASCIIDOC,
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.IMAGE,
                InputFormat.XLSX,
                InputFormat.PPTX,
            ]
        )
        self.content = self.load_content()

    def load_content(self) -> List[DoclingDocument] | None:
        """Convert all validated sources and store the result in ``content``.

        Returns:
            The converted documents, or ``None`` if conversion raised. The
            value must be returned because ``model_post_init`` assigns it to
            ``self.content``; returning nothing would reset ``content`` to
            ``None`` right after a successful conversion.
        """
        try:
            self.content = self.convert_source_to_docling_documents()
        except Exception as e:
            # Best effort: a failed conversion is logged, not propagated.
            self._logger.log("error", f"Error loading content: {e}")
            return None
        return self.content

    def add(self) -> None:
        """Chunk every loaded document, extend ``chunks`` and save them.

        Does nothing when no content was loaded.
        """
        if self.content is None:
            return
        for doc in self.content:
            self.chunks.extend(self._chunk_text(doc))
        self._save_documents()

    def convert_source_to_docling_documents(self) -> List[DoclingDocument]:
        """Run the converter over ``safe_file_paths`` and collect the documents."""
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]

    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:
        """Yield the text of each chunk ``HierarchicalChunker`` produces for ``doc``."""
        chunker = HierarchicalChunker()
        for chunk in chunker.chunk(doc):
            yield chunk.text

    def _process_file_paths(self) -> List[Path | str]:
        """Validate ``file_paths`` and resolve local entries.

        Returns:
            URLs unchanged (as strings) and local entries as ``Path`` objects
            under ``KNOWLEDGE_DIRECTORY``, in the original order.

        Raises:
            ValueError: If an ``http://`` / ``https://`` entry cannot be parsed
                or has no network location.
            FileNotFoundError: If a local entry does not exist.
        """
        processed_paths: List[Path | str] = []
        for path in self.file_paths:
            # Only a real URL scheme counts as remote; a local file whose name
            # merely starts with "http" must still be resolved on disk.
            if path.startswith(("http://", "https://")):
                try:
                    result = urlparse(path)
                except ValueError as e:
                    raise ValueError(f"Invalid URL: {path}. Error: {str(e)}") from e
                if not all([result.scheme, result.netloc]):  # Basic URL validation
                    raise ValueError(f"Invalid URL format: {path}")
                processed_paths.append(path)
            else:
                local_path = Path(KNOWLEDGE_DIRECTORY).joinpath(path)
                if not local_path.exists():
                    raise FileNotFoundError(f"File not found: {local_path}")
                processed_paths.append(local_path)
        return processed_paths
|
||||||
@@ -11,6 +11,7 @@ from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
|
|||||||
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
|
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
|
||||||
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
|
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
|
||||||
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
|
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
|
||||||
|
from crewai.knowledge.source.docling_source import DoclingSource
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -200,7 +201,7 @@ def test_single_short_file(mock_vector_db, tmpdir):
|
|||||||
f.write(content)
|
f.write(content)
|
||||||
|
|
||||||
file_source = TextFileKnowledgeSource(
|
file_source = TextFileKnowledgeSource(
|
||||||
file_path=file_path, metadata={"preference": "personal"}
|
file_paths=[file_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [file_source]
|
mock_vector_db.sources = [file_source]
|
||||||
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
||||||
@@ -242,7 +243,7 @@ def test_single_2k_character_file(mock_vector_db, tmpdir):
|
|||||||
f.write(content)
|
f.write(content)
|
||||||
|
|
||||||
file_source = TextFileKnowledgeSource(
|
file_source = TextFileKnowledgeSource(
|
||||||
file_path=file_path, metadata={"preference": "personal"}
|
file_paths=[file_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [file_source]
|
mock_vector_db.sources = [file_source]
|
||||||
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
||||||
@@ -279,7 +280,7 @@ def test_multiple_short_files(mock_vector_db, tmpdir):
|
|||||||
file_paths.append((file_path, item["metadata"]))
|
file_paths.append((file_path, item["metadata"]))
|
||||||
|
|
||||||
file_sources = [
|
file_sources = [
|
||||||
TextFileKnowledgeSource(file_path=path, metadata=metadata)
|
TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
|
||||||
for path, metadata in file_paths
|
for path, metadata in file_paths
|
||||||
]
|
]
|
||||||
mock_vector_db.sources = file_sources
|
mock_vector_db.sources = file_sources
|
||||||
@@ -352,7 +353,7 @@ def test_multiple_2k_character_files(mock_vector_db, tmpdir):
|
|||||||
file_paths.append(file_path)
|
file_paths.append(file_path)
|
||||||
|
|
||||||
file_sources = [
|
file_sources = [
|
||||||
TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
|
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
||||||
for path in file_paths
|
for path in file_paths
|
||||||
]
|
]
|
||||||
mock_vector_db.sources = file_sources
|
mock_vector_db.sources = file_sources
|
||||||
@@ -399,7 +400,7 @@ def test_hybrid_string_and_files(mock_vector_db, tmpdir):
|
|||||||
file_paths.append(file_path)
|
file_paths.append(file_path)
|
||||||
|
|
||||||
file_sources = [
|
file_sources = [
|
||||||
TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
|
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
||||||
for path in file_paths
|
for path in file_paths
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -424,7 +425,7 @@ def test_pdf_knowledge_source(mock_vector_db):
|
|||||||
|
|
||||||
# Create a PDFKnowledgeSource
|
# Create a PDFKnowledgeSource
|
||||||
pdf_source = PDFKnowledgeSource(
|
pdf_source = PDFKnowledgeSource(
|
||||||
file_path=pdf_path, metadata={"preference": "personal"}
|
file_paths=[pdf_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [pdf_source]
|
mock_vector_db.sources = [pdf_source]
|
||||||
mock_vector_db.query.return_value = [
|
mock_vector_db.query.return_value = [
|
||||||
@@ -461,7 +462,7 @@ def test_csv_knowledge_source(mock_vector_db, tmpdir):
|
|||||||
|
|
||||||
# Create a CSVKnowledgeSource
|
# Create a CSVKnowledgeSource
|
||||||
csv_source = CSVKnowledgeSource(
|
csv_source = CSVKnowledgeSource(
|
||||||
file_path=csv_path, metadata={"preference": "personal"}
|
file_paths=[csv_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [csv_source]
|
mock_vector_db.sources = [csv_source]
|
||||||
mock_vector_db.query.return_value = [
|
mock_vector_db.query.return_value = [
|
||||||
@@ -496,7 +497,7 @@ def test_json_knowledge_source(mock_vector_db, tmpdir):
|
|||||||
|
|
||||||
# Create a JSONKnowledgeSource
|
# Create a JSONKnowledgeSource
|
||||||
json_source = JSONKnowledgeSource(
|
json_source = JSONKnowledgeSource(
|
||||||
file_path=json_path, metadata={"preference": "personal"}
|
file_paths=[json_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [json_source]
|
mock_vector_db.sources = [json_source]
|
||||||
mock_vector_db.query.return_value = [
|
mock_vector_db.query.return_value = [
|
||||||
@@ -529,7 +530,7 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
|
|||||||
|
|
||||||
# Create an ExcelKnowledgeSource
|
# Create an ExcelKnowledgeSource
|
||||||
excel_source = ExcelKnowledgeSource(
|
excel_source = ExcelKnowledgeSource(
|
||||||
file_path=excel_path, metadata={"preference": "personal"}
|
file_paths=[excel_path], metadata={"preference": "personal"}
|
||||||
)
|
)
|
||||||
mock_vector_db.sources = [excel_source]
|
mock_vector_db.sources = [excel_source]
|
||||||
mock_vector_db.query.return_value = [
|
mock_vector_db.query.return_value = [
|
||||||
@@ -543,3 +544,23 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
|
|||||||
# Assert that the correct information is retrieved
|
# Assert that the correct information is retrieved
|
||||||
assert any("30" in result["context"] for result in results)
|
assert any("30" in result["context"] for result in results)
|
||||||
mock_vector_db.query.assert_called_once()
|
mock_vector_db.query.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
def test_docling_source(mock_vector_db):
    """A DoclingSource built from a URL is registered and the mocked store is queried once."""
    source_url = "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"
    canned_hit = {
        "context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
        "score": 0.9,
    }

    docling_source = DoclingSource(file_paths=[source_url])
    mock_vector_db.sources = [docling_source]
    mock_vector_db.query.return_value = [canned_hit]

    # Query the mocked store and check the canned context comes back.
    hits = mock_vector_db.query("What is reward hacking?")

    matching = [hit for hit in hits if "reward hacking" in hit["context"].lower()]
    assert matching
    mock_vector_db.query.assert_called_once()
|
||||||
|
|||||||
Reference in New Issue
Block a user