from pathlib import Path
from typing import List, Union, Iterator
from pydantic import Field
from urllib.parse import urlparse

from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import DoclingDocument
from docling.datamodel.base_models import InputFormat


class DoclingSource(BaseFileKnowledgeSource):
    """Knowledge source that converts documents with Docling and chunks them.

    Every entry of ``file_paths`` is either an ``http://`` / ``https://`` URL
    or a path relative to ``KNOWLEDGE_DIRECTORY``. The converter built in
    ``model_post_init`` is restricted to Markdown, AsciiDoc, PDF, DOCX, HTML,
    image, XLSX and PPTX inputs.
    """

    # Raw user input: URLs or paths relative to KNOWLEDGE_DIRECTORY.
    file_paths: List[str] = Field(default_factory=list)
    document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
    # Validated entries: URLs stay strings, local files become Path objects.
    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
    # Converted documents; None until loading succeeds.
    content: List[DoclingDocument] | None = Field(default=None)

    def model_post_init(self, _) -> None:
        """Validate the paths, build the converter and load the documents.

        Raises:
            ValueError: If a URL entry cannot be parsed or has no host.
            FileNotFoundError: If a local entry does not exist.
        """
        self.safe_file_paths = self._process_file_paths()
        # NOTE: this unconditionally replaces any converter passed by the caller.
        self.document_converter = DocumentConverter(
            allowed_formats=[
                InputFormat.MD,
                InputFormat.ASCIIDOC,
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.IMAGE,
                InputFormat.XLSX,
                InputFormat.PPTX,
            ]
        )
        # load_content() returns the documents, so this assignment no longer
        # wipes out what was just loaded.
        self.content = self.load_content()

    def load_content(self) -> List[DoclingDocument] | None:
        """Convert every validated source and store the result on ``content``.

        Returns:
            The converted documents, or ``None`` when conversion failed. A
            failure is logged and ``content`` is left as it was.
        """
        try:
            documents = self.convert_source_to_docling_documents()
        except Exception as e:
            # Best effort: a failing conversion must not break construction.
            self._logger.log("error", f"Error loading content: {e}")
            return None
        self.content = documents
        return documents

    def add(self) -> None:
        """Chunk every loaded document and save the chunks.

        Does nothing when no content has been loaded.
        """
        if self.content is None:
            return
        for doc in self.content:
            self.chunks.extend(self._chunk_text(doc))
        self._save_documents()

    def convert_source_to_docling_documents(self) -> List[DoclingDocument]:
        """Run the converter over ``safe_file_paths`` and collect the documents."""
        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
        return [result.document for result in conv_results_iter]

    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:
        """Yield the text of each chunk ``HierarchicalChunker`` produces for ``doc``."""
        chunker = HierarchicalChunker()
        for chunk in chunker.chunk(doc):
            yield chunk.text

    def _process_file_paths(self) -> List[Path | str]:
        """Validate ``file_paths`` and return the usable entries in order.

        Entries starting with ``http://`` or ``https://`` are kept as strings
        once they parse with a scheme and a host. Every other entry, including
        names that merely begin with ``http``, is resolved against
        ``KNOWLEDGE_DIRECTORY`` and must exist.

        Raises:
            ValueError: If a URL cannot be parsed or lacks a scheme/host.
            FileNotFoundError: If a local file does not exist.
        """
        processed_paths: List[Path | str] = []
        for path in self.file_paths:
            if path.startswith(("http://", "https://")):
                try:
                    parsed = urlparse(path)
                except ValueError as e:
                    raise ValueError(f"Invalid URL: {path}. Error: {e}") from e
                if not (parsed.scheme and parsed.netloc):  # Basic URL validation
                    raise ValueError(f"Invalid URL format: {path}")
                processed_paths.append(path)
                continue

            local_path = Path(KNOWLEDGE_DIRECTORY).joinpath(path)
            if not local_path.exists():
                raise FileNotFoundError(f"File not found: {local_path}")
            processed_paths.append(local_path)
        return processed_paths
test_multiple_2k_character_files(mock_vector_db, tmpdir): file_paths.append(file_path) file_sources = [ - TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"}) + TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"}) for path in file_paths ] mock_vector_db.sources = file_sources @@ -399,7 +400,7 @@ def test_hybrid_string_and_files(mock_vector_db, tmpdir): file_paths.append(file_path) file_sources = [ - TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"}) + TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"}) for path in file_paths ] @@ -424,7 +425,7 @@ def test_pdf_knowledge_source(mock_vector_db): # Create a PDFKnowledgeSource pdf_source = PDFKnowledgeSource( - file_path=pdf_path, metadata={"preference": "personal"} + file_paths=[pdf_path], metadata={"preference": "personal"} ) mock_vector_db.sources = [pdf_source] mock_vector_db.query.return_value = [ @@ -461,7 +462,7 @@ def test_csv_knowledge_source(mock_vector_db, tmpdir): # Create a CSVKnowledgeSource csv_source = CSVKnowledgeSource( - file_path=csv_path, metadata={"preference": "personal"} + file_paths=[csv_path], metadata={"preference": "personal"} ) mock_vector_db.sources = [csv_source] mock_vector_db.query.return_value = [ @@ -496,7 +497,7 @@ def test_json_knowledge_source(mock_vector_db, tmpdir): # Create a JSONKnowledgeSource json_source = JSONKnowledgeSource( - file_path=json_path, metadata={"preference": "personal"} + file_paths=[json_path], metadata={"preference": "personal"} ) mock_vector_db.sources = [json_source] mock_vector_db.query.return_value = [ @@ -529,7 +530,7 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir): # Create an ExcelKnowledgeSource excel_source = ExcelKnowledgeSource( - file_path=excel_path, metadata={"preference": "personal"} + file_paths=[excel_path], metadata={"preference": "personal"} ) mock_vector_db.sources = [excel_source] mock_vector_db.query.return_value = [ @@ 
def test_docling_source(mock_vector_db):
    """Register a URL-backed DoclingSource and query the mocked vector DB.

    The query result is the stubbed ``return_value`` set below, so the
    assertions check that stubbed payload and that ``query`` ran exactly once.
    """
    source = DoclingSource(
        file_paths=["https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"],
    )
    mock_vector_db.sources = [source]

    stubbed_hit = {
        "context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
        "score": 0.9,
    }
    mock_vector_db.query.return_value = [stubbed_hit]

    # Perform a query
    question = "What is reward hacking?"
    hits = mock_vector_db.query(question)

    mentions_topic = False
    for hit in hits:
        if "reward hacking" in hit["context"].lower():
            mentions_topic = True
            break
    assert mentions_topic
    mock_vector_db.query.assert_called_once()