Compare commits

...

3 Commits

Author SHA1 Message Date
Brandon Hancock
31e907bd84 update docs 2025-01-16 12:20:56 -05:00
Brandon Hancock (bhancock_ai)
4fff01dc4d Merge branch 'main' into bugfix/fix-docling-references 2025-01-16 11:33:39 -05:00
Brandon Hancock
67772ca591 Fix docling issues 2025-01-16 11:29:26 -05:00
2 changed files with 12 additions and 5 deletions

View File

@@ -93,6 +93,12 @@ result = crew.kickoff(inputs={"question": "What city does John live in and how o
Here's another example using `CrewDoclingSource`. `CrewDoclingSource` is versatile and can handle multiple file formats, including TXT, PDF, DOCX, HTML, and more.
<Note>
You need to install `docling` for the following example to work: `uv add docling`
</Note>
```python Code
from crewai import LLM, Agent, Crew, Process, Task
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource

View File

@@ -8,6 +8,7 @@ try:
from docling.exceptions import ConversionError
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc.document import DoclingDocument
DOCLING_AVAILABLE = True
except ImportError:
DOCLING_AVAILABLE = False
@@ -38,8 +39,8 @@ class CrewDoclingSource(BaseKnowledgeSource):
file_paths: List[Union[Path, str]] = Field(default_factory=list)
chunks: List[str] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
content: List[DoclingDocument] = Field(default_factory=list)
document_converter: DocumentConverter = Field(
content: List["DoclingDocument"] = Field(default_factory=list)
document_converter: "DocumentConverter" = Field(
default_factory=lambda: DocumentConverter(
allowed_formats=[
InputFormat.MD,
@@ -65,7 +66,7 @@ class CrewDoclingSource(BaseKnowledgeSource):
self.safe_file_paths = self.validate_content()
self.content = self._load_content()
def _load_content(self) -> List[DoclingDocument]:
def _load_content(self) -> List["DoclingDocument"]:
try:
return self._convert_source_to_docling_documents()
except ConversionError as e:
@@ -87,11 +88,11 @@ class CrewDoclingSource(BaseKnowledgeSource):
self.chunks.extend(list(new_chunks_iterable))
self._save_documents()
def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
return [result.document for result in conv_results_iter]
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
yield chunk.text