Enable local file paths in DoclingSource and clean up type annotations

This commit is contained in:
Lorenze Jay
2024-12-16 20:21:47 -08:00
parent 356eb07d5f
commit 10c04d54a9
2 changed files with 12 additions and 4 deletions

View File

@@ -25,7 +25,7 @@ class DoclingSource(BaseKnowledgeSource):
file_paths: List[str] = Field(default_factory=list)
document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
chunks: List[str] = Field(default_factory=list)
safe_file_paths: List[str] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
content: List[DoclingDocument] = Field(default_factory=list)
def model_post_init(self, _) -> None:
@@ -75,8 +75,8 @@ class DoclingSource(BaseKnowledgeSource):
for chunk in chunker.chunk(doc):
yield chunk.text
def _process_file_paths(self) -> List[str]:
processed_paths = []
def _process_file_paths(self) -> List[Union[Path, str]]:
processed_paths: List[Union[Path, str]] = []
for path in self.file_paths:
if isinstance(path, str):
if path.startswith(("http://", "https://")):
@@ -90,7 +90,7 @@ class DoclingSource(BaseKnowledgeSource):
else:
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
if local_path.exists():
processed_paths.append(local_path.name)
processed_paths.append(local_path)
else:
raise FileNotFoundError(f"File not found: {local_path}")
else:

View File

@@ -575,3 +575,11 @@ def test_multiple_docling_sources():
assert docling_source.file_paths == urls
assert docling_source.content is not None
def test_docling_source_with_local_file():
    """Build a DoclingSource from a bare local file name and check its fields."""
    # Only the file name (not the full path next to this test module) is
    # handed to the source.
    file_name = (Path(__file__).parent / "crewai_quickstart.pdf").name

    source = DoclingSource(file_paths=[file_name])

    assert source.file_paths == [file_name]
    assert source.content is not None