A deprecation warning is logged, but 'file_path' remains backwards compatible (its value is copied into 'file_paths').
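The backwards compatibility works by copying the deprecated 'file_path' value into 'file_paths' inside model_post_init (first hunk below). A minimal, hypothetical sketch of the same pattern, assuming pydantic v2 is installed; ExampleSource and the warning text are illustrative stand-ins, not the project's class or logger:

# Hypothetical sketch of the backwards-compatible deprecation pattern used in the
# first hunk below. Assumes pydantic v2 (model_post_init); names are illustrative.
import warnings
from typing import List, Optional

from pydantic import BaseModel, Field


class ExampleSource(BaseModel):
    file_path: Optional[List[str]] = None  # deprecated spelling, still accepted
    file_paths: List[str] = Field(default_factory=list)  # preferred spelling

    def model_post_init(self, _) -> None:
        # If the deprecated field was set, warn and copy it into the new field
        # so downstream code only ever reads `file_paths`.
        if self.file_path:
            warnings.warn(
                "'file_path' is deprecated; use 'file_paths' instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            self.file_paths = self.file_path


src = ExampleSource(file_path=["report.pdf"])
assert src.file_paths == ["report.pdf"]  # old field still works, mapped to the new one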
@@ -19,11 +19,20 @@ class DoclingSource(BaseFileKnowledgeSource):
     file_paths: List[str] = Field(default_factory=list)
     document_converter: DocumentConverter = Field(default_factory=DocumentConverter)
-    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    content: List[DoclingDocument] | None = Field(default=None)
     chunks: List[str] = Field(default_factory=list)
+    # We are accepting string urls and validating them if they are valid urls
+    # Overiding content to be a list of DoclingDocuments
+    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)  # type: ignore[assignment]
+    content: List[DoclingDocument] | None = Field(default=None)  # type: ignore[assignment]
 
     def model_post_init(self, _) -> None:
+        if self.file_path:
+            self._logger.log(
+                "warning",
+                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
+                color="yellow",
+            )
+            self.file_paths = self.file_path  # type: ignore[assignment]
         self.safe_file_paths = self._process_file_paths()
         self.document_converter = DocumentConverter(
             allowed_formats=[
@@ -39,7 +48,7 @@ class DoclingSource(BaseFileKnowledgeSource):
         )
         self.content = self.load_content()
 
-    def load_content(self) -> List[DoclingDocument] | None:
+    def load_content(self) -> List[DoclingDocument] | None:  # type: ignore[assignment]
         try:
             return self.convert_source_to_docling_documents()
         except Exception as e:
@@ -58,28 +67,43 @@ class DoclingSource(BaseFileKnowledgeSource):
         conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
         return [result.document for result in conv_results_iter]
 
-    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:
+    def _chunk_text(self, doc: DoclingDocument) -> Iterator[str]:  # type: ignore[assignment]
         chunker = HierarchicalChunker()
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
-    def _process_file_paths(self) -> list[Path | str]:
+    def _process_file_paths(self) -> list[Path | str]:  # type: ignore[assignment]
         processed_paths = []
         for path in self.file_paths:
-            if path.startswith("http"):
-                try:
-                    result = urlparse(path)
-                    if all([result.scheme, result.netloc]):  # Basic URL validation
-                        processed_paths.append(path)
-                    else:
-                        raise ValueError(f"Invalid URL format: {path}")
-                except Exception as e:
-                    raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
-            else:
-                local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
-                if local_path.exists():
-                    processed_paths.append(local_path)
-                else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+            if isinstance(path, str):
+                if path.startswith(("http://", "https://")):
+                    try:
+                        if self._validate_url(path):
+                            processed_paths.append(path)
+                        else:
+                            raise ValueError(f"Invalid URL format: {path}")
+                    except Exception as e:
+                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
+                else:
+                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
+            else:
+                # this is an instance of Path
+                processed_paths.append(path)
         return processed_paths
+
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False
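For reference, a simplified standalone sketch of the validation flow added in the last hunk, using only the standard library. KNOWLEDGE_DIRECTORY and the function names are stand-ins for illustration, not the project's API, and the local path is built with the / operator rather than string concatenation:

# Simplified, standalone sketch of the new flow: strings are treated either as URLs
# (http(s) scheme plus a dotted host) or as files under a knowledge directory, while
# Path objects pass through untouched. KNOWLEDGE_DIRECTORY is an assumed stand-in.
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse

KNOWLEDGE_DIRECTORY = "knowledge"  # assumed location of local knowledge files


def validate_url(url: str) -> bool:
    # Mirrors the diff's check: http(s) scheme, non-empty host, host containing a dot.
    try:
        result = urlparse(url)
        return all(
            [
                result.scheme in ("http", "https"),
                result.netloc,
                len(result.netloc.split(".")) >= 2,
            ]
        )
    except Exception:
        return False


def process_file_paths(file_paths: List[Union[Path, str]]) -> List[Union[Path, str]]:
    processed: List[Union[Path, str]] = []
    for path in file_paths:
        if isinstance(path, str):
            if path.startswith(("http://", "https://")):
                if validate_url(path):
                    processed.append(path)
                else:
                    raise ValueError(f"Invalid URL format: {path}")
            else:
                local_path = Path(KNOWLEDGE_DIRECTORY) / path
                if not local_path.exists():
                    raise FileNotFoundError(f"File not found: {local_path}")
                processed.append(local_path)
        else:
            # Already a Path instance: accept as-is.
            processed.append(path)
    return processed


if __name__ == "__main__":
    print(validate_url("https://example.com/docs.pdf"))  # True
    print(validate_url("http://localhost/docs.pdf"))     # False: no dot in the host

Note that the dotted-host heuristic rejects hosts such as "localhost", which is the behavior implied by the "Ensure domain has TLD" comment in the diff.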