Merge branch 'main' into feat/joao-flow-improvement-requests

This commit is contained in:
João Moura
2024-12-24 18:52:59 -03:00
committed by GitHub
10 changed files with 1257 additions and 48 deletions

View File

@@ -1,8 +1,8 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Union
from typing import Dict, List, Optional, Union
from pydantic import Field
from pydantic import Field, field_validator
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
@@ -14,17 +14,28 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
"""Base class for knowledge sources that load content from files."""
_logger: Logger = Logger(verbose=True)
file_path: Union[Path, List[Path], str, List[str]] = Field(
..., description="The path to the file"
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
default=None,
description="[Deprecated] The path to the file. Use file_paths instead.",
)
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
default_factory=list, description="The path to the file"
)
content: Dict[Path, str] = Field(init=False, default_factory=dict)
storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
safe_file_paths: List[Path] = Field(default_factory=list)
@field_validator("file_path", "file_paths", mode="before")
def validate_file_path(cls, v, values):
"""Validate that at least one of file_path or file_paths is provided."""
if v is None and ("file_path" not in values or values.get("file_path") is None):
raise ValueError("Either file_path or file_paths must be provided")
return v
def model_post_init(self, _):
"""Post-initialization method to load content."""
self.safe_file_paths = self._process_file_paths()
self.validate_paths()
self.validate_content()
self.content = self.load_content()
@abstractmethod
@@ -32,7 +43,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
pass
def validate_paths(self):
def validate_content(self):
"""Validate the paths."""
for path in self.safe_file_paths:
if not path.exists():
@@ -59,13 +70,30 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
def _process_file_paths(self) -> List[Path]:
"""Convert file_path to a list of Path objects."""
paths = (
[self.file_path]
if isinstance(self.file_path, (str, Path))
else self.file_path
if hasattr(self, "file_path") and self.file_path is not None:
self._logger.log(
"warning",
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
color="yellow",
)
self.file_paths = self.file_path
if self.file_paths is None:
raise ValueError("Your source must be provided with a file_paths: []")
# Convert single path to list
path_list: List[Union[Path, str]] = (
[self.file_paths]
if isinstance(self.file_paths, (str, Path))
else list(self.file_paths)
if isinstance(self.file_paths, list)
else []
)
if not isinstance(paths, list):
raise ValueError("file_path must be a Path, str, or a list of these types")
if not path_list:
raise ValueError(
"file_path/file_paths must be a Path, str, or a list of these types"
)
return [self.convert_to_path(path) for path in paths]
return [self.convert_to_path(path) for path in path_list]

View File

@@ -21,7 +21,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
collection_name: Optional[str] = Field(default=None)
@abstractmethod
def load_content(self) -> Dict[Any, str]:
def validate_content(self) -> Any:
"""Load and preprocess content from the source."""
pass

View File

@@ -0,0 +1,120 @@
from pathlib import Path
from typing import Iterator, List, Optional, Union
from urllib.parse import urlparse
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
from docling.exceptions import ConversionError
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc.document import DoclingDocument
from pydantic import Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.logger import Logger
class CrewDoclingSource(BaseKnowledgeSource):
"""Default Source class for converting documents to markdown or json
This will auto support PDF, DOCX, and TXT, XLSX, Images, and HTML files without any additional dependencies and follows the docling package as the source of truth.
"""
_logger: Logger = Logger(verbose=True)
file_path: Optional[List[Union[Path, str]]] = Field(default=None)
file_paths: List[Union[Path, str]] = Field(default_factory=list)
chunks: List[str] = Field(default_factory=list)
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
content: List[DoclingDocument] = Field(default_factory=list)
document_converter: DocumentConverter = Field(
default_factory=lambda: DocumentConverter(
allowed_formats=[
InputFormat.MD,
InputFormat.ASCIIDOC,
InputFormat.PDF,
InputFormat.DOCX,
InputFormat.HTML,
InputFormat.IMAGE,
InputFormat.XLSX,
InputFormat.PPTX,
]
)
)
def model_post_init(self, _) -> None:
if self.file_path:
self._logger.log(
"warning",
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
color="yellow",
)
self.file_paths = self.file_path
self.safe_file_paths = self.validate_content()
self.content = self._load_content()
def _load_content(self) -> List[DoclingDocument]:
try:
return self._convert_source_to_docling_documents()
except ConversionError as e:
self._logger.log(
"error",
f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
"red",
)
raise e
except Exception as e:
self._logger.log("error", f"Error loading content: {e}")
raise e
def add(self) -> None:
if self.content is None:
return
for doc in self.content:
new_chunks_iterable = self._chunk_doc(doc)
self.chunks.extend(list(new_chunks_iterable))
self._save_documents()
def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
return [result.document for result in conv_results_iter]
def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
yield chunk.text
def validate_content(self) -> List[Union[Path, str]]:
processed_paths: List[Union[Path, str]] = []
for path in self.file_paths:
if isinstance(path, str):
if path.startswith(("http://", "https://")):
try:
if self._validate_url(path):
processed_paths.append(path)
else:
raise ValueError(f"Invalid URL format: {path}")
except Exception as e:
raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
else:
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
if local_path.exists():
processed_paths.append(local_path)
else:
raise FileNotFoundError(f"File not found: {local_path}")
else:
# this is an instance of Path
processed_paths.append(path)
return processed_paths
def _validate_url(self, url: str) -> bool:
try:
result = urlparse(url)
return all(
[
result.scheme in ("http", "https"),
result.netloc,
len(result.netloc.split(".")) >= 2, # Ensure domain has TLD
]
)
except Exception:
return False

View File

@@ -13,9 +13,9 @@ class StringKnowledgeSource(BaseKnowledgeSource):
def model_post_init(self, _):
"""Post-initialization method to validate content."""
self.load_content()
self.validate_content()
def load_content(self):
def validate_content(self):
"""Validate string content."""
if not isinstance(self.content, str):
raise ValueError("StringKnowledgeSource only accepts string content")

View File

@@ -1,13 +1,25 @@
import datetime
import inspect
import json
import logging
import threading
import uuid
from concurrent.futures import Future
from copy import copy
from hashlib import md5
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from typing import (
Any,
Callable,
ClassVar,
Dict,
List,
Optional,
Set,
Tuple,
Type,
Union,
)
from opentelemetry.trace import Span
from pydantic import (
@@ -51,6 +63,7 @@ class Task(BaseModel):
"""
__hash__ = object.__hash__ # type: ignore
logger: ClassVar[logging.Logger] = logging.getLogger(__name__)
used_tools: int = 0
tools_errors: int = 0
delegations: int = 0
@@ -389,7 +402,18 @@ class Task(BaseModel):
if inputs:
self.description = self._original_description.format(**inputs)
self.expected_output = self._original_expected_output.format(**inputs)
self.expected_output = self.interpolate_only(
input_string=self._original_expected_output, inputs=inputs
)
def interpolate_only(self, input_string: str, inputs: Dict[str, Any]) -> str:
"""Interpolate placeholders (e.g., {key}) in a string while leaving JSON untouched."""
escaped_string = input_string.replace("{", "{{").replace("}", "}}")
for key in inputs.keys():
escaped_string = escaped_string.replace(f"{{{{{key}}}}}", f"{{{key}}}")
return escaped_string.format(**inputs)
def increment_tools_errors(self) -> None:
"""Increment the tools errors counter."""
@@ -471,22 +495,33 @@ class Task(BaseModel):
return OutputFormat.RAW
def _save_file(self, result: Any) -> None:
"""Save task output to a file.
Args:
result: The result to save to the file. Can be a dict or any stringifiable object.
Raises:
ValueError: If output_file is not set
RuntimeError: If there is an error writing to the file
"""
if self.output_file is None:
raise ValueError("output_file is not set.")
resolved_path = Path(self.output_file).expanduser().resolve()
directory = resolved_path.parent
try:
resolved_path = Path(self.output_file).expanduser().resolve()
directory = resolved_path.parent
if not directory.exists():
directory.mkdir(parents=True, exist_ok=True)
if not directory.exists():
directory.mkdir(parents=True, exist_ok=True)
with resolved_path.open("w", encoding="utf-8") as file:
if isinstance(result, dict):
import json
json.dump(result, file, ensure_ascii=False, indent=2)
else:
file.write(str(result))
with resolved_path.open("w", encoding="utf-8") as file:
if isinstance(result, dict):
import json
json.dump(result, file, ensure_ascii=False, indent=2)
else:
file.write(str(result))
except (OSError, IOError) as e:
raise RuntimeError(f"Failed to save output file: {e}")
return None
def __repr__(self):