refactor: Change storage field to optional and improve error handling when saving documents

fix: Change storage initialization to None for KnowledgeStorage
Feat/joao flow improvement requests (#1795 )
2026-04-15 23:42:37 +00:00 · 2024-12-26 22:27:19 -04:00 · 2024-12-26 21:30:06 -04:00 · 2024-12-24 18:55:44 -03:00 · 2024-12-23 13:54:16 -05:00 · 2024-12-23 13:19:58 -05:00
13 changed files with 1412 additions and 166 deletions
--- a/docs/concepts/knowledge.mdx
+++ b/docs/concepts/knowledge.mdx
@@ -79,6 +79,55 @@ crew = Crew(
 result = crew.kickoff(inputs={"question": "What city does John live in and how old is he?"})
 ```

+
+Here's another example, this time using the `CrewDoclingSource`:
+```python Code
+from crewai import LLM, Agent, Crew, Process, Task
+from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
+
+# Create a knowledge source
+content_source = CrewDoclingSource(
+    file_paths=[
+        "https://lilianweng.github.io/posts/2024-11-28-reward-hacking",
+        "https://lilianweng.github.io/posts/2024-07-07-hallucination",
+    ],
+)
+
+# Create an LLM with a temperature of 0 to ensure deterministic outputs
+llm = LLM(model="gpt-4o-mini", temperature=0)
+
+# Create an agent with the knowledge store
+agent = Agent(
+    role="About papers",
+    goal="You know everything about the papers.",
+    backstory="""You are a master at understanding papers and their content.""",
+    verbose=True,
+    allow_delegation=False,
+    llm=llm,
+)
+task = Task(
+    description="Answer the following questions about the papers: {question}",
+    expected_output="An answer to the question.",
+    agent=agent,
+)
+
+crew = Crew(
+    agents=[agent],
+    tasks=[task],
+    verbose=True,
+    process=Process.sequential,
+    knowledge_sources=[
+        content_source
+    ],  # Enable knowledge by adding the sources here. You can also add more sources to the sources list.
+)
+
+result = crew.kickoff(
+    inputs={
+        "question": "What is the reward hacking paper about? Be sure to provide sources."
+    }
+)
+```
+
 ## Knowledge Configuration

 ### Chunking Configuration
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,9 @@ openpyxl = [
    "openpyxl>=3.1.5",
 ]
 mem0 = ["mem0ai>=0.1.29"]
+docling = [
+    "docling>=2.12.0",
+]

 [tool.uv]
 dev-dependencies = [
--- a/src/crewai/flow/flow.py
+++ b/src/crewai/flow/flow.py
@@ -80,10 +80,27 @@ def listen(condition):
    return decorator


-def router(method):
+def router(condition):
    def decorator(func):
        func.__is_router__ = True
-        func.__router_for__ = method.__name__
+        # Handle conditions like listen/start
+        if isinstance(condition, str):
+            func.__trigger_methods__ = [condition]
+            func.__condition_type__ = "OR"
+        elif (
+            isinstance(condition, dict)
+            and "type" in condition
+            and "methods" in condition
+        ):
+            func.__trigger_methods__ = condition["methods"]
+            func.__condition_type__ = condition["type"]
+        elif callable(condition) and hasattr(condition, "__name__"):
+            func.__trigger_methods__ = [condition.__name__]
+            func.__condition_type__ = "OR"
+        else:
+            raise ValueError(
+                "Condition must be a method, string, or a result of or_() or and_()"
+            )
        return func

    return decorator
@@ -123,8 +140,8 @@ class FlowMeta(type):

        start_methods = []
        listeners = {}
-        routers = {}
        router_paths = {}
+        routers = set()

        for attr_name, attr_value in dct.items():
            if hasattr(attr_value, "__is_start_method__"):
@@ -137,18 +154,11 @@ class FlowMeta(type):
                methods = attr_value.__trigger_methods__
                condition_type = getattr(attr_value, "__condition_type__", "OR")
                listeners[attr_name] = (condition_type, methods)
-
-            elif hasattr(attr_value, "__is_router__"):
-                routers[attr_value.__router_for__] = attr_name
-                possible_returns = get_possible_return_constants(attr_value)
-                if possible_returns:
-                    router_paths[attr_name] = possible_returns
-
-                # Register router as a listener to its triggering method
-                trigger_method_name = attr_value.__router_for__
-                methods = [trigger_method_name]
-                condition_type = "OR"
-                listeners[attr_name] = (condition_type, methods)
+                if hasattr(attr_value, "__is_router__") and attr_value.__is_router__:
+                    routers.add(attr_name)
+                    possible_returns = get_possible_return_constants(attr_value)
+                    if possible_returns:
+                        router_paths[attr_name] = possible_returns

        setattr(cls, "_start_methods", start_methods)
        setattr(cls, "_listeners", listeners)
@@ -163,7 +173,7 @@ class Flow(Generic[T], metaclass=FlowMeta):

    _start_methods: List[str] = []
    _listeners: Dict[str, tuple[str, List[str]]] = {}
-    _routers: Dict[str, str] = {}
+    _routers: Set[str] = set()
    _router_paths: Dict[str, List[str]] = {}
    initial_state: Union[Type[T], T, None] = None
    event_emitter = Signal("event_emitter")
@@ -210,20 +220,10 @@ class Flow(Generic[T], metaclass=FlowMeta):
        return self._method_outputs

    def _initialize_state(self, inputs: Dict[str, Any]) -> None:
-        """
-        Initializes or updates the state with the provided inputs.
-
-        Args:
-            inputs: Dictionary of inputs to initialize or update the state.
-
-        Raises:
-            ValueError: If inputs do not match the structured state model.
-            TypeError: If state is neither a BaseModel instance nor a dictionary.
-        """
        if isinstance(self._state, BaseModel):
-            # Structured state management
+            # Structured state
            try:
-                # Define a function to create the dynamic class
+
                def create_model_with_extra_forbid(
                    base_model: Type[BaseModel],
                ) -> Type[BaseModel]:
@@ -233,34 +233,20 @@ class Flow(Generic[T], metaclass=FlowMeta):

                    return ModelWithExtraForbid

-                # Create the dynamic class
                ModelWithExtraForbid = create_model_with_extra_forbid(
                    self._state.__class__
                )
-
-                # Create a new instance using the combined state and inputs
                self._state = cast(
                    T, ModelWithExtraForbid(**{**self._state.model_dump(), **inputs})
                )
-
            except ValidationError as e:
                raise ValueError(f"Invalid inputs for structured state: {e}") from e
        elif isinstance(self._state, dict):
-            # Unstructured state management
            self._state.update(inputs)
        else:
            raise TypeError("State must be a BaseModel instance or a dictionary.")

    def kickoff(self, inputs: Optional[Dict[str, Any]] = None) -> Any:
-        """
-        Starts the execution of the flow synchronously.
-
-        Args:
-            inputs: Optional dictionary of inputs to initialize or update the state.
-
-        Returns:
-            The final output from the flow execution.
-        """
        self.event_emitter.send(
            self,
            event=FlowStartedEvent(
@@ -274,15 +260,6 @@ class Flow(Generic[T], metaclass=FlowMeta):
        return asyncio.run(self.kickoff_async())

    async def kickoff_async(self, inputs: Optional[Dict[str, Any]] = None) -> Any:
-        """
-        Starts the execution of the flow asynchronously.
-
-        Args:
-            inputs: Optional dictionary of inputs to initialize or update the state.
-
-        Returns:
-            The final output from the flow execution.
-        """
        if not self._start_methods:
            raise ValueError("No start method defined")

@@ -290,16 +267,12 @@ class Flow(Generic[T], metaclass=FlowMeta):
            self.__class__.__name__, list(self._methods.keys())
        )

-        # Create tasks for all start methods
        tasks = [
            self._execute_start_method(start_method)
            for start_method in self._start_methods
        ]
-
-        # Run all start methods concurrently
        await asyncio.gather(*tasks)

-        # Determine the final output (from the last executed method)
        final_output = self._method_outputs[-1] if self._method_outputs else None

        self.event_emitter.send(
@@ -310,7 +283,6 @@ class Flow(Generic[T], metaclass=FlowMeta):
                result=final_output,
            ),
        )
-
        return final_output

    async def _execute_start_method(self, start_method_name: str) -> None:
@@ -327,49 +299,68 @@ class Flow(Generic[T], metaclass=FlowMeta):
            if asyncio.iscoroutinefunction(method)
            else method(*args, **kwargs)
        )
-        self._method_outputs.append(result)  # Store the output
-
-        # Track method execution counts
+        self._method_outputs.append(result)
        self._method_execution_counts[method_name] = (
            self._method_execution_counts.get(method_name, 0) + 1
        )
-
        return result

    async def _execute_listeners(self, trigger_method: str, result: Any) -> None:
-        listener_tasks = []
-
-        if trigger_method in self._routers:
-            router_method = self._methods[self._routers[trigger_method]]
-            path = await self._execute_method(
-                self._routers[trigger_method], router_method
+        # First, handle routers repeatedly until no router triggers anymore
+        while True:
+            routers_triggered = self._find_triggered_methods(
+                trigger_method, router_only=True
            )
-            trigger_method = path
+            if not routers_triggered:
+                break
+            for router_name in routers_triggered:
+                await self._execute_single_listener(router_name, result)
+                # After executing router, the router's result is the path
+                # The last router executed sets the trigger_method
+                # The router result is the last element in self._method_outputs
+                trigger_method = self._method_outputs[-1]

+        # Now that no more routers are triggered by current trigger_method,
+        # execute normal listeners
+        listeners_triggered = self._find_triggered_methods(
+            trigger_method, router_only=False
+        )
+        if listeners_triggered:
+            tasks = [
+                self._execute_single_listener(listener_name, result)
+                for listener_name in listeners_triggered
+            ]
+            await asyncio.gather(*tasks)
+
+    def _find_triggered_methods(
+        self, trigger_method: str, router_only: bool
+    ) -> List[str]:
+        triggered = []
        for listener_name, (condition_type, methods) in self._listeners.items():
+            is_router = listener_name in self._routers
+
+            if router_only != is_router:
+                continue
+
            if condition_type == "OR":
+                # If trigger_method is one of this listener's triggers, mark the listener as triggered
                if trigger_method in methods:
-                    # Schedule the listener without preventing re-execution
-                    listener_tasks.append(
-                        self._execute_single_listener(listener_name, result)
-                    )
+                    triggered.append(listener_name)
            elif condition_type == "AND":
                # Initialize pending methods for this listener if not already done
                if listener_name not in self._pending_and_listeners:
                    self._pending_and_listeners[listener_name] = set(methods)
                # Remove the trigger method from pending methods
-                self._pending_and_listeners[listener_name].discard(trigger_method)
+                if trigger_method in self._pending_and_listeners[listener_name]:
+                    self._pending_and_listeners[listener_name].discard(trigger_method)
+
                if not self._pending_and_listeners[listener_name]:
                    # All required methods have been executed
-                    listener_tasks.append(
-                        self._execute_single_listener(listener_name, result)
-                    )
+                    triggered.append(listener_name)
                    # Reset pending methods for this listener
                    self._pending_and_listeners.pop(listener_name, None)

-        # Run all listener tasks concurrently and wait for them to complete
-        if listener_tasks:
-            await asyncio.gather(*listener_tasks)
+        return triggered

    async def _execute_single_listener(self, listener_name: str, result: Any) -> None:
        try:
@@ -386,17 +377,13 @@ class Flow(Generic[T], metaclass=FlowMeta):

            sig = inspect.signature(method)
            params = list(sig.parameters.values())
-
-            # Exclude 'self' parameter
            method_params = [p for p in params if p.name != "self"]

            if method_params:
-                # If listener expects parameters, pass the result
                listener_result = await self._execute_method(
                    listener_name, method, result
                )
            else:
-                # If listener does not expect parameters, call without arguments
                listener_result = await self._execute_method(listener_name, method)

            self.event_emitter.send(
@@ -408,8 +395,9 @@ class Flow(Generic[T], metaclass=FlowMeta):
                ),
            )

-            # Execute listeners of this listener
+            # Execute listeners (and possibly routers) of this listener
            await self._execute_listeners(listener_name, listener_result)
+
        except Exception as e:
            print(
                f"[Flow._execute_single_listener] Error in method {listener_name}: {e}"
@@ -422,5 +410,4 @@ class Flow(Generic[T], metaclass=FlowMeta):
        self._telemetry.flow_plotting_span(
            self.__class__.__name__, list(self._methods.keys())
        )
-
        plot_flow(self, filename)
--- a/src/crewai/flow/utils.py
+++ b/src/crewai/flow/utils.py
@@ -31,16 +31,50 @@ def get_possible_return_constants(function):
        print(f"Source code:\n{source}")
        return None

-    return_values = []
+    return_values = set()
+    dict_definitions = {}
+
+    class DictionaryAssignmentVisitor(ast.NodeVisitor):
+        def visit_Assign(self, node):
+            # Check if this assignment is assigning a dictionary literal to a variable
+            if isinstance(node.value, ast.Dict) and len(node.targets) == 1:
+                target = node.targets[0]
+                if isinstance(target, ast.Name):
+                    var_name = target.id
+                    dict_values = []
+                    # Extract string values from the dictionary
+                    for val in node.value.values:
+                        if isinstance(val, ast.Constant) and isinstance(val.value, str):
+                            dict_values.append(val.value)
+                        # Non-string values are ignored
+                    if dict_values:
+                        dict_definitions[var_name] = dict_values
+            self.generic_visit(node)

    class ReturnVisitor(ast.NodeVisitor):
        def visit_Return(self, node):
-            # Check if the return value is a constant (Python 3.8+)
-            if isinstance(node.value, ast.Constant):
-                return_values.append(node.value.value)
+            # Direct string return
+            if isinstance(node.value, ast.Constant) and isinstance(
+                node.value.value, str
+            ):
+                return_values.add(node.value.value)
+            # Dictionary-based return, like return paths[result]
+            elif isinstance(node.value, ast.Subscript):
+                # Check if we're subscripting a known dictionary variable
+                if isinstance(node.value.value, ast.Name):
+                    var_name = node.value.value.id
+                    if var_name in dict_definitions:
+                        # Add all possible dictionary values
+                        for v in dict_definitions[var_name]:
+                            return_values.add(v)
+            self.generic_visit(node)

+    # First pass: identify dictionary assignments
+    DictionaryAssignmentVisitor().visit(code_ast)
+    # Second pass: identify returns
    ReturnVisitor().visit(code_ast)
-    return return_values
+
+    return list(return_values) if return_values else None


 def calculate_node_levels(flow):
@@ -61,10 +95,7 @@ def calculate_node_levels(flow):
        current_level = levels[current]
        visited.add(current)

-        for listener_name, (
-            condition_type,
-            trigger_methods,
-        ) in flow._listeners.items():
+        for listener_name, (condition_type, trigger_methods) in flow._listeners.items():
            if condition_type == "OR":
                if current in trigger_methods:
                    if (
@@ -89,7 +120,7 @@ def calculate_node_levels(flow):
                            queue.append(listener_name)

        # Handle router connections
-        if current in flow._routers.values():
+        if current in flow._routers:
            router_method_name = current
            paths = flow._router_paths.get(router_method_name, [])
            for path in paths:
@@ -105,6 +136,7 @@ def calculate_node_levels(flow):
                            levels[listener_name] = current_level + 1
                            if listener_name not in visited:
                                queue.append(listener_name)
+
    return levels


@@ -142,7 +174,7 @@ def dfs_ancestors(node, ancestors, visited, flow):
            dfs_ancestors(listener_name, ancestors, visited, flow)

    # Handle router methods separately
-    if node in flow._routers.values():
+    if node in flow._routers:
        router_method_name = node
        paths = flow._router_paths.get(router_method_name, [])
        for path in paths:
--- a/src/crewai/flow/visualization_utils.py
+++ b/src/crewai/flow/visualization_utils.py
@@ -94,12 +94,14 @@ def add_edges(net, flow, node_positions, colors):
    ancestors = build_ancestor_dict(flow)
    parent_children = build_parent_children_dict(flow)

+    # Edges for normal listeners
    for method_name in flow._listeners:
        condition_type, trigger_methods = flow._listeners[method_name]
        is_and_condition = condition_type == "AND"

        for trigger in trigger_methods:
-            if trigger in flow._methods or trigger in flow._routers.values():
+            # Check if nodes exist before adding edges
+            if trigger in node_positions and method_name in node_positions:
                is_router_edge = any(
                    trigger in paths for paths in flow._router_paths.values()
                )
@@ -135,7 +137,22 @@ def add_edges(net, flow, node_positions, colors):
                }

                net.add_edge(trigger, method_name, **edge_style)
+            else:
+                # Nodes not found in node_positions. Check if it's a known router outcome and a known method.
+                is_router_edge = any(
+                    trigger in paths for paths in flow._router_paths.values()
+                )
+                # Check if method_name is a known method
+                method_known = method_name in flow._methods

+                # If it's a known router edge and the method is known, don't warn.
+                # This means the path is legitimate, just not reflected as nodes here.
+                if not (is_router_edge and method_known):
+                    print(
+                        f"Warning: No node found for '{trigger}' or '{method_name}'. Skipping edge."
+                    )
+
+    # Edges for router return paths
    for router_method_name, paths in flow._router_paths.items():
        for path in paths:
            for listener_name, (
@@ -143,36 +160,49 @@ def add_edges(net, flow, node_positions, colors):
                trigger_methods,
            ) in flow._listeners.items():
                if path in trigger_methods:
-                    is_cycle_edge = is_ancestor(trigger, method_name, ancestors)
-                    parent_has_multiple_children = (
-                        len(parent_children.get(router_method_name, [])) > 1
-                    )
-                    needs_curvature = is_cycle_edge or parent_has_multiple_children
+                    if (
+                        router_method_name in node_positions
+                        and listener_name in node_positions
+                    ):
+                        is_cycle_edge = is_ancestor(
+                            router_method_name, listener_name, ancestors
+                        )
+                        parent_has_multiple_children = (
+                            len(parent_children.get(router_method_name, [])) > 1
+                        )
+                        needs_curvature = is_cycle_edge or parent_has_multiple_children

-                    if needs_curvature:
-                        source_pos = node_positions.get(router_method_name)
-                        target_pos = node_positions.get(listener_name)
+                        if needs_curvature:
+                            source_pos = node_positions.get(router_method_name)
+                            target_pos = node_positions.get(listener_name)

-                        if source_pos and target_pos:
-                            dx = target_pos[0] - source_pos[0]
-                            smooth_type = "curvedCCW" if dx <= 0 else "curvedCW"
-                            index = get_child_index(
-                                router_method_name, listener_name, parent_children
-                            )
-                            edge_smooth = {
-                                "type": smooth_type,
-                                "roundness": 0.2 + (0.1 * index),
-                            }
+                            if source_pos and target_pos:
+                                dx = target_pos[0] - source_pos[0]
+                                smooth_type = "curvedCCW" if dx <= 0 else "curvedCW"
+                                index = get_child_index(
+                                    router_method_name, listener_name, parent_children
+                                )
+                                edge_smooth = {
+                                    "type": smooth_type,
+                                    "roundness": 0.2 + (0.1 * index),
+                                }
+                            else:
+                                edge_smooth = {"type": "cubicBezier"}
                        else:
-                            edge_smooth = {"type": "cubicBezier"}
-                    else:
-                        edge_smooth = False
+                            edge_smooth = False

-                    edge_style = {
-                        "color": colors["router_edge"],
-                        "width": 2,
-                        "arrows": "to",
-                        "dashes": True,
-                        "smooth": edge_smooth,
-                    }
-                    net.add_edge(router_method_name, listener_name, **edge_style)
+                        edge_style = {
+                            "color": colors["router_edge"],
+                            "width": 2,
+                            "arrows": "to",
+                            "dashes": True,
+                            "smooth": edge_smooth,
+                        }
+                        net.add_edge(router_method_name, listener_name, **edge_style)
+                    else:
+                        # Same check here: known router edge and known method?
+                        method_known = listener_name in flow._methods
+                        if not method_known:
+                            print(
+                                f"Warning: No node found for '{router_method_name}' or '{listener_name}'. Skipping edge."
+                            )
--- a/src/crewai/knowledge/knowledge.py
+++ b/src/crewai/knowledge/knowledge.py
@@ -14,13 +14,13 @@ class Knowledge(BaseModel):
    Knowledge is a collection of sources and setup for the vector store to save and query relevant context.
    Args:
        sources: List[BaseKnowledgeSource] = Field(default_factory=list)
-        storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+        storage: Optional[KnowledgeStorage] = Field(default=None)
        embedder_config: Optional[Dict[str, Any]] = None
    """

    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
    model_config = ConfigDict(arbitrary_types_allowed=True)
-    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+    storage: Optional[KnowledgeStorage] = Field(default=None)
    embedder_config: Optional[Dict[str, Any]] = None
    collection_name: Optional[str] = None

--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import Dict, List, Optional, Union

-from pydantic import Field
+from pydantic import Field, field_validator

 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
@@ -14,17 +14,28 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    """Base class for knowledge sources that load content from files."""

    _logger: Logger = Logger(verbose=True)
-    file_path: Union[Path, List[Path], str, List[str]] = Field(
-        ..., description="The path to the file"
+    file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+        default=None,
+        description="[Deprecated] The path to the file. Use file_paths instead.",
+    )
+    file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
+        default_factory=list, description="The path to the file"
    )
    content: Dict[Path, str] = Field(init=False, default_factory=dict)
-    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+    storage: Optional[KnowledgeStorage] = Field(default=None)
    safe_file_paths: List[Path] = Field(default_factory=list)

+    @field_validator("file_path", "file_paths", mode="before")
+    def validate_file_path(cls, v, values):
+        """Validate that at least one of file_path or file_paths is provided."""
+        if v is None and ("file_path" not in values or values.get("file_path") is None):
+            raise ValueError("Either file_path or file_paths must be provided")
+        return v
+
    def model_post_init(self, _):
        """Post-initialization method to load content."""
        self.safe_file_paths = self._process_file_paths()
-        self.validate_paths()
+        self.validate_content()
        self.content = self.load_content()

    @abstractmethod
@@ -32,7 +43,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
        pass

-    def validate_paths(self):
+    def validate_content(self):
        """Validate the paths."""
        for path in self.safe_file_paths:
            if not path.exists():
@@ -51,7 +62,10 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):

    def _save_documents(self):
        """Save the documents to the storage."""
-        self.storage.save(self.chunks)
+        if self.storage:
+            self.storage.save(self.chunks)
+        else:
+            raise ValueError("No storage found to save documents.")

    def convert_to_path(self, path: Union[Path, str]) -> Path:
        """Convert a path to a Path object."""
@@ -59,13 +73,30 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):

    def _process_file_paths(self) -> List[Path]:
        """Convert file_paths (or the deprecated file_path) to a list of Path objects."""
-        paths = (
-            [self.file_path]
-            if isinstance(self.file_path, (str, Path))
-            else self.file_path
+
+        if hasattr(self, "file_path") and self.file_path is not None:
+            self._logger.log(
+                "warning",
+                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
+                color="yellow",
+            )
+            self.file_paths = self.file_path
+
+        if self.file_paths is None:
+            raise ValueError("Your source must be provided with a file_paths: []")
+
+        # Convert single path to list
+        path_list: List[Union[Path, str]] = (
+            [self.file_paths]
+            if isinstance(self.file_paths, (str, Path))
+            else list(self.file_paths)
+            if isinstance(self.file_paths, list)
+            else []
        )

-        if not isinstance(paths, list):
-            raise ValueError("file_path must be a Path, str, or a list of these types")
+        if not path_list:
+            raise ValueError(
+                "file_path/file_paths must be a Path, str, or a list of these types"
+            )

-        return [self.convert_to_path(path) for path in paths]
+        return [self.convert_to_path(path) for path in path_list]
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -16,12 +16,12 @@ class BaseKnowledgeSource(BaseModel, ABC):
    chunk_embeddings: List[np.ndarray] = Field(default_factory=list)

    model_config = ConfigDict(arbitrary_types_allowed=True)
-    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+    storage: Optional[KnowledgeStorage] = Field(default=None)
    metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
    collection_name: Optional[str] = Field(default=None)

    @abstractmethod
-    def load_content(self) -> Dict[Any, str]:
+    def validate_content(self) -> Any:
        """Validate the content of the source. Must be implemented by subclasses."""
        pass

@@ -46,4 +46,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
        Save the documents to the storage.
        This method should be called after the chunks and embeddings are generated.
        """
-        self.storage.save(self.chunks)
+        if self.storage:
+            self.storage.save(self.chunks)
+        else:
+            raise ValueError("No storage found to save documents.")
--- a/src/crewai/knowledge/source/crew_docling_source.py
+++ b/src/crewai/knowledge/source/crew_docling_source.py
@@ -0,0 +1,120 @@
+from pathlib import Path
+from typing import Iterator, List, Optional, Union
+from urllib.parse import urlparse
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+from docling.exceptions import ConversionError
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
+from docling_core.types.doc.document import DoclingDocument
+from pydantic import Field
+
+from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
+from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
+from crewai.utilities.logger import Logger
+
+
+class CrewDoclingSource(BaseKnowledgeSource):
+    """Knowledge source that converts local files and URLs into docling documents.
+    Supports the formats enabled on the default converter below (Markdown, AsciiDoc, PDF, DOCX, HTML, images, XLSX and PPTX); the docling package is the source of truth for format support.
+    """
+
+    _logger: Logger = Logger(verbose=True)
+
+    # Deprecated alias for file_paths; model_post_init copies it over with a warning.
+    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
+    file_paths: List[Union[Path, str]] = Field(default_factory=list)
+    chunks: List[str] = Field(default_factory=list)
+    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
+    content: List[DoclingDocument] = Field(default_factory=list)
+    document_converter: DocumentConverter = Field(
+        default_factory=lambda: DocumentConverter(
+            allowed_formats=[
+                InputFormat.MD,
+                InputFormat.ASCIIDOC,
+                InputFormat.PDF,
+                InputFormat.DOCX,
+                InputFormat.HTML,
+                InputFormat.IMAGE,
+                InputFormat.XLSX,
+                InputFormat.PPTX,
+            ]
+        )
+    )
+
+    def model_post_init(self, _) -> None:
+        """Apply the deprecated file_path alias, then validate the paths and load their content."""
+        if self.file_path:
+            self._logger.log(
+                "warning",
+                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
+                color="yellow",
+            )
+            self.file_paths = self.file_path
+        self.safe_file_paths = self.validate_content()
+        self.content = self._load_content()
+
+    def _load_content(self) -> List[DoclingDocument]:
+        """Convert the validated paths, logging any failure before re-raising it unchanged."""
+        try:
+            return self._convert_source_to_docling_documents()
+        except ConversionError as e:
+            self._logger.log(
+                "error",
+                f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
+                "red",
+            )
+            raise
+        except Exception as e:
+            self._logger.log("error", f"Error loading content: {e}")
+            raise
+
+    def add(self) -> None:
+        """Chunk every loaded document, collect the chunk texts, then call _save_documents()."""
+        if self.content is None:
+            return
+        for doc in self.content:
+            self.chunks.extend(self._chunk_doc(doc))
+        self._save_documents()
+
+    def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
+        """Run the converter over safe_file_paths and collect each result's document."""
+        return [result.document for result in self.document_converter.convert_all(self.safe_file_paths)]
+
+    def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
+        """Yield the text of each chunk a HierarchicalChunker produces for doc."""
+        for chunk in HierarchicalChunker().chunk(doc):
+            yield chunk.text
+
+    def validate_content(self) -> List[Union[Path, str]]:
+        """Check every entry of file_paths and return the list handed to the converter.
+
+        URLs stay strings, other strings are resolved under KNOWLEDGE_DIRECTORY and must exist, Path objects pass through unchecked. Raises ValueError for a malformed URL and FileNotFoundError for a missing local file.
+        """
+        processed_paths: List[Union[Path, str]] = []
+        for path in self.file_paths:
+            if not isinstance(path, str):
+                # this is an instance of Path
+                processed_paths.append(path)
+            elif path.startswith(("http://", "https://")):
+                if not self._validate_url(path):
+                    raise ValueError(f"Invalid URL: {path}. Error: Invalid URL format: {path}")
+                processed_paths.append(path)
+            else:
+                local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
+                if not local_path.exists():
+                    raise FileNotFoundError(f"File not found: {local_path}")
+                processed_paths.append(local_path)
+        return processed_paths
+
+    def _validate_url(self, url: str) -> bool:
+        """Return True for an http(s) URL whose network location contains a dot (so a bare "localhost" is rejected)."""
+        try:
+            result = urlparse(url)
+        except Exception:
+            return False
+        return (
+            result.scheme in ("http", "https")
+            and bool(result.netloc)
+            and len(result.netloc.split(".")) >= 2  # Ensure domain has TLD
+        )
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -13,9 +13,9 @@ class StringKnowledgeSource(BaseKnowledgeSource):

    def model_post_init(self, _):
        """Post-initialization method to validate content."""
-        self.load_content()
+        self.validate_content()

-    def load_content(self):
+    def validate_content(self):
        """Validate string content."""
        if not isinstance(self.content, str):
            raise ValueError("StringKnowledgeSource only accepts string content")
--- a/tests/flow_test.py
+++ b/tests/flow_test.py
@@ -263,3 +263,62 @@ def test_flow_with_custom_state():
    flow = StateFlow()
    flow.kickoff()
    assert flow.counter == 2
+
+
+def test_router_with_multiple_conditions():
+    """Exercise an OR router fed by two start steps, followed by an AND router
+    that may only fire once both of its upstream listeners have completed.
+    """
+
+    trace = []
+
+    class ComplexRouterFlow(Flow):
+        @start()
+        def step_a(self):
+            trace.append("step_a")
+
+        @start()
+        def step_b(self):
+            trace.append("step_b")
+
+        @router(or_("step_a", "step_b"))
+        def router_or(self):
+            trace.append("router_or")
+            return "next_step_or"
+
+        @listen("next_step_or")
+        def handle_next_step_or_event(self):
+            trace.append("handle_next_step_or_event")
+
+        @listen(handle_next_step_or_event)
+        def branch_2_step(self):
+            trace.append("branch_2_step")
+
+        @router(and_(handle_next_step_or_event, branch_2_step))
+        def router_and(self):
+            trace.append("router_and")
+            return "final_step"
+
+        @listen("final_step")
+        def log_final_step(self):
+            trace.append("log_final_step")
+
+    flow = ComplexRouterFlow()
+    flow.kickoff()
+
+    # Every step, listener and router must have been recorded.
+    for name in (
+        "step_a", "step_b", "router_or",
+        "handle_next_step_or_event", "branch_2_step",
+        "router_and", "log_final_step",
+    ):
+        assert name in trace
+
+    first_index = trace.index
+
+    # Check that the AND router triggered after both relevant steps:
+    assert first_index("router_and") > first_index("handle_next_step_or_event")
+    assert first_index("router_and") > first_index("branch_2_step")
+
+    # final_step should run after router_and
+    assert first_index("log_final_step") > first_index("router_and")
--- a/tests/knowledge/knowledge_test.py
+++ b/tests/knowledge/knowledge_test.py
@@ -1,10 +1,12 @@
 """Test Knowledge creation and querying functionality."""

 from pathlib import Path
+from typing import List, Union
 from unittest.mock import patch

 import pytest

+from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
 from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
 from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
 from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
@@ -200,7 +202,7 @@ def test_single_short_file(mock_vector_db, tmpdir):
        f.write(content)

    file_source = TextFileKnowledgeSource(
-        file_path=file_path, metadata={"preference": "personal"}
+        file_paths=[file_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [file_source]
    mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
@@ -242,7 +244,7 @@ def test_single_2k_character_file(mock_vector_db, tmpdir):
        f.write(content)

    file_source = TextFileKnowledgeSource(
-        file_path=file_path, metadata={"preference": "personal"}
+        file_paths=[file_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [file_source]
    mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
@@ -279,7 +281,7 @@ def test_multiple_short_files(mock_vector_db, tmpdir):
        file_paths.append((file_path, item["metadata"]))

    file_sources = [
-        TextFileKnowledgeSource(file_path=path, metadata=metadata)
+        TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
        for path, metadata in file_paths
    ]
    mock_vector_db.sources = file_sources
@@ -352,7 +354,7 @@ def test_multiple_2k_character_files(mock_vector_db, tmpdir):
        file_paths.append(file_path)

    file_sources = [
-        TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
+        TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
        for path in file_paths
    ]
    mock_vector_db.sources = file_sources
@@ -399,7 +401,7 @@ def test_hybrid_string_and_files(mock_vector_db, tmpdir):
        file_paths.append(file_path)

    file_sources = [
-        TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
+        TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
        for path in file_paths
    ]

@@ -424,7 +426,7 @@ def test_pdf_knowledge_source(mock_vector_db):

    # Create a PDFKnowledgeSource
    pdf_source = PDFKnowledgeSource(
-        file_path=pdf_path, metadata={"preference": "personal"}
+        file_paths=[pdf_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [pdf_source]
    mock_vector_db.query.return_value = [
@@ -461,7 +463,7 @@ def test_csv_knowledge_source(mock_vector_db, tmpdir):

    # Create a CSVKnowledgeSource
    csv_source = CSVKnowledgeSource(
-        file_path=csv_path, metadata={"preference": "personal"}
+        file_paths=[csv_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [csv_source]
    mock_vector_db.query.return_value = [
@@ -496,7 +498,7 @@ def test_json_knowledge_source(mock_vector_db, tmpdir):

    # Create a JSONKnowledgeSource
    json_source = JSONKnowledgeSource(
-        file_path=json_path, metadata={"preference": "personal"}
+        file_paths=[json_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [json_source]
    mock_vector_db.query.return_value = [
@@ -529,7 +531,7 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):

    # Create an ExcelKnowledgeSource
    excel_source = ExcelKnowledgeSource(
-        file_path=excel_path, metadata={"preference": "personal"}
+        file_paths=[excel_path], metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [excel_source]
    mock_vector_db.query.return_value = [
@@ -543,3 +545,42 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
    # Assert that the correct information is retrieved
    assert any("30" in result["context"] for result in results)
    mock_vector_db.query.assert_called_once()
+
+
+def test_docling_source(mock_vector_db):
+    """Build a URL-backed CrewDoclingSource and query the mocked vector DB."""
+    source = CrewDoclingSource(
+        file_paths=["https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"],
+    )
+    mock_vector_db.sources = [source]
+    canned_hit = {
+        "context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
+        "score": 0.9,
+    }
+    mock_vector_db.query.return_value = [canned_hit]
+
+    # Perform a query; the mock hands back the canned hit configured above.
+    results = mock_vector_db.query("What is reward hacking?")
+
+    contexts = [hit["context"].lower() for hit in results]
+    assert any("reward hacking" in text for text in contexts)
+    mock_vector_db.query.assert_called_once()
+
+
+def test_multiple_docling_sources():
+    """A source built from several URLs keeps them all and ends up with content."""
+    reward_hacking_url = "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"
+    hallucination_url = "https://lilianweng.github.io/posts/2024-07-07-hallucination/"
+    urls: List[Union[Path, str]] = [reward_hacking_url, hallucination_url]
+    source = CrewDoclingSource(file_paths=urls)
+
+    assert source.file_paths == urls
+    assert source.content is not None
+
+
+def test_docling_source_with_local_file():
+    """A Path to the PDF stored next to this test module is accepted and yields content."""
+    local_pdf = Path(__file__).parent / "crewai_quickstart.pdf"
+    source = CrewDoclingSource(file_paths=[local_pdf])
+    assert source.file_paths == [local_pdf]
+    assert source.content is not None
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
ericklima-ca	27472ba69e	refactor: Change storage field to optional and improve error handling when saving documents	2024-12-26 22:27:19 -04:00
ericklima-ca	25aa774d8c	fix: Change storage initialization to None for KnowledgeStorage	2024-12-26 21:30:06 -04:00
Brandon Hancock (bhancock_ai)	6cc2f510bf	Feat/joao flow improvement requests (#1795 ) * Add in or and and in router * In the middle of improving plotting * final plot changes --------- Co-authored-by: João Moura <joaomdmoura@gmail.com>	2024-12-24 18:55:44 -03:00
Lorenze Jay	9a65abf6b8	removed some redundancies (#1796 ) * removed some redundancies * cleanup	2024-12-23 13:54:16 -05:00
Lorenze Jay	b3185ad90c	Feat/docling-support (#1763 ) * added tool for docling support * docling support installation * use file_paths instead of file_path * fix import * organized imports * run_type docs * needs to be list * fixed logic * logged but file_path is backwards compatible * use file_paths instead of file_path 2 * added test for multiple sources for file_paths * fix run-types * enabling local files to work and type cleanup * linted * fix test and types * fixed run types * fix types * renamed to CrewDoclingSource * linted * added docs * resolve conflicts --------- Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com> Co-authored-by: Brandon Hancock <brandon@brandonhancock.io>	2024-12-23 13:19:58 -05:00