fix: Sort imports using ruff --fix

Co-Authored-By: Joe Moura <joao@crewai.com>
fix: Resolve merge conflict and maintain organized imports
2026-03-09 05:18:15 +00:00 · 2024-12-22 04:33:16 +00:00 · 2024-12-22 04:31:39 +00:00 · 2024-12-22 04:29:13 +00:00 · 2024-12-22 04:26:47 +00:00 · 2024-12-22 04:25:37 +00:00
8 changed files with 35 additions and 1167 deletions
--- a/docs/concepts/knowledge.mdx
+++ b/docs/concepts/knowledge.mdx
@@ -79,55 +79,6 @@ crew = Crew(
 result = crew.kickoff(inputs={"question": "What city does John live in and how old is he?"})
 ```

-
-Here's another example with the `CrewDoclingSource`
-```python Code
-from crewai import LLM, Agent, Crew, Process, Task
-from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
-
-# Create a knowledge source
-content_source = CrewDoclingSource(
-    file_paths=[
-        "https://lilianweng.github.io/posts/2024-11-28-reward-hacking",
-        "https://lilianweng.github.io/posts/2024-07-07-hallucination",
-    ],
-)
-
-# Create an LLM with a temperature of 0 to ensure deterministic outputs
-llm = LLM(model="gpt-4o-mini", temperature=0)
-
-# Create an agent with the knowledge store
-agent = Agent(
-    role="About papers",
-    goal="You know everything about the papers.",
-    backstory="""You are a master at understanding papers and their content.""",
-    verbose=True,
-    allow_delegation=False,
-    llm=llm,
-)
-task = Task(
-    description="Answer the following questions about the papers: {question}",
-    expected_output="An answer to the question.",
-    agent=agent,
-)
-
-crew = Crew(
-    agents=[agent],
-    tasks=[task],
-    verbose=True,
-    process=Process.sequential,
-    knowledge_sources=[
-        content_source
-    ],  # Enable knowledge by adding the sources here. You can also add more sources to the sources list.
-)
-
-result = crew.kickoff(
-    inputs={
-        "question": "What is the reward hacking paper about? Be sure to provide sources."
-    }
-)
-```
-
 ## Knowledge Configuration

 ### Chunking Configuration
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,9 +51,6 @@ openpyxl = [
    "openpyxl>=3.1.5",
 ]
 mem0 = ["mem0ai>=0.1.29"]
-docling = [
-    "docling>=2.12.0",
-]

 [tool.uv]
 dev-dependencies = [
--- a/src/crewai/knowledge/source/base_file_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_file_knowledge_source.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Union

-from pydantic import Field, field_validator
+from pydantic import Field

 from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
 from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
@@ -14,28 +14,17 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
    """Base class for knowledge sources that load content from files."""

    _logger: Logger = Logger(verbose=True)
-    file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
-        default=None,
-        description="[Deprecated] The path to the file. Use file_paths instead.",
-    )
-    file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
-        default_factory=list, description="The path to the file"
+    file_path: Union[Path, List[Path], str, List[str]] = Field(
+        ..., description="The path to the file"
    )
    content: Dict[Path, str] = Field(init=False, default_factory=dict)
    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
    safe_file_paths: List[Path] = Field(default_factory=list)

-    @field_validator("file_path", "file_paths", mode="before")
-    def validate_file_path(cls, v, values):
-        """Validate that at least one of file_path or file_paths is provided."""
-        if v is None and ("file_path" not in values or values.get("file_path") is None):
-            raise ValueError("Either file_path or file_paths must be provided")
-        return v
-
    def model_post_init(self, _):
        """Post-initialization method to load content."""
        self.safe_file_paths = self._process_file_paths()
-        self.validate_content()
+        self.validate_paths()
        self.content = self.load_content()

    @abstractmethod
@@ -43,7 +32,7 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
        """Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
        pass

-    def validate_content(self):
+    def validate_paths(self):
        """Validate the paths."""
        for path in self.safe_file_paths:
            if not path.exists():
@@ -70,30 +59,13 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):

    def _process_file_paths(self) -> List[Path]:
        """Convert file_path to a list of Path objects."""
-
-        if hasattr(self, "file_path") and self.file_path is not None:
-            self._logger.log(
-                "warning",
-                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
-                color="yellow",
-            )
-            self.file_paths = self.file_path
-
-        if self.file_paths is None:
-            raise ValueError("Your source must be provided with a file_paths: []")
-
-        # Convert single path to list
-        path_list: List[Union[Path, str]] = (
-            [self.file_paths]
-            if isinstance(self.file_paths, (str, Path))
-            else list(self.file_paths)
-            if isinstance(self.file_paths, list)
-            else []
+        paths = (
+            [self.file_path]
+            if isinstance(self.file_path, (str, Path))
+            else self.file_path
        )

-        if not path_list:
-            raise ValueError(
-                "file_path/file_paths must be a Path, str, or a list of these types"
-            )
+        if not isinstance(paths, list):
+            raise ValueError("file_path must be a Path, str, or a list of these types")

-        return [self.convert_to_path(path) for path in path_list]
+        return [self.convert_to_path(path) for path in paths]
--- a/src/crewai/knowledge/source/base_knowledge_source.py
+++ b/src/crewai/knowledge/source/base_knowledge_source.py
@@ -21,7 +21,7 @@ class BaseKnowledgeSource(BaseModel, ABC):
    collection_name: Optional[str] = Field(default=None)

    @abstractmethod
-    def validate_content(self) -> Any:
+    def load_content(self) -> Dict[Any, str]:
        """Load and preprocess content from the source."""
        pass

--- a/src/crewai/knowledge/source/crew_docling_source.py
+++ b/src/crewai/knowledge/source/crew_docling_source.py
@@ -1,120 +0,0 @@
-from pathlib import Path
-from typing import Iterator, List, Optional, Union
-from urllib.parse import urlparse
-
-from docling.datamodel.base_models import InputFormat
-from docling.document_converter import DocumentConverter
-from docling.exceptions import ConversionError
-from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
-from docling_core.types.doc.document import DoclingDocument
-from pydantic import Field
-
-from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
-from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
-from crewai.utilities.logger import Logger
-
-
-class CrewDoclingSource(BaseKnowledgeSource):
-    """Default Source class for converting documents to markdown or json
-    This will auto support PDF, DOCX, and TXT, XLSX, Images, and HTML files without any additional dependencies and follows the docling package as the source of truth.
-    """
-
-    _logger: Logger = Logger(verbose=True)
-
-    file_path: Optional[List[Union[Path, str]]] = Field(default=None)
-    file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    chunks: List[str] = Field(default_factory=list)
-    safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
-    content: List[DoclingDocument] = Field(default_factory=list)
-    document_converter: DocumentConverter = Field(
-        default_factory=lambda: DocumentConverter(
-            allowed_formats=[
-                InputFormat.MD,
-                InputFormat.ASCIIDOC,
-                InputFormat.PDF,
-                InputFormat.DOCX,
-                InputFormat.HTML,
-                InputFormat.IMAGE,
-                InputFormat.XLSX,
-                InputFormat.PPTX,
-            ]
-        )
-    )
-
-    def model_post_init(self, _) -> None:
-        if self.file_path:
-            self._logger.log(
-                "warning",
-                "The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
-                color="yellow",
-            )
-            self.file_paths = self.file_path
-        self.safe_file_paths = self.validate_content()
-        self.content = self._load_content()
-
-    def _load_content(self) -> List[DoclingDocument]:
-        try:
-            return self._convert_source_to_docling_documents()
-        except ConversionError as e:
-            self._logger.log(
-                "error",
-                f"Error loading content: {e}. Supported formats: {self.document_converter.allowed_formats}",
-                "red",
-            )
-            raise e
-        except Exception as e:
-            self._logger.log("error", f"Error loading content: {e}")
-            raise e
-
-    def add(self) -> None:
-        if self.content is None:
-            return
-        for doc in self.content:
-            new_chunks_iterable = self._chunk_doc(doc)
-            self.chunks.extend(list(new_chunks_iterable))
-        self._save_documents()
-
-    def _convert_source_to_docling_documents(self) -> List[DoclingDocument]:
-        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
-        return [result.document for result in conv_results_iter]
-
-    def _chunk_doc(self, doc: DoclingDocument) -> Iterator[str]:
-        chunker = HierarchicalChunker()
-        for chunk in chunker.chunk(doc):
-            yield chunk.text
-
-    def validate_content(self) -> List[Union[Path, str]]:
-        processed_paths: List[Union[Path, str]] = []
-        for path in self.file_paths:
-            if isinstance(path, str):
-                if path.startswith(("http://", "https://")):
-                    try:
-                        if self._validate_url(path):
-                            processed_paths.append(path)
-                        else:
-                            raise ValueError(f"Invalid URL format: {path}")
-                    except Exception as e:
-                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
-                else:
-                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
-                    if local_path.exists():
-                        processed_paths.append(local_path)
-                    else:
-                        raise FileNotFoundError(f"File not found: {local_path}")
-            else:
-                # this is an instance of Path
-                processed_paths.append(path)
-        return processed_paths
-
-    def _validate_url(self, url: str) -> bool:
-        try:
-            result = urlparse(url)
-            return all(
-                [
-                    result.scheme in ("http", "https"),
-                    result.netloc,
-                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
-                ]
-            )
-        except Exception:
-            return False
--- a/src/crewai/knowledge/source/string_knowledge_source.py
+++ b/src/crewai/knowledge/source/string_knowledge_source.py
@@ -13,9 +13,9 @@ class StringKnowledgeSource(BaseKnowledgeSource):

    def model_post_init(self, _):
        """Post-initialization method to validate content."""
-        self.validate_content()
+        self.load_content()

-    def validate_content(self):
+    def load_content(self):
        """Validate string content."""
        if not isinstance(self.content, str):
            raise ValueError("StringKnowledgeSource only accepts string content")
--- a/tests/knowledge/knowledge_test.py
+++ b/tests/knowledge/knowledge_test.py
@@ -1,12 +1,10 @@
 """Test Knowledge creation and querying functionality."""

 from pathlib import Path
-from typing import List, Union
 from unittest.mock import patch

 import pytest

-from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
 from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
 from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
 from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
@@ -202,7 +200,7 @@ def test_single_short_file(mock_vector_db, tmpdir):
        f.write(content)

    file_source = TextFileKnowledgeSource(
-        file_paths=[file_path], metadata={"preference": "personal"}
+        file_path=file_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [file_source]
    mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
@@ -244,7 +242,7 @@ def test_single_2k_character_file(mock_vector_db, tmpdir):
        f.write(content)

    file_source = TextFileKnowledgeSource(
-        file_paths=[file_path], metadata={"preference": "personal"}
+        file_path=file_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [file_source]
    mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
@@ -281,7 +279,7 @@ def test_multiple_short_files(mock_vector_db, tmpdir):
        file_paths.append((file_path, item["metadata"]))

    file_sources = [
-        TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
+        TextFileKnowledgeSource(file_path=path, metadata=metadata)
        for path, metadata in file_paths
    ]
    mock_vector_db.sources = file_sources
@@ -354,7 +352,7 @@ def test_multiple_2k_character_files(mock_vector_db, tmpdir):
        file_paths.append(file_path)

    file_sources = [
-        TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
+        TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
        for path in file_paths
    ]
    mock_vector_db.sources = file_sources
@@ -401,7 +399,7 @@ def test_hybrid_string_and_files(mock_vector_db, tmpdir):
        file_paths.append(file_path)

    file_sources = [
-        TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
+        TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
        for path in file_paths
    ]

@@ -426,7 +424,7 @@ def test_pdf_knowledge_source(mock_vector_db):

    # Create a PDFKnowledgeSource
    pdf_source = PDFKnowledgeSource(
-        file_paths=[pdf_path], metadata={"preference": "personal"}
+        file_path=pdf_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [pdf_source]
    mock_vector_db.query.return_value = [
@@ -463,7 +461,7 @@ def test_csv_knowledge_source(mock_vector_db, tmpdir):

    # Create a CSVKnowledgeSource
    csv_source = CSVKnowledgeSource(
-        file_paths=[csv_path], metadata={"preference": "personal"}
+        file_path=csv_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [csv_source]
    mock_vector_db.query.return_value = [
@@ -498,7 +496,7 @@ def test_json_knowledge_source(mock_vector_db, tmpdir):

    # Create a JSONKnowledgeSource
    json_source = JSONKnowledgeSource(
-        file_paths=[json_path], metadata={"preference": "personal"}
+        file_path=json_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [json_source]
    mock_vector_db.query.return_value = [
@@ -531,7 +529,7 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):

    # Create an ExcelKnowledgeSource
    excel_source = ExcelKnowledgeSource(
-        file_paths=[excel_path], metadata={"preference": "personal"}
+        file_path=excel_path, metadata={"preference": "personal"}
    )
    mock_vector_db.sources = [excel_source]
    mock_vector_db.query.return_value = [
@@ -545,42 +543,3 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
    # Assert that the correct information is retrieved
    assert any("30" in result["context"] for result in results)
    mock_vector_db.query.assert_called_once()
-
-
-def test_docling_source(mock_vector_db):
-    docling_source = CrewDoclingSource(
-        file_paths=[
-            "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
-        ],
-    )
-    mock_vector_db.sources = [docling_source]
-    mock_vector_db.query.return_value = [
-        {
-            "context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
-            "score": 0.9,
-        }
-    ]
-    # Perform a query
-    query = "What is reward hacking?"
-    results = mock_vector_db.query(query)
-    assert any("reward hacking" in result["context"].lower() for result in results)
-    mock_vector_db.query.assert_called_once()
-
-
-def test_multiple_docling_sources():
-    urls: List[Union[Path, str]] = [
-        "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
-        "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
-    ]
-    docling_source = CrewDoclingSource(file_paths=urls)
-
-    assert docling_source.file_paths == urls
-    assert docling_source.content is not None
-
-
-def test_docling_source_with_local_file():
-    current_dir = Path(__file__).parent
-    pdf_path = current_dir / "crewai_quickstart.pdf"
-    docling_source = CrewDoclingSource(file_paths=[pdf_path])
-    assert docling_source.file_paths == [pdf_path]
-    assert docling_source.content is not None
--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
Devin AI	f26833f751	fix: Sort imports using ruff --fix Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:33:16 +00:00
Devin AI	5fe15a8dba	fix: Resolve merge conflict and maintain organized imports Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:31:39 +00:00
Devin AI	d8f5a9fb71	fix: Apply ruff automatic import sorting Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:29:13 +00:00
Devin AI	55883c6083	fix: Consolidate imports and fix formatting Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:26:47 +00:00
Devin AI	072f0cbef6	fix: Reorganize imports using ruff --fix Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:25:37 +00:00
Devin AI	7beb511206	fix: Sort imports to fix lint issues Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:22:11 +00:00
Devin AI	5b2e41b8eb	feat: Add interpolate_only method and improve error handling - Add interpolate_only method for string interpolation while preserving JSON structure - Add comprehensive test coverage for interpolate_only - Add proper type annotation for logger using ClassVar - Improve error handling and documentation for _save_file method Co-Authored-By: Joe Moura <joao@crewai.com>	2024-12-22 04:19:47 +00:00
Brandon Hancock (bhancock_ai)	e6f620877d	Merge branch 'main' into main	2024-12-20 10:34:39 -05:00
Frieda (Jingying) Huang	43cb2d1f66	Fixed yaml config is not escaped properly for output requirements	2024-12-15 13:28:17 -05:00
Frieda Huang	4e9b70201e	Merge branch 'crewAIInc:main' into main	2024-12-15 11:30:27 -05:00
Frieda Huang	059b0cf5b4	Merge branch 'crewAIInc:main' into main	2024-12-10 22:48:49 -05:00
Frieda Huang	652ddcc1c5	Merge branch 'crewAIInc:main' into main	2024-12-10 07:40:02 -05:00
Brandon Hancock (bhancock_ai)	964d4bfdbf	Merge branch 'main' into main	2024-12-09 09:54:20 -05:00
Frieda (Jingying) Huang	c103d7eab7	Merge branch 'main' of https://github.com/frieda-huang/crewAI	2024-12-08 09:25:06 -05:00
Frieda (Jingying) Huang	4fe9f5d8bd	Fixed output_file not respecting system path	2024-12-08 09:21:12 -05:00