diff --git a/lib/crewai/src/crewai/knowledge/source/base_file_knowledge_source.py b/lib/crewai/src/crewai/knowledge/source/base_file_knowledge_source.py index 0832717c1..f4156ea09 100644 --- a/lib/crewai/src/crewai/knowledge/source/base_file_knowledge_source.py +++ b/lib/crewai/src/crewai/knowledge/source/base_file_knowledge_source.py @@ -84,8 +84,20 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC): raise ValueError("No storage found to save documents.") def convert_to_path(self, path: Path | str) -> Path: - """Convert a path to a Path object.""" - return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path + """Convert a path to a Path object. + + Validates that the resolved path stays within the knowledge directory + to prevent path traversal attacks. + """ + if isinstance(path, str): + result = (Path(KNOWLEDGE_DIRECTORY) / path).resolve() + knowledge_dir = Path(KNOWLEDGE_DIRECTORY).resolve() + if not result.is_relative_to(knowledge_dir): + raise ValueError( + f"Path '{path}' is outside the allowed knowledge directory" + ) + return result + return path def _process_file_paths(self) -> list[Path]: """Convert file_path to a list of Path objects.""" diff --git a/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py b/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py index 3dddacfac..aedcc9072 100644 --- a/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py +++ b/lib/crewai/src/crewai/knowledge/source/crew_docling_source.py @@ -129,7 +129,12 @@ class CrewDoclingSource(BaseKnowledgeSource): except Exception as e: raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e else: - local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path) + local_path = (Path(KNOWLEDGE_DIRECTORY) / path).resolve() + knowledge_dir = Path(KNOWLEDGE_DIRECTORY).resolve() + if not local_path.is_relative_to(knowledge_dir): + raise ValueError( + f"Path '{path}' is outside the allowed knowledge directory" + ) if local_path.exists(): processed_paths.append(local_path) else: diff --git a/lib/crewai/src/crewai/knowledge/source/excel_knowledge_source.py b/lib/crewai/src/crewai/knowledge/source/excel_knowledge_source.py index ece582053..4e7b163c8 100644 --- a/lib/crewai/src/crewai/knowledge/source/excel_knowledge_source.py +++ b/lib/crewai/src/crewai/knowledge/source/excel_knowledge_source.py @@ -130,8 +130,20 @@ class ExcelKnowledgeSource(BaseKnowledgeSource): return content_dict def convert_to_path(self, path: Path | str) -> Path: - """Convert a path to a Path object.""" - return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path + """Convert a path to a Path object. + + Validates that the resolved path stays within the knowledge directory + to prevent path traversal attacks. + """ + if isinstance(path, str): + result = (Path(KNOWLEDGE_DIRECTORY) / path).resolve() + knowledge_dir = Path(KNOWLEDGE_DIRECTORY).resolve() + if not result.is_relative_to(knowledge_dir): + raise ValueError( + f"Path '{path}' is outside the allowed knowledge directory" + ) + return result + return path def _import_dependencies(self) -> ModuleType: """Dynamically import dependencies.""" diff --git a/lib/crewai/tests/knowledge/test_knowledge.py b/lib/crewai/tests/knowledge/test_knowledge.py index b0f35c4d9..d42de4719 100644 --- a/lib/crewai/tests/knowledge/test_knowledge.py +++ b/lib/crewai/tests/knowledge/test_knowledge.py @@ -603,6 +603,70 @@ def test_file_path_validation(): PDFKnowledgeSource() +def test_path_traversal_blocked_in_base_file_knowledge_source(tmpdir): + """Test that path traversal sequences are blocked in BaseFileKnowledgeSource.convert_to_path().""" + traversal_paths = [ + "../../../etc/passwd", + "../../secret_file.txt", + "subdir/../../.env", + "../outside.txt", + ] + for malicious_path in traversal_paths: + with pytest.raises(ValueError, match="outside the allowed knowledge directory"): + TextFileKnowledgeSource(file_paths=[malicious_path]) + + +def test_path_traversal_blocked_in_excel_knowledge_source(): + """Test that path traversal sequences are blocked in ExcelKnowledgeSource.convert_to_path().""" + traversal_paths = [ + "../../../etc/passwd", + "../../secret_file.xlsx", + "subdir/../../.env", + ] + for malicious_path in traversal_paths: + with pytest.raises(ValueError, match="outside the allowed knowledge directory"): + ExcelKnowledgeSource(file_paths=[malicious_path]) + + +def test_path_traversal_blocked_in_crew_docling_source(): + """Test that path traversal sequences are blocked in CrewDoclingSource.validate_content().""" + pytest.importorskip("docling") + + traversal_paths = [ + "../../../etc/passwd", + "../../secret_file.pdf", + "subdir/../../.env", + ] + for malicious_path in traversal_paths: + with pytest.raises(ValueError, match="outside the allowed knowledge directory"): + CrewDoclingSource(file_paths=[malicious_path]) + + +def test_valid_paths_still_work_in_convert_to_path(tmpdir): + """Test that valid paths within the knowledge directory are accepted.""" + import os + + knowledge_dir = Path("knowledge") + knowledge_dir.mkdir(exist_ok=True) + test_file = knowledge_dir / "test_valid.txt" + test_file.write_text("test content") + + try: + source = TextFileKnowledgeSource(file_paths=[test_file]) + assert len(source.safe_file_paths) == 1 + finally: + test_file.unlink(missing_ok=True) + + +def test_path_object_bypasses_traversal_check(tmpdir): + """Test that Path objects (not strings) bypass the traversal check since they are explicit.""" + file_path = Path(tmpdir.join("test.txt")) + file_path.write_text("test content") + + source = TextFileKnowledgeSource(file_paths=[file_path]) + assert source.safe_file_paths == [file_path] + + def test_hash_based_id_generation_without_doc_id(mock_vector_db): """Test that documents without doc_id generate hash-based IDs. Duplicates are deduplicated before upsert.""" import hashlib