"""Test Knowledge Source metadata functionality."""

from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import (
    KnowledgeStorage,
    _coerce_to_records,
)


def _assert_chunk_metadata(chunks, source_type, filepath=None):
    """Assert that every chunk is a dict carrying the expected metadata.

    Args:
        chunks: Sequence of chunk dicts produced by a knowledge source.
        source_type: Expected value of ``metadata["source_type"]``.
        filepath: Expected originating file. When ``None`` the chunks are
            required to have *no* ``filepath`` entry (string sources).

    The chunk index is expected to start at 0 and increase by one in the
    order the chunks are given.
    """
    assert len(chunks) > 0
    for index, chunk in enumerate(chunks):
        assert isinstance(chunk, dict)
        assert "content" in chunk
        assert "metadata" in chunk
        metadata = chunk["metadata"]
        assert metadata["chunk_index"] == index
        assert metadata["source_type"] == source_type
        if filepath is None:
            assert "filepath" not in metadata
        else:
            assert metadata["filepath"] == str(filepath)


class TestCoerceToRecords:
    """Test the _coerce_to_records function."""

    def test_coerce_string_list(self):
        """Test coercing a list of strings."""
        documents = ["chunk1", "chunk2", "chunk3"]

        result = _coerce_to_records(documents)

        assert len(result) == 3
        assert result[0]["content"] == "chunk1"
        assert result[1]["content"] == "chunk2"
        assert result[2]["content"] == "chunk3"
        assert "metadata" not in result[0]

    def test_coerce_dict_with_metadata(self):
        """Test coercing dictionaries with metadata."""
        documents = [
            {
                "content": "chunk1",
                "metadata": {
                    "filepath": "/path/to/file.txt",
                    "chunk_index": 0,
                    "source_type": "text_file",
                },
            },
            {
                "content": "chunk2",
                "metadata": {
                    "filepath": "/path/to/file.txt",
                    "chunk_index": 1,
                    "source_type": "text_file",
                },
            },
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 2
        assert result[0]["content"] == "chunk1"
        assert result[0]["metadata"]["filepath"] == "/path/to/file.txt"
        assert result[0]["metadata"]["chunk_index"] == 0
        assert result[0]["metadata"]["source_type"] == "text_file"
        assert result[1]["content"] == "chunk2"
        assert result[1]["metadata"]["chunk_index"] == 1

    def test_coerce_mixed_formats(self):
        """Test coercing mixed string and dict formats."""
        documents = [
            "plain string chunk",
            {
                "content": "dict chunk",
                "metadata": {"source_type": "test"},
            },
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 2
        assert result[0]["content"] == "plain string chunk"
        assert "metadata" not in result[0]
        assert result[1]["content"] == "dict chunk"
        assert result[1]["metadata"]["source_type"] == "test"

    def test_coerce_empty_content_skipped(self):
        """Test that empty content is skipped."""
        documents = [
            {"content": "valid chunk"},
            {"content": None},
            {"content": ""},
            {"content": "another valid chunk"},
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 2
        assert result[0]["content"] == "valid chunk"
        assert result[1]["content"] == "another valid chunk"

    def test_coerce_missing_content_skipped(self):
        """Test that dicts without content key are skipped."""
        documents = [
            {"content": "valid chunk"},
            {"metadata": {"some": "data"}},
            {"content": "another valid chunk"},
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 2
        assert result[0]["content"] == "valid chunk"
        assert result[1]["content"] == "another valid chunk"

    def test_coerce_with_doc_id(self):
        """Test coercing documents with doc_id."""
        documents = [
            {
                "content": "chunk with id",
                "doc_id": "doc123",
                "metadata": {"source_type": "test"},
            }
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 1
        assert result[0]["content"] == "chunk with id"
        assert result[0]["doc_id"] == "doc123"
        assert result[0]["metadata"]["source_type"] == "test"

    def test_coerce_metadata_type_conversion(self):
        """Test that metadata values are properly converted to allowed types."""
        documents = [
            {
                "content": "test chunk",
                "metadata": {
                    "string_val": "text",
                    "int_val": 42,
                    "float_val": 3.14,
                    "bool_val": True,
                    "none_val": None,
                    "other_val": {"nested": "dict"},
                },
            }
        ]

        result = _coerce_to_records(documents)

        assert len(result) == 1
        metadata = result[0]["metadata"]
        assert metadata["string_val"] == "text"
        assert metadata["int_val"] == 42
        assert metadata["float_val"] == 3.14
        assert metadata["bool_val"] is True
        # None is expected to be stored as an empty string, and unsupported
        # value types (here a nested dict) as some string representation.
        assert metadata["none_val"] == ""
        assert isinstance(metadata["other_val"], str)


class TestTextFileKnowledgeSourceMetadata:
    """Test TextFileKnowledgeSource metadata functionality."""

    def test_text_file_chunks_have_metadata(self, tmp_path):
        """Test that text file chunks include metadata."""
        file_path = tmp_path / "test.txt"
        file_path.write_text("This is a test file. " * 100, encoding="utf-8")

        # Patch ``save`` so the test never touches a real vector store.
        with patch.object(KnowledgeStorage, "save") as mock_save:
            source = TextFileKnowledgeSource(
                file_paths=[file_path],
                storage=KnowledgeStorage(),
                chunk_size=100,
                chunk_overlap=10,
            )
            source.add()

            _assert_chunk_metadata(source.chunks, "text_file", file_path)

            # Every chunk must be handed to the storage in a single call.
            mock_save.assert_called_once()
            saved_chunks = mock_save.call_args[0][0]
            assert len(saved_chunks) == len(source.chunks)


class TestPDFKnowledgeSourceMetadata:
    """Test PDFKnowledgeSource metadata functionality."""

    @patch(
        "crewai.knowledge.source.pdf_knowledge_source."
        "PDFKnowledgeSource._import_pdfplumber"
    )
    def test_pdf_chunks_have_metadata(self, mock_import, tmp_path):
        """Test that PDF chunks include metadata."""
        # Fake pdfplumber: a single page whose extracted text is long enough
        # to be split into several chunks.
        mock_page = MagicMock()
        mock_page.extract_text.return_value = "PDF content. " * 50
        mock_pdf = MagicMock()
        mock_pdf.pages = [mock_page]
        mock_pdfplumber = MagicMock()
        mock_pdfplumber.open.return_value.__enter__.return_value = mock_pdf
        mock_import.return_value = mock_pdfplumber

        # The file only has to exist; its content comes from the mock above.
        file_path = tmp_path / "test.pdf"
        file_path.touch()

        with patch.object(KnowledgeStorage, "save"):
            source = PDFKnowledgeSource(
                file_paths=[file_path],
                storage=KnowledgeStorage(),
                chunk_size=100,
                chunk_overlap=10,
            )
            source.add()

            _assert_chunk_metadata(source.chunks, "pdf", file_path)


class TestCSVKnowledgeSourceMetadata:
    """Test CSVKnowledgeSource metadata functionality."""

    def test_csv_chunks_have_metadata(self, tmp_path):
        """Test that CSV chunks include metadata."""
        csv_content = "Name,Age,City\nJohn,30,NYC\nJane,25,LA\n" * 20
        file_path = tmp_path / "test.csv"
        file_path.write_text(csv_content, encoding="utf-8")

        with patch.object(KnowledgeStorage, "save"):
            source = CSVKnowledgeSource(
                file_paths=[file_path],
                storage=KnowledgeStorage(),
                chunk_size=100,
                chunk_overlap=10,
            )
            source.add()

            _assert_chunk_metadata(source.chunks, "csv", file_path)


class TestJSONKnowledgeSourceMetadata:
    """Test JSONKnowledgeSource metadata functionality."""

    def test_json_chunks_have_metadata(self, tmp_path):
        """Test that JSON chunks include metadata."""
        json_content = (
            '{"users": [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]}'
        )
        file_path = tmp_path / "test.json"
        file_path.write_text(json_content, encoding="utf-8")

        with patch.object(KnowledgeStorage, "save"):
            source = JSONKnowledgeSource(
                file_paths=[file_path],
                storage=KnowledgeStorage(),
                chunk_size=50,
                chunk_overlap=5,
            )
            source.add()

            _assert_chunk_metadata(source.chunks, "json", file_path)


class TestStringKnowledgeSourceMetadata:
    """Test StringKnowledgeSource metadata functionality."""

    def test_string_chunks_have_metadata(self):
        """Test that string chunks include metadata."""
        content = "This is a test string. " * 50

        with patch.object(KnowledgeStorage, "save"):
            source = StringKnowledgeSource(
                content=content,
                storage=KnowledgeStorage(),
                chunk_size=100,
                chunk_overlap=10,
            )
            source.add()

            # String sources have no backing file, so no ``filepath`` key.
            _assert_chunk_metadata(source.chunks, "string")


class TestMultipleFilesMetadata:
    """Test metadata for multiple files."""

    def test_multiple_text_files_have_distinct_metadata(self, tmp_path):
        """Test that multiple files have distinct metadata."""
        file1 = tmp_path / "file1.txt"
        file2 = tmp_path / "file2.txt"
        file1.write_text("Content from file 1. " * 50, encoding="utf-8")
        file2.write_text("Content from file 2. " * 50, encoding="utf-8")

        with patch.object(KnowledgeStorage, "save"):
            source = TextFileKnowledgeSource(
                file_paths=[file1, file2],
                storage=KnowledgeStorage(),
                chunk_size=100,
                chunk_overlap=10,
            )
            source.add()

            # The chunk index must restart at 0 for each file, so check the
            # chunks of every file separately.
            for path in (file1, file2):
                file_chunks = [
                    chunk
                    for chunk in source.chunks
                    if chunk["metadata"]["filepath"] == str(path)
                ]
                _assert_chunk_metadata(file_chunks, "text_file", path)


class TestBackwardCompatibility:
    """Test backward compatibility with existing code."""

    def test_storage_accepts_string_list(self):
        """Test that storage still accepts plain string lists."""
        with patch(
            "crewai.knowledge.storage.knowledge_storage.get_rag_client"
        ) as mock_client:
            mock_client_instance = MagicMock()
            mock_client.return_value = mock_client_instance

            storage = KnowledgeStorage()
            storage.save(["chunk1", "chunk2", "chunk3"])

            mock_client_instance.add_documents.assert_called_once()
            saved_docs = mock_client_instance.add_documents.call_args[1]["documents"]
            assert len(saved_docs) == 3
            assert all("content" in doc for doc in saved_docs)

    def test_storage_accepts_dict_list(self):
        """Test that storage accepts dict lists with metadata."""
        with patch(
            "crewai.knowledge.storage.knowledge_storage.get_rag_client"
        ) as mock_client:
            mock_client_instance = MagicMock()
            mock_client.return_value = mock_client_instance

            storage = KnowledgeStorage()
            documents = [
                {
                    "content": "chunk1",
                    "metadata": {"filepath": "/path/to/file.txt", "chunk_index": 0},
                },
                {
                    "content": "chunk2",
                    "metadata": {"filepath": "/path/to/file.txt", "chunk_index": 1},
                },
            ]
            storage.save(documents)

            mock_client_instance.add_documents.assert_called_once()
            saved_docs = mock_client_instance.add_documents.call_args[1]["documents"]
            assert len(saved_docs) == 2
            assert all("content" in doc for doc in saved_docs)
            assert all("metadata" in doc for doc in saved_docs)


class TestCrewDoclingSourceMetadata:
    """Test CrewDoclingSource metadata with conversion failures."""

    def test_docling_filepath_metadata_with_conversion_failure(self, tmp_path):
        """Test that filepath metadata is correct even when some files fail conversion."""
        # Skip only THIS test when docling is missing. Calling importorskip
        # from a decorator argument would run at collection time and skip
        # the entire module instead.
        pytest.importorskip("docling", reason="docling not available")
        from crewai.knowledge.source.crew_docling_source import CrewDoclingSource

        file1 = tmp_path / "file1.txt"
        file2 = tmp_path / "file2.txt"
        file3 = tmp_path / "file3.txt"
        file1.write_text("Content from file 1", encoding="utf-8")
        file2.write_text("Content from file 2", encoding="utf-8")
        file3.write_text("Content from file 3", encoding="utf-8")

        # Conversion results exist for file1 and file3 only: file2 plays the
        # role of a file whose conversion failed and produced no result.
        mock_result1 = MagicMock()
        mock_result1.document = MagicMock()
        mock_result1.input.file = file1

        mock_result3 = MagicMock()
        mock_result3.document = MagicMock()
        mock_result3.input.file = file3

        with patch(
            "crewai.knowledge.source.crew_docling_source.DocumentConverter"
        ) as mock_converter_class:
            mock_converter = MagicMock()
            mock_converter_class.return_value = mock_converter
            mock_converter.convert_all.return_value = iter(
                [mock_result1, mock_result3]
            )
            mock_converter.allowed_formats = []

            with patch.object(KnowledgeStorage, "save"):
                with patch(
                    "crewai.knowledge.source.crew_docling_source."
                    "CrewDoclingSource._chunk_doc"
                ) as mock_chunk:
                    # One chunk iterator per successfully converted document.
                    mock_chunk.side_effect = [
                        iter(["Chunk 1 from file1", "Chunk 2 from file1"]),
                        iter(["Chunk 1 from file3", "Chunk 2 from file3"]),
                    ]

                    storage = KnowledgeStorage()
                    source = CrewDoclingSource(
                        file_paths=[file1, file2, file3],
                        storage=storage,
                    )
                    source.add()

                    assert len(source.chunks) == 4

                    assert source.chunks[0]["metadata"]["filepath"] == str(file1)
                    assert source.chunks[0]["metadata"]["source_type"] == "docling"
                    assert source.chunks[0]["metadata"]["chunk_index"] == 0

                    assert source.chunks[1]["metadata"]["filepath"] == str(file1)
                    assert source.chunks[1]["metadata"]["chunk_index"] == 1

                    assert source.chunks[2]["metadata"]["filepath"] == str(file3)
                    assert source.chunks[2]["metadata"]["source_type"] == "docling"
                    assert source.chunks[2]["metadata"]["chunk_index"] == 0

                    assert source.chunks[3]["metadata"]["filepath"] == str(file3)
                    assert source.chunks[3]["metadata"]["chunk_index"] == 1

                    # The failed file must never be attributed to any chunk.
                    for chunk in source.chunks:
                        assert chunk["metadata"]["filepath"] != str(file2)