diff --git a/pyproject.toml b/pyproject.toml index 3f10c1a87..bc5ef733e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ openpyxl = [ mem0 = ["mem0ai>=0.1.29"] docling = [ "docling>=2.12.0", + "tokenizers>=0.21,<0.22", ] [tool.uv] diff --git a/src/crewai/knowledge/source/crew_docling_source.py b/src/crewai/knowledge/source/crew_docling_source.py index 8b197168b..98aedea99 100644 --- a/src/crewai/knowledge/source/crew_docling_source.py +++ b/src/crewai/knowledge/source/crew_docling_source.py @@ -17,6 +17,8 @@ from crewai.utilities.logger import Logger class CrewDoclingSource(BaseKnowledgeSource): """Default Source class for converting documents to markdown or json This will auto support PDF, DOCX, and TXT, XLSX, Images, and HTML files without any additional dependencies and follows the docling package as the source of truth. + + Note: To use this class, install crewai with the docling extra: `pip install crewai[docling]` """ _logger: Logger = Logger(verbose=True) diff --git a/tests/test_tokenizers_compatibility.py b/tests/test_tokenizers_compatibility.py new file mode 100644 index 000000000..0d7ed6b54 --- /dev/null +++ b/tests/test_tokenizers_compatibility.py @@ -0,0 +1,24 @@ +"""Test to verify compatibility between tokenizers and transformers.""" + +import pytest + + +def test_tokenizers_transformers_compatibility(): + """Test that the installed tokenizers version is compatible with transformers.""" + try: + import tokenizers + import transformers + except ImportError: + pytest.skip("tokenizers or transformers not installed") + + tokenizers_version = tokenizers.__version__ + transformers_version = transformers.__version__ + + tokenizers_major, tokenizers_minor, _ = map(int, tokenizers_version.split('.')) + + assert tokenizers_major == 0, f"Expected tokenizers major version 0, got {tokenizers_major}" + assert tokenizers_minor >= 21, f"Expected tokenizers minor version >=21, got {tokenizers_minor}" + assert tokenizers_minor < 22, f"Expected tokenizers minor version <22, got {tokenizers_minor}" + + print(f"Tokenizers version: {tokenizers_version}") + print(f"Transformers version: {transformers_version}")