From e891563135c6838f52a4126a3ceed05f38e486d2 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 3 May 2025 21:19:14 +0000
Subject: [PATCH] Fix #2746: Add URL protocol validation for Huggingface embedder

Prefix schemeless `api_url` values with "http://" before handing them to
HuggingFaceEmbeddingServer. Scheme detection is case-insensitive, and an
empty or missing URL is passed through as None.

Co-Authored-By: Joe Moura
---
Note: hunk line counts and the diffstat below were regenerated for this
revision. The post-image blob ids on the `index` lines of the two modified
files are carried over from the previous revision of this patch and are
therefore informational only.

 .../utilities/embedding_configurator.py       |  8 ++-
 tests/knowledge/knowledge_test.py             | 92 +++++++++++++++++++
 tests/knowledge/test_data.json                |  6 ++
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 tests/knowledge/test_data.json

diff --git a/src/crewai/utilities/embedding_configurator.py b/src/crewai/utilities/embedding_configurator.py
index 44e832ec2..4e7020e35 100644
--- a/src/crewai/utilities/embedding_configurator.py
+++ b/src/crewai/utilities/embedding_configurator.py
@@ -140,8 +140,14 @@ class EmbeddingConfigurator:
             HuggingFaceEmbeddingServer,
         )
 
+        # Default to plain HTTP when the configured URL has no scheme
+        # (e.g. "localhost:8080/embed"); an empty value is treated as unset.
+        api_url = config.get("api_url") or None
+        if api_url and not api_url.lower().startswith(("http://", "https://")):
+            api_url = f"http://{api_url}"
+
         return HuggingFaceEmbeddingServer(
-            url=config.get("api_url"),
+            url=api_url,
         )
 
     @staticmethod
diff --git a/tests/knowledge/knowledge_test.py b/tests/knowledge/knowledge_test.py
index 366067587..7bd6e7c0c 100644
--- a/tests/knowledge/knowledge_test.py
+++ b/tests/knowledge/knowledge_test.py
@@ -584,3 +584,95 @@ def test_docling_source_with_local_file():
     docling_source = CrewDoclingSource(file_paths=[pdf_path])
     assert docling_source.file_paths == [pdf_path]
     assert docling_source.content is not None
+
+
+def test_huggingface_url_validation():
+    """Test that Huggingface embedder properly handles URLs without protocol."""
+    from crewai.utilities.embedding_configurator import EmbeddingConfigurator
+
+    config_missing_protocol = {
+        "api_url": "localhost:8080/embed"
+    }
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config_missing_protocol, "test-model"
+    )
+    # Verify that exactly one "http://" prefix was added
+    assert embedding_function._api_url == "http://localhost:8080/embed"
+
+    config_with_protocol = {
+        "api_url": "https://localhost:8080/embed"
+    }
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config_with_protocol, "test-model"
+    )
+    # Verify that the URL remains unchanged
+    assert embedding_function._api_url == "https://localhost:8080/embed"
+
+    config_with_other_protocol = {
+        "api_url": "http://localhost:8080/embed"
+    }
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config_with_other_protocol, "test-model"
+    )
+    # Verify that the URL remains unchanged
+    assert embedding_function._api_url == "http://localhost:8080/embed"
+
+    config_uppercase_protocol = {
+        "api_url": "HTTPS://localhost:8080/embed"
+    }
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config_uppercase_protocol, "test-model"
+    )
+    # Verify that scheme detection is case-insensitive (no double prefix)
+    assert embedding_function._api_url == "HTTPS://localhost:8080/embed"
+
+    config_no_url = {}
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config_no_url, "test-model"
+    )
+    # A missing URL is passed as None; the server exposes it as the string 'None'
+    assert embedding_function._api_url == 'None'
+
+
+def test_huggingface_missing_protocol_with_json_source():
+    """Test that JSONKnowledgeSource works with Huggingface embedder without URL protocol."""
+    import os
+    import json
+    import tempfile
+    from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
+    from crewai.utilities.embedding_configurator import EmbeddingConfigurator
+
+    # Create a temporary JSON file
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp:
+        json.dump({"test": "data", "nested": {"value": 123}}, temp)
+        json_path = temp.name
+
+    try:
+        # Test that the URL validation works in the embedder configurator
+        config = {
+            "api_url": "localhost:8080/embed"  # Missing protocol
+        }
+        embedding_function = EmbeddingConfigurator()._configure_huggingface(
+            config, "test-model"
+        )
+        # Verify that the URL now has a protocol
+        assert embedding_function._api_url.startswith("http://")
+    finally:
+        # Always remove the temporary file, even when the assertion fails
+        os.unlink(json_path)
+
+
+def test_huggingface_missing_protocol_with_string_source():
+    """Test that StringKnowledgeSource works with Huggingface embedder without URL protocol."""
+    from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
+    from crewai.utilities.embedding_configurator import EmbeddingConfigurator
+
+    # Test that the URL validation works in the embedder configurator
+    config = {
+        "api_url": "localhost:8080/embed"  # Missing protocol
+    }
+    embedding_function = EmbeddingConfigurator()._configure_huggingface(
+        config, "test-model"
+    )
+    # Verify that the URL now has a protocol
+    assert embedding_function._api_url.startswith("http://")
diff --git a/tests/knowledge/test_data.json b/tests/knowledge/test_data.json
new file mode 100644
index 000000000..720589bae
--- /dev/null
+++ b/tests/knowledge/test_data.json
@@ -0,0 +1,6 @@
+{
+  "test": "data",
+  "nested": {
+    "value": 123
+  }
+}