Compare commits

...

2 Commits

Author SHA1 Message Date
Devin AI
70379689cf Improve URL validation with better type hints and documentation
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-05-03 21:22:51 +00:00
Devin AI
e891563135 Fix #2746: Add URL protocol validation for Huggingface embedder
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-05-03 21:19:14 +00:00
3 changed files with 118 additions and 2 deletions

View File

@@ -135,13 +135,42 @@ class EmbeddingConfigurator:
)
@staticmethod
def _configure_huggingface(config, model_name):
def _normalize_api_url(api_url: str) -> str:
"""
Normalize API URL by ensuring it has a protocol.
Args:
api_url: The API URL to normalize
Returns:
Normalized URL with protocol (defaults to http:// if missing)
"""
if not (api_url.startswith("http://") or api_url.startswith("https://")):
return f"http://{api_url}"
return api_url
@staticmethod
def _configure_huggingface(config: dict, model_name: str):
    """
    Configure Huggingface embedding function with the provided config.

    Args:
        config: Configuration dictionary for the Huggingface embedder;
            only its optional "api_url" entry is read here
        model_name: Name of the model to use (not referenced in this
            function body)

    Returns:
        Configured HuggingFaceEmbeddingServer instance
    """
    from chromadb.utils.embedding_functions.huggingface_embedding_function import (
        HuggingFaceEmbeddingServer,
    )

    api_url = config.get("api_url")
    # Only a non-empty URL is normalized; a missing/empty value is handed to
    # the server object unchanged.
    if api_url:
        api_url = EmbeddingConfigurator._normalize_api_url(api_url)

    # The keyword is passed exactly once, carrying the normalized value.
    return HuggingFaceEmbeddingServer(
        url=api_url,
    )
@staticmethod

View File

@@ -584,3 +584,84 @@ def test_docling_source_with_local_file():
docling_source = CrewDoclingSource(file_paths=[pdf_path])
assert docling_source.file_paths == [pdf_path]
assert docling_source.content is not None
def test_huggingface_url_validation():
    """Check how the Huggingface embedder configuration treats the api_url.

    Covers a URL without a protocol, URLs that already carry https:// or
    http://, and a configuration with no api_url at all.
    """
    from crewai.utilities.embedding_configurator import EmbeddingConfigurator

    def build(embedder_config):
        # Every case uses a fresh configurator and the same model name.
        return EmbeddingConfigurator()._configure_huggingface(
            embedder_config, "test-model"
        )

    # A bare host:port/path must come back with a protocol in front.
    without_protocol = build({"api_url": "localhost:8080/embed"})
    assert without_protocol._api_url.startswith("http://")

    # URLs that already name a protocol must pass through untouched.
    for explicit_url in (
        "https://localhost:8080/embed",
        "http://localhost:8080/embed",
    ):
        configured = build({"api_url": explicit_url})
        assert configured._api_url == explicit_url

    # With no api_url configured, construction must not raise.
    unconfigured = build({})
    assert unconfigured._api_url == 'None'
def test_huggingface_missing_protocol_with_json_source():
    """Test Huggingface URL normalization alongside a temporary JSON fixture.

    A temporary JSON file is created and JSONKnowledgeSource is imported, but
    the assertion itself only checks that the embedder configurator adds a
    protocol to an api_url that lacks one.
    """
    import os
    import json
    import tempfile
    from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
    from crewai.utilities.embedding_configurator import EmbeddingConfigurator

    # Create a temporary JSON file (delete=False, so it must be removed below)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp:
        json.dump({"test": "data", "nested": {"value": 123}}, temp)
        json_path = temp.name

    try:
        # Test that the URL validation works in the embedder configurator
        config = {
            "api_url": "localhost:8080/embed"  # Missing protocol
        }
        embedding_function = EmbeddingConfigurator()._configure_huggingface(
            config, "test-model"
        )
        # Verify that the URL now has a protocol
        assert embedding_function._api_url.startswith("http://")
    finally:
        # Remove the fixture even when the assertion above fails, so a
        # failing run does not leave stray files in the temp directory.
        os.unlink(json_path)
def test_huggingface_missing_protocol_with_string_source():
    """Check that a protocol-less api_url gets an http:// prefix.

    StringKnowledgeSource is imported here, but the assertion only inspects
    the embedding function built by the configurator.
    """
    from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
    from crewai.utilities.embedding_configurator import EmbeddingConfigurator

    # Embedder settings whose URL deliberately omits the protocol.
    embedder_settings = {"api_url": "localhost:8080/embed"}

    configurator = EmbeddingConfigurator()
    built = configurator._configure_huggingface(embedder_settings, "test-model")

    # The configured URL must now start with a protocol.
    configured_url = built._api_url
    assert configured_url.startswith("http://")

View File

@@ -0,0 +1,6 @@
{
"test": "data",
"nested": {
"value": 123
}
}