Compare commits

...

6 Commits

Author SHA1 Message Date
Brandon Hancock
21f4b60754 Include embedding type fix 2025-03-20 08:55:30 -04:00
Brandon Hancock (bhancock_ai)
216ff4aa6f Merge branch 'main' into embedding-config-typing 2025-03-20 08:47:33 -04:00
Nick Fujita
6f849c0e6d 'added docs for config based on agent review' 2025-02-20 18:11:16 +09:00
Nick Fujita
276f661e6c 'add specific providers to provider type' 2025-02-20 18:02:36 +09:00
Nick Fujita
8f99caf61b 'type cleanup' 2025-02-20 17:58:46 +09:00
Nick Fujita
f4642f11cc 'add typings to embedding configurator input arg' 2025-02-20 17:52:13 +09:00
6 changed files with 139 additions and 56 deletions

View File

@@ -20,6 +20,7 @@ from crewai.tools.agent_tools.agent_tools import AgentTools
from crewai.utilities import Converter, Prompts
from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
from crewai.utilities.converter import generate_model_description
from crewai.utilities.embedding_configurator import EmbeddingConfig
from crewai.utilities.events.agent_events import (
AgentExecutionCompletedEvent,
AgentExecutionErrorEvent,
@@ -108,7 +109,7 @@ class Agent(BaseAgent):
default="safe",
description="Mode for code execution: 'safe' (using Docker) or 'unsafe' (direct execution).",
)
embedder: Optional[Dict[str, Any]] = Field(
embedder: Optional[EmbeddingConfig] = Field(
default=None,
description="Embedder configuration for the agent.",
)
@@ -134,7 +135,7 @@ class Agent(BaseAgent):
self.cache_handler = CacheHandler()
self.set_cache_handler(self.cache_handler)
def set_knowledge(self, crew_embedder: Optional[Dict[str, Any]] = None):
def set_knowledge(self, crew_embedder: Optional[EmbeddingConfig] = None):
try:
if self.embedder is None and crew_embedder:
self.embedder = crew_embedder

View File

@@ -25,6 +25,7 @@ from crewai.tools.base_tool import BaseTool, Tool
from crewai.utilities import I18N, Logger, RPMController
from crewai.utilities.config import process_config
from crewai.utilities.converter import Converter
from crewai.utilities.embedding_configurator import EmbeddingConfig
T = TypeVar("T", bound="BaseAgent")
@@ -362,5 +363,5 @@ class BaseAgent(ABC, BaseModel):
self._rpm_controller = rpm_controller
self.create_agent_executor()
def set_knowledge(self, crew_embedder: Optional[Dict[str, Any]] = None):
def set_knowledge(self, crew_embedder: Optional[EmbeddingConfig] = None):
pass

View File

@@ -41,6 +41,7 @@ from crewai.tools.base_tool import Tool
from crewai.types.usage_metrics import UsageMetrics
from crewai.utilities import I18N, FileHandler, Logger, RPMController
from crewai.utilities.constants import TRAINING_DATA_FILE
from crewai.utilities.embedding_configurator import EmbeddingConfig
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
from crewai.utilities.evaluators.task_evaluator import TaskEvaluator
from crewai.utilities.events.crew_events import (
@@ -145,7 +146,7 @@ class Crew(BaseModel):
default=None,
description="An instance of the UserMemory to be used by the Crew to store/fetch memories of a specific user.",
)
embedder: Optional[dict] = Field(
embedder: Optional[EmbeddingConfig] = Field(
default=None,
description="Configuration for the embedder to be used for the crew.",
)

View File

@@ -5,6 +5,7 @@ from pydantic import BaseModel, ConfigDict, Field
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
from crewai.utilities.embedding_configurator import EmbeddingConfig
os.environ["TOKENIZERS_PARALLELISM"] = "false" # removes logging from fastembed
@@ -21,14 +22,14 @@ class Knowledge(BaseModel):
sources: List[BaseKnowledgeSource] = Field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
storage: Optional[KnowledgeStorage] = Field(default=None)
embedder: Optional[Dict[str, Any]] = None
embedder: Optional[EmbeddingConfig] = None
collection_name: Optional[str] = None
def __init__(
self,
collection_name: str,
sources: List[BaseKnowledgeSource],
embedder: Optional[Dict[str, Any]] = None,
embedder: Optional[EmbeddingConfig] = None,
storage: Optional[KnowledgeStorage] = None,
**data,
):

View File

@@ -15,6 +15,7 @@ from chromadb.config import Settings
from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage
from crewai.utilities import EmbeddingConfigurator
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
from crewai.utilities.embedding_configurator import EmbeddingConfig
from crewai.utilities.logger import Logger
from crewai.utilities.paths import db_storage_path
@@ -48,7 +49,7 @@ class KnowledgeStorage(BaseKnowledgeStorage):
def __init__(
self,
embedder: Optional[Dict[str, Any]] = None,
embedder: Optional[EmbeddingConfig] = None,
collection_name: Optional[str] = None,
):
self.collection_name = collection_name
@@ -187,7 +188,7 @@ class KnowledgeStorage(BaseKnowledgeStorage):
api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
)
def _set_embedder_config(self, embedder: Optional[Dict[str, Any]] = None) -> None:
def _set_embedder_config(self, embedder: Optional[EmbeddingConfig] = None) -> None:
"""Set the embedding configuration for the knowledge storage.
Args:

View File

@@ -1,8 +1,84 @@
import os
from typing import Any, Dict, Optional, cast
from typing import Any, Callable, Literal, cast
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.api.types import validate_embedding_function
from pydantic import BaseModel
class EmbeddingProviderConfig(BaseModel):
    """Provider-specific settings consumed by ``EmbeddingConfigurator``.

    Every field is optional; each provider reads only the subset it needs
    and ignores the rest.

    Attributes:
        # Core Model Configuration
        model: Model identifier passed to the provider's embedding function
            (ignored for the ``custom`` provider).
        embedder: Custom embedding function or callable, read only by the
            ``custom`` provider.

        # API Authentication & Configuration
        api_key: Authentication key (OpenAI, Azure, VertexAI, Google, Cohere,
            VoyageAI, Watson).
        api_base: Base API URL override (OpenAI, Azure).
        api_type: API type (OpenAI, Azure; Azure falls back to ``"azure"``).
        api_version: API version (OpenAI, Azure).
        api_url: Endpoint URL (HuggingFace, Watson).
        url: Endpoint URL for Ollama.

        # Service-Specific Configuration
        project_id: Project identifier (VertexAI, Watson).
        organization_id: Organization identifier (OpenAI, Azure).
        deployment_id: Deployment identifier (OpenAI, Azure).
        region: Geographic region (VertexAI).
        session: Session object handed to the Amazon Bedrock embedding
            function.

        # Request Configuration
        task_type: Task type for Google Generative AI embeddings.
        default_headers: Extra HTTP headers, as a name -> value mapping
            (OpenAI, Azure).
        dimensions: Requested output dimensionality (OpenAI, Azure).
    """

    # Core Model Configuration
    model: str | None = None
    embedder: str | Callable | None = None

    # API Authentication & Configuration
    api_key: str | None = None
    api_base: str | None = None
    api_type: str | None = None
    api_version: str | None = None
    api_url: str | None = None
    url: str | None = None

    # Service-Specific Configuration
    project_id: str | None = None
    organization_id: str | None = None
    deployment_id: str | None = None
    region: str | None = None
    # The Bedrock embedding function expects a session *object*, not a string;
    # typing this as ``str`` made pydantic reject real sessions at validation.
    session: Any = None

    # Request Configuration
    task_type: str | None = None
    # Forwarded verbatim to the OpenAI embedding function, which takes a
    # mapping of header name -> value rather than a single string.
    default_headers: dict[str, str] | None = None
    # Forwarded verbatim to the OpenAI embedding function, which takes an int.
    dimensions: int | None = None
class EmbeddingConfig(BaseModel):
    """Top-level embedder configuration: which provider, plus its settings.

    Attributes:
        provider: Name of the embedding provider; restricted to the
            literal values listed below.
        config: Optional provider-specific settings. When omitted,
            ``EmbeddingConfigurator.configure_embedder`` substitutes an
            empty ``EmbeddingProviderConfig``.
    """

    provider: Literal[
        "openai", "azure", "ollama", "vertexai", "google", "cohere",
        "voyageai", "bedrock", "huggingface", "watson", "custom"
    ]
    config: EmbeddingProviderConfig | None = None
class EmbeddingConfigurator:
@@ -23,15 +99,19 @@ class EmbeddingConfigurator:
def configure_embedder(
self,
embedder_config: Optional[Dict[str, Any]] = None,
embedder_config: EmbeddingConfig | None = None,
) -> EmbeddingFunction:
"""Configures and returns an embedding function based on the provided config."""
if embedder_config is None:
return self._create_default_embedding_function()
provider = embedder_config.get("provider")
config = embedder_config.get("config", {})
model_name = config.get("model") if provider != "custom" else None
provider = embedder_config.provider
config = (
embedder_config.config
if embedder_config.config
else EmbeddingProviderConfig()
)
model_name = config.model if provider != "custom" else None
if provider not in self.embedding_functions:
raise Exception(
@@ -56,123 +136,123 @@ class EmbeddingConfigurator:
)
@staticmethod
def _configure_openai(config, model_name):
def _configure_openai(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.openai_embedding_function import (
OpenAIEmbeddingFunction,
)
return OpenAIEmbeddingFunction(
api_key=config.get("api_key") or os.getenv("OPENAI_API_KEY"),
api_key=config.api_key or os.getenv("OPENAI_API_KEY"),
model_name=model_name,
api_base=config.get("api_base", None),
api_type=config.get("api_type", None),
api_version=config.get("api_version", None),
default_headers=config.get("default_headers", None),
dimensions=config.get("dimensions", None),
deployment_id=config.get("deployment_id", None),
organization_id=config.get("organization_id", None),
api_base=config.api_base,
api_type=config.api_type,
api_version=config.api_version,
default_headers=config.default_headers,
dimensions=config.dimensions,
deployment_id=config.deployment_id,
organization_id=config.organization_id,
)
@staticmethod
def _configure_azure(config, model_name):
def _configure_azure(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.openai_embedding_function import (
OpenAIEmbeddingFunction,
)
return OpenAIEmbeddingFunction(
api_key=config.get("api_key"),
api_base=config.get("api_base"),
api_type=config.get("api_type", "azure"),
api_version=config.get("api_version"),
api_key=config.api_key,
api_base=config.api_base,
api_type=config.api_type if config.api_type else "azure",
api_version=config.api_version,
model_name=model_name,
default_headers=config.get("default_headers"),
dimensions=config.get("dimensions"),
deployment_id=config.get("deployment_id"),
organization_id=config.get("organization_id"),
default_headers=config.default_headers,
dimensions=config.dimensions,
deployment_id=config.deployment_id,
organization_id=config.organization_id,
)
@staticmethod
def _configure_ollama(config, model_name):
def _configure_ollama(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.ollama_embedding_function import (
OllamaEmbeddingFunction,
)
return OllamaEmbeddingFunction(
url=config.get("url", "http://localhost:11434/api/embeddings"),
url=config.url if config.url else "http://localhost:11434/api/embeddings",
model_name=model_name,
)
@staticmethod
def _configure_vertexai(config, model_name):
def _configure_vertexai(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.google_embedding_function import (
GoogleVertexEmbeddingFunction,
)
return GoogleVertexEmbeddingFunction(
model_name=model_name,
api_key=config.get("api_key"),
project_id=config.get("project_id"),
region=config.get("region"),
api_key=config.api_key,
project_id=config.project_id,
region=config.region,
)
@staticmethod
def _configure_google(config, model_name):
def _configure_google(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.google_embedding_function import (
GoogleGenerativeAiEmbeddingFunction,
)
return GoogleGenerativeAiEmbeddingFunction(
model_name=model_name,
api_key=config.get("api_key"),
task_type=config.get("task_type"),
api_key=config.api_key,
task_type=config.task_type,
)
@staticmethod
def _configure_cohere(config, model_name):
def _configure_cohere(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.cohere_embedding_function import (
CohereEmbeddingFunction,
)
return CohereEmbeddingFunction(
model_name=model_name,
api_key=config.get("api_key"),
api_key=config.api_key,
)
@staticmethod
def _configure_voyageai(config, model_name):
def _configure_voyageai(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.voyageai_embedding_function import (
VoyageAIEmbeddingFunction,
)
return VoyageAIEmbeddingFunction(
model_name=model_name,
api_key=config.get("api_key"),
api_key=config.api_key,
)
@staticmethod
def _configure_bedrock(config, model_name):
def _configure_bedrock(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.amazon_bedrock_embedding_function import (
AmazonBedrockEmbeddingFunction,
)
# Allow custom model_name override with backwards compatibility
kwargs = {"session": config.get("session")}
kwargs = {"session": config.session}
if model_name is not None:
kwargs["model_name"] = model_name
return AmazonBedrockEmbeddingFunction(**kwargs)
@staticmethod
def _configure_huggingface(config, model_name):
def _configure_huggingface(config: EmbeddingProviderConfig, model_name: str):
from chromadb.utils.embedding_functions.huggingface_embedding_function import (
HuggingFaceEmbeddingServer,
)
return HuggingFaceEmbeddingServer(
url=config.get("api_url"),
url=config.api_url,
)
@staticmethod
def _configure_watson(config, model_name):
def _configure_watson(config: EmbeddingProviderConfig, model_name: str):
try:
import ibm_watsonx_ai.foundation_models as watson_models
from ibm_watsonx_ai import Credentials
@@ -193,12 +273,10 @@ class EmbeddingConfigurator:
}
embedding = watson_models.Embeddings(
model_id=config.get("model"),
model_id=config.model,
params=embed_params,
credentials=Credentials(
api_key=config.get("api_key"), url=config.get("api_url")
),
project_id=config.get("project_id"),
credentials=Credentials(api_key=config.api_key, url=config.api_url),
project_id=config.project_id,
)
try:
@@ -211,8 +289,8 @@ class EmbeddingConfigurator:
return WatsonEmbeddingFunction()
@staticmethod
def _configure_custom(config):
custom_embedder = config.get("embedder")
def _configure_custom(config: EmbeddingProviderConfig):
custom_embedder = config.embedder
if isinstance(custom_embedder, EmbeddingFunction):
try:
validate_embedding_function(custom_embedder)