mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-14 02:28:30 +00:00
added additional sources
This commit is contained in:
0
src/crewai/knowledge/embedder/ollama.py
Normal file
0
src/crewai/knowledge/embedder/ollama.py
Normal file
82
src/crewai/knowledge/embedder/ollama_embedder.py
Normal file
82
src/crewai/knowledge/embedder/ollama_embedder.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
from openai import OpenAI
|
||||
|
||||
from .base_embedder import BaseEmbedder
|
||||
|
||||
|
||||
class OllamaEmbedder(BaseEmbedder):
    """
    A wrapper class for text embedding models using Ollama's API.

    Requests are sent through the ``openai`` client, configured with the
    Ollama ``base_url`` given at construction time.
    """

    def __init__(
        self,
        model_name: str,
        api_key: Optional[str] = None,
        base_url: str = "http://localhost:11434/v1",
    ):
        """
        Initialize the embedding model

        Args:
            model_name: Name of the model to use
            api_key: API key (defaults to 'ollama' or environment variable 'OLLAMA_API_KEY')
            base_url: Base URL for the Ollama API (default is 'http://localhost:11434/v1')
        """
        self.model_name = model_name
        # Precedence: explicit argument, then environment, then the
        # placeholder value "ollama".
        self.api_key = api_key or os.getenv("OLLAMA_API_KEY") or "ollama"
        self.base_url = base_url
        self.client = OpenAI(base_url=self.base_url, api_key=self.api_key)
        # Filled in lazily by ``dimension`` so the probe request runs once.
        self._dimension: Optional[int] = None

    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            List of embeddings
        """
        return self.embed_texts(chunks)

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of texts

        The texts are sent in batches of at most 2048 items; an empty list
        results in no request and an empty return value.

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings, one array per returned embedding item
        """
        embeddings: List[np.ndarray] = []
        max_batch_size = 2048  # Adjust batch size if necessary
        for start in range(0, len(texts), max_batch_size):
            batch = texts[start : start + max_batch_size]
            response = self.client.embeddings.create(
                input=batch, model=self.model_name
            )
            embeddings.extend(np.array(item.embedding) for item in response.data)
        return embeddings

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        return self.embed_texts([text])[0]

    @property
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        # Embedding dimensions may vary by model, so the size is measured by
        # embedding a probe string. The result is cached: without the cache
        # every property access would issue another embedding request.
        if self._dimension is None:
            self._dimension = len(self.embed_text("test"))
        return self._dimension
|
||||
85
src/crewai/knowledge/embedder/openai.py
Normal file
85
src/crewai/knowledge/embedder/openai.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
from openai import OpenAI
|
||||
|
||||
from .base_embedder import BaseEmbedder
|
||||
|
||||
|
||||
class OpenAIEmbedder(BaseEmbedder):
    """
    A wrapper class for text embedding models using OpenAI's Embedding API
    """

    # Output sizes of the known OpenAI embedding models. Models not listed
    # here are measured with a probe request in ``dimension``.
    _KNOWN_DIMENSIONS = {
        "text-embedding-ada-002": 1536,
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
    }

    def __init__(
        self,
        model_name: str = "text-embedding-ada-002",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
    ):
        """
        Initialize the embedding model

        Args:
            model_name: Name of the model to use
            api_key: OpenAI API key (falls back to the environment variable
                'OPENAI_API_KEY')
            base_url: Optional override for the API base URL. When None, the
                OpenAI client's own default endpoint is used.

        Raises:
            ValueError: If no API key is given and 'OPENAI_API_KEY' is unset.
        """
        self.model_name = model_name
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError(
                "OpenAI API key must be provided or set in the environment variable 'OPENAI_API_KEY'"
            )
        self.base_url = base_url
        # The client used to be pinned to "http://localhost:11434/v1" (the
        # local Ollama endpoint), so OpenAI credentials were sent to the
        # wrong server. Passing None lets the client pick its default.
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
        # Cache for the probed dimension of models missing from the table.
        self._dimension: Optional[int] = None

    def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of text chunks

        Args:
            chunks: List of text chunks to embed

        Returns:
            List of embeddings
        """
        return self.embed_texts(chunks)

    def embed_texts(self, texts: List[str]) -> List[np.ndarray]:
        """
        Generate embeddings for a list of texts

        The texts are sent in batches of at most 2048 items; an empty list
        results in no request and an empty return value.

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings, one array per returned embedding item
        """
        embeddings: List[np.ndarray] = []
        # The embeddings endpoint accepts at most 2048 inputs per request.
        max_batch_size = 2048
        for start in range(0, len(texts), max_batch_size):
            batch = texts[start : start + max_batch_size]
            response = self.client.embeddings.create(
                input=batch, model=self.model_name
            )
            embeddings.extend(np.array(item.embedding) for item in response.data)
        return embeddings

    def embed_text(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text

        Args:
            text: Text to embed

        Returns:
            Embedding array
        """
        return self.embed_texts([text])[0]

    @property
    def dimension(self) -> int:
        """Get the dimension of the embeddings"""
        known = self._KNOWN_DIMENSIONS.get(self.model_name)
        if known is not None:
            return known
        # Unknown model: a fixed 1536 would be wrong for models with another
        # output size, so measure it once with a probe text and cache it.
        if self._dimension is None:
            self._dimension = len(self.embed_text("test"))
        return self._dimension
|
||||
38
src/crewai/knowledge/source/csv_knowledge_source.py
Normal file
38
src/crewai/knowledge/source/csv_knowledge_source.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import csv
|
||||
from typing import List
|
||||
|
||||
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
|
||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||
|
||||
|
||||
class CSVKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries CSV file content using embeddings."""

    def load_content(self) -> str:
        """
        Load and preprocess CSV file content.

        Returns:
            The file's rows as text: the fields of each row joined by single
            spaces, one row per line, each line terminated by a newline.
        """
        super().load_content()  # Validate the file path
        # newline="" is required by the csv module so that line endings
        # inside quoted fields reach the parser untranslated.
        with open(self.file_path, "r", encoding="utf-8", newline="") as csvfile:
            reader = csv.reader(csvfile)
            # join() builds the text in one pass instead of repeated "+=".
            return "".join(" ".join(row) + "\n" for row in reader)

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Add CSV file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.

        Args:
            embedder: Embedder whose ``embed_chunks`` is called on the new chunks.
        """
        new_chunks = self._chunk_text(self.content)
        self.chunks.extend(new_chunks)
        # Compute embeddings for the new chunks
        new_embeddings = embedder.embed_chunks(new_chunks)
        # Save the embeddings
        self.chunk_embeddings.extend(new_embeddings)

    def _chunk_text(self, text: str) -> List[str]:
        """
        Utility method to split text into chunks.

        Consecutive chunks start ``chunk_size - chunk_overlap`` characters
        apart and are at most ``chunk_size`` characters long.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        # A negative step would make range() empty and silently drop all
        # content; a zero step makes range() raise an unhelpful error.
        if step <= 0:
            raise ValueError(
                "chunk_overlap must be smaller than chunk_size "
                f"(got chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap})"
            )
        return [text[i : i + self.chunk_size] for i in range(0, len(text), step)]
|
||||
48
src/crewai/knowledge/source/excel_knowledge_source.py
Normal file
48
src/crewai/knowledge/source/excel_knowledge_source.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from typing import List
|
||||
|
||||
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
|
||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||
|
||||
|
||||
class ExcelKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries Excel file content using embeddings."""

    def load_content(self) -> str:
        """
        Load and preprocess Excel file content.

        Returns:
            The data read by ``pandas.read_excel`` rendered as CSV text
            without the index column.
        """
        super().load_content()  # Validate the file path
        pd = self._import_dependencies()
        df = pd.read_excel(self.file_path)
        return df.to_csv(index=False)

    def _import_dependencies(self):
        """
        Dynamically import dependencies.

        Returns:
            The imported ``pandas`` module.

        Raises:
            ImportError: If ``openpyxl`` or ``pandas`` cannot be imported;
                the message names the missing package.
        """
        try:
            # Not used directly here; imported so a missing install is
            # reported up front (presumably needed by pandas.read_excel).
            import openpyxl  # noqa: F401
            import pandas as pd
        except ImportError as e:
            # ImportError.name carries the module name. Parsing the message
            # text yields a quoted name or, for some errors, a file path.
            missing_package = e.name or str(e).split()[-1]
            raise ImportError(
                f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
            ) from e
        return pd

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Add Excel file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.

        Args:
            embedder: Embedder whose ``embed_chunks`` is called on the new chunks.
        """
        new_chunks = self._chunk_text(self.content)
        self.chunks.extend(new_chunks)
        # Compute embeddings for the new chunks
        new_embeddings = embedder.embed_chunks(new_chunks)
        # Save the embeddings
        self.chunk_embeddings.extend(new_embeddings)

    def _chunk_text(self, text: str) -> List[str]:
        """
        Utility method to split text into chunks.

        Consecutive chunks start ``chunk_size - chunk_overlap`` characters
        apart and are at most ``chunk_size`` characters long.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        # A negative step would make range() empty and silently drop all
        # content; a zero step makes range() raise an unhelpful error.
        if step <= 0:
            raise ValueError(
                "chunk_overlap must be smaller than chunk_size "
                f"(got chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap})"
            )
        return [text[i : i + self.chunk_size] for i in range(0, len(text), step)]
|
||||
50
src/crewai/knowledge/source/json_knowledge_source.py
Normal file
50
src/crewai/knowledge/source/json_knowledge_source.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import json
|
||||
from typing import Any, List
|
||||
|
||||
from crewai.knowledge.embedder.base_embedder import BaseEmbedder
|
||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||
|
||||
|
||||
class JSONKnowledgeSource(BaseFileKnowledgeSource):
    """A knowledge source that stores and queries JSON file content using embeddings."""

    def load_content(self) -> str:
        """
        Load and preprocess JSON file content.

        Returns:
            The parsed JSON document rendered as text by ``_json_to_text``.
        """
        super().load_content()  # Validate the file path
        with open(self.file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        return self._json_to_text(data)

    def _json_to_text(self, data: Any, level: int = 0) -> str:
        """
        Recursively convert JSON data to a text representation.

        Args:
            data: Parsed JSON value (dict, list, or scalar).
            level: Current nesting depth; sets the indent prefix of each
                emitted line.

        Returns:
            One "key: value" line per dict entry, one "- item" line per list
            element, or ``str(data)`` for any other value.
        """
        indent = " " * level
        if isinstance(data, dict):
            return "".join(
                f"{indent}{key}: {self._json_to_text(value, level + 1)}\n"
                for key, value in data.items()
            )
        if isinstance(data, list):
            return "".join(
                f"{indent}- {self._json_to_text(item, level + 1)}\n"
                for item in data
            )
        return f"{str(data)}"

    def add(self, embedder: BaseEmbedder) -> None:
        """
        Add JSON file content to the knowledge source, chunk it, compute embeddings,
        and save the embeddings.

        Args:
            embedder: Embedder whose ``embed_chunks`` is called on the new chunks.
        """
        new_chunks = self._chunk_text(self.content)
        self.chunks.extend(new_chunks)
        # Compute embeddings for the new chunks
        new_embeddings = embedder.embed_chunks(new_chunks)
        # Save the embeddings
        self.chunk_embeddings.extend(new_embeddings)

    def _chunk_text(self, text: str) -> List[str]:
        """
        Utility method to split text into chunks.

        Consecutive chunks start ``chunk_size - chunk_overlap`` characters
        apart and are at most ``chunk_size`` characters long.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        # A negative step would make range() empty and silently drop all
        # content; a zero step makes range() raise an unhelpful error.
        if step <= 0:
            raise ValueError(
                "chunk_overlap must be smaller than chunk_size "
                f"(got chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap})"
            )
        return [text[i : i + self.chunk_size] for i in range(0, len(text), step)]
|
||||
Reference in New Issue
Block a user