diff --git a/pyproject.toml b/pyproject.toml index f287e5484..b21ab71ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,12 @@ fastembed = ["fastembed>=0.4.1"] pdfplumber = [ "pdfplumber>=0.11.4", ] +pandas = [ + "pandas>=2.2.3", +] +openpyxl = [ + "openpyxl>=3.1.5", +] [tool.uv] dev-dependencies = [ diff --git a/src/crewai/knowledge/embedder/ollama.py b/src/crewai/knowledge/embedder/ollama.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/crewai/knowledge/embedder/ollama_embedder.py b/src/crewai/knowledge/embedder/ollama_embedder.py new file mode 100644 index 000000000..3f7521cab --- /dev/null +++ b/src/crewai/knowledge/embedder/ollama_embedder.py @@ -0,0 +1,82 @@ +import os +from typing import List, Optional + +import numpy as np +from openai import OpenAI + +from .base_embedder import BaseEmbedder + + +class OllamaEmbedder(BaseEmbedder): + """ + A wrapper class for text embedding models using Ollama's API + """ + + def __init__( + self, + model_name: str, + api_key: Optional[str] = None, + base_url: str = "http://localhost:11434/v1", + ): + """ + Initialize the embedding model + + Args: + model_name: Name of the model to use + api_key: API key (defaults to 'ollama' or environment variable 'OLLAMA_API_KEY') + base_url: Base URL for the Ollama API (default is 'http://localhost:11434/v1') + """ + self.model_name = model_name + self.api_key = api_key or os.getenv("OLLAMA_API_KEY") or "ollama" + self.base_url = base_url + self.client = OpenAI(base_url=self.base_url, api_key=self.api_key) + + def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]: + """ + Generate embeddings for a list of text chunks + + Args: + chunks: List of text chunks to embed + + Returns: + List of embeddings + """ + return self.embed_texts(chunks) + + def embed_texts(self, texts: List[str]) -> List[np.ndarray]: + """ + Generate embeddings for a list of texts + + Args: + texts: List of texts to embed + + Returns: + List of embeddings + """ + embeddings = [] + max_batch_size = 2048 # Adjust batch size if necessary + for i in range(0, len(texts), max_batch_size): + batch = texts[i : i + max_batch_size] + response = self.client.embeddings.create(input=batch, model=self.model_name) + batch_embeddings = [np.array(item.embedding) for item in response.data] + embeddings.extend(batch_embeddings) + return embeddings + + def embed_text(self, text: str) -> np.ndarray: + """ + Generate embedding for a single text + + Args: + text: Text to embed + + Returns: + Embedding array + """ + return self.embed_texts([text])[0] + + @property + def dimension(self) -> int: + """Get the dimension of the embeddings""" + # Embedding dimensions may vary; we'll determine it dynamically + test_embed = self.embed_text("test") + return len(test_embed) diff --git a/src/crewai/knowledge/embedder/openai.py b/src/crewai/knowledge/embedder/openai.py new file mode 100644 index 000000000..d38376bdc --- /dev/null +++ b/src/crewai/knowledge/embedder/openai.py @@ -0,0 +1,85 @@ +import os +from typing import List, Optional + +import numpy as np +from openai import OpenAI + +from .base_embedder import BaseEmbedder + + +class OpenAIEmbedder(BaseEmbedder): + """ + A wrapper class for text embedding models using OpenAI's Embedding API + """ + + def __init__( + self, + model_name: str = "text-embedding-ada-002", + api_key: Optional[str] = None, + ): + """ + Initialize the embedding model + + Args: + model_name: Name of the model to use + api_key: OpenAI API key + """ + self.model_name = model_name + self.api_key = api_key or 
os.getenv("OPENAI_API_KEY") + if not self.api_key: + raise ValueError( + "OpenAI API key must be provided or set in the environment variable 'OPENAI_API_KEY'" + ) + self.client = OpenAI( + api_key=self.api_key, + base_url="http://localhost:11434/v1", + ) + + def embed_chunks(self, chunks: List[str]) -> List[np.ndarray]: + """ + Generate embeddings for a list of text chunks + + Args: + chunks: List of text chunks to embed + + Returns: + List of embeddings + """ + return self.embed_texts(chunks) + + def embed_texts(self, texts: List[str]) -> List[np.ndarray]: + """ + Generate embeddings for a list of texts + + Args: + texts: List of texts to embed + + Returns: + List of embeddings + """ + embeddings = [] + max_batch_size = 2048 # OpenAI recommends smaller batch sizes + for i in range(0, len(texts), max_batch_size): + batch = texts[i : i + max_batch_size] + response = self.client.embeddings.create(input=batch, model=self.model_name) + batch_embeddings = [np.array(data.embedding) for data in response.data] + embeddings.extend(batch_embeddings) + return embeddings + + def embed_text(self, text: str) -> np.ndarray: + """ + Generate embedding fors a single text + + Args: + text: Text to embed + + Returns: + Embedding array + """ + return self.embed_texts([text])[0] + + @property + def dimension(self) -> int: + """Get the dimension of the embeddings""" + # For OpenAI's text-embedding-ada-002, the dimension is 1536 + return 1536 diff --git a/src/crewai/knowledge/source/csv_knowledge_source.py b/src/crewai/knowledge/source/csv_knowledge_source.py new file mode 100644 index 000000000..eea307a33 --- /dev/null +++ b/src/crewai/knowledge/source/csv_knowledge_source.py @@ -0,0 +1,38 @@ +import csv +from typing import List + +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource + + +class CSVKnowledgeSource(BaseFileKnowledgeSource): + """A knowledge source that stores and queries CSV file content using embeddings.""" + + def load_content(self) -> str: + """Load and preprocess CSV file content.""" + super().load_content() # Validate the file path + with open(self.file_path, "r", encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + content = "" + for row in reader: + content += " ".join(row) + "\n" + return content + + def add(self, embedder: BaseEmbedder) -> None: + """ + Add CSV file content to the knowledge source, chunk it, compute embeddings, + and save the embeddings. 
+ """ + new_chunks = self._chunk_text(self.content) + self.chunks.extend(new_chunks) + # Compute embeddings for the new chunks + new_embeddings = embedder.embed_chunks(new_chunks) + # Save the embeddings + self.chunk_embeddings.extend(new_embeddings) + + def _chunk_text(self, text: str) -> List[str]: + """Utility method to split text into chunks.""" + return [ + text[i : i + self.chunk_size] + for i in range(0, len(text), self.chunk_size - self.chunk_overlap) + ] diff --git a/src/crewai/knowledge/source/excel_knowledge_source.py b/src/crewai/knowledge/source/excel_knowledge_source.py new file mode 100644 index 000000000..67c026d56 --- /dev/null +++ b/src/crewai/knowledge/source/excel_knowledge_source.py @@ -0,0 +1,48 @@ +from typing import List + +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource + + +class ExcelKnowledgeSource(BaseFileKnowledgeSource): + """A knowledge source that stores and queries Excel file content using embeddings.""" + + def load_content(self) -> str: + """Load and preprocess Excel file content.""" + super().load_content() # Validate the file path + pd = self._import_dependencies() + df = pd.read_excel(self.file_path) + content = df.to_csv(index=False) + return content + + def _import_dependencies(self): + """Dynamically import dependencies.""" + try: + import openpyxl + import pandas as pd + + return pd + except ImportError as e: + missing_package = str(e).split()[-1] + raise ImportError( + f"{missing_package} is not installed. Please install it with: pip install {missing_package}" + ) + + def add(self, embedder: BaseEmbedder) -> None: + """ + Add Excel file content to the knowledge source, chunk it, compute embeddings, + and save the embeddings. 
+ """ + new_chunks = self._chunk_text(self.content) + self.chunks.extend(new_chunks) + # Compute embeddings for the new chunks + new_embeddings = embedder.embed_chunks(new_chunks) + # Save the embeddings + self.chunk_embeddings.extend(new_embeddings) + + def _chunk_text(self, text: str) -> List[str]: + """Utility method to split text into chunks.""" + return [ + text[i : i + self.chunk_size] + for i in range(0, len(text), self.chunk_size - self.chunk_overlap) + ] diff --git a/src/crewai/knowledge/source/json_knowledge_source.py b/src/crewai/knowledge/source/json_knowledge_source.py new file mode 100644 index 000000000..351e3e207 --- /dev/null +++ b/src/crewai/knowledge/source/json_knowledge_source.py @@ -0,0 +1,50 @@ +import json +from typing import Any, List + +from crewai.knowledge.embedder.base_embedder import BaseEmbedder +from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource + + +class JSONKnowledgeSource(BaseFileKnowledgeSource): + """A knowledge source that stores and queries JSON file content using embeddings.""" + + def load_content(self) -> str: + """Load and preprocess JSON file content.""" + super().load_content() # Validate the file path + with open(self.file_path, "r", encoding="utf-8") as json_file: + data = json.load(json_file) + content = self._json_to_text(data) + return content + + def _json_to_text(self, data: Any, level: int = 0) -> str: + """Recursively convert JSON data to a text representation.""" + text = "" + indent = " " * level + if isinstance(data, dict): + for key, value in data.items(): + text += f"{indent}{key}: {self._json_to_text(value, level + 1)}\n" + elif isinstance(data, list): + for item in data: + text += f"{indent}- {self._json_to_text(item, level + 1)}\n" + else: + text += f"{str(data)}" + return text + + def add(self, embedder: BaseEmbedder) -> None: + """ + Add JSON file content to the knowledge source, chunk it, compute embeddings, + and save the embeddings. + """ + new_chunks = self._chunk_text(self.content) + self.chunks.extend(new_chunks) + # Compute embeddings for the new chunks + new_embeddings = embedder.embed_chunks(new_chunks) + # Save the embeddings + self.chunk_embeddings.extend(new_embeddings) + + def _chunk_text(self, text: str) -> List[str]: + """Utility method to split text into chunks.""" + return [ + text[i : i + self.chunk_size] + for i in range(0, len(text), self.chunk_size - self.chunk_overlap) + ] diff --git a/tests/agent_test.py b/tests/agent_test.py index 6fcd79d2e..d9873d548 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -1577,7 +1577,7 @@ def test_agent_execute_task_with_ollama(): assert "AI" in result or "artificial intelligence" in result.lower() -# @pytest.mark.vcr(filter_headers=["authorization"]) +@pytest.mark.vcr(filter_headers=["authorization"]) def test_agent_with_knowledge_sources(): # Create a knowledge source with some content content = "Brandon's favorite color is blue and he likes Mexican food." 
diff --git a/tests/knowledge/knowledge_test.py b/tests/knowledge/knowledge_test.py
index c77b06dee..806d765e5 100644
--- a/tests/knowledge/knowledge_test.py
+++ b/tests/knowledge/knowledge_test.py
@@ -3,6 +3,9 @@
 from pathlib import Path
 
 from crewai.knowledge.knowledge import Knowledge
+from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
+from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
+from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
 from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
 from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
 from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
@@ -345,3 +348,86 @@ def test_pdf_knowledge_source():
         "crewai create crew latest-ai-development" in result.lower()
         for result in results
     )
+
+
+def test_csv_knowledge_source(tmpdir):
+    """Test CSVKnowledgeSource with a simple CSV file."""
+
+    # Create a CSV file with sample data
+    csv_content = [
+        ["Name", "Age", "City"],
+        ["Brandon", "30", "New York"],
+        ["Alice", "25", "Los Angeles"],
+        ["Bob", "35", "Chicago"],
+    ]
+    csv_path = Path(tmpdir.join("data.csv"))
+    with open(csv_path, "w", encoding="utf-8") as f:
+        for row in csv_content:
+            f.write(",".join(row) + "\n")
+
+    # Create a CSVKnowledgeSource
+    csv_source = CSVKnowledgeSource(file_path=csv_path)
+    knowledge_base = Knowledge(sources=[csv_source])
+
+    # Perform a query
+    query = "How old is Brandon?"
+    results = knowledge_base.query(query)
+
+    # Assert that the correct information is retrieved
+    assert any("30" in result for result in results)
+
+
+def test_json_knowledge_source(tmpdir):
+    """Test JSONKnowledgeSource with a simple JSON file."""
+
+    # Create a JSON file with sample data
+    json_data = {
+        "people": [
+            {"name": "Brandon", "age": 30, "city": "New York"},
+            {"name": "Alice", "age": 25, "city": "Los Angeles"},
+            {"name": "Bob", "age": 35, "city": "Chicago"},
+        ]
+    }
+    import json
+
+    json_path = Path(tmpdir.join("data.json"))
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(json_data, f)
+
+    # Create a JSONKnowledgeSource
+    json_source = JSONKnowledgeSource(file_path=json_path)
+    knowledge_base = Knowledge(sources=[json_source])
+
+    # Perform a query
+    query = "Where does Brandon live?"
+    results = knowledge_base.query(query)
+
+    # Assert that the correct information is retrieved
+    assert any("New York" in result for result in results)
+
+
+def test_excel_knowledge_source(tmpdir):
+    """Test ExcelKnowledgeSource with a simple Excel file."""
+
+    # Create an Excel file with sample data
+    import pandas as pd
+
+    excel_data = {
+        "Name": ["Brandon", "Alice", "Bob"],
+        "Age": [30, 25, 35],
+        "City": ["New York", "Los Angeles", "Chicago"],
+    }
+    df = pd.DataFrame(excel_data)
+    excel_path = Path(tmpdir.join("data.xlsx"))
+    df.to_excel(excel_path, index=False)
+
+    # Create an ExcelKnowledgeSource
+    excel_source = ExcelKnowledgeSource(file_path=excel_path)
+    knowledge_base = Knowledge(sources=[excel_source])
+
+    # Perform a query
+    query = "What is Brandon's age?"
+ results = knowledge_base.query(query) + + # Assert that the correct information is retrieved + assert any("30" in result for result in results) diff --git a/uv.lock b/uv.lock index 13a896f7a..c3914ece4 100644 --- a/uv.lock +++ b/uv.lock @@ -637,8 +637,11 @@ agentops = [ fastembed = [ { name = "fastembed" }, ] -network = [ - { name = "pdfplumber" }, +openpyxl = [ + { name = "openpyxl" }, +] +pandas = [ + { name = "pandas" }, ] pdfplumber = [ { name = "pdfplumber" }, @@ -683,10 +686,11 @@ requires-dist = [ { name = "langchain", specifier = ">=0.2.16" }, { name = "litellm", specifier = ">=1.44.22" }, { name = "openai", specifier = ">=1.13.3" }, + { name = "openpyxl", marker = "extra == 'openpyxl'", specifier = ">=3.1.5" }, { name = "opentelemetry-api", specifier = ">=1.22.0" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.22.0" }, { name = "opentelemetry-sdk", specifier = ">=1.22.0" }, - { name = "pdfplumber", marker = "extra == 'network'", specifier = ">=0.11.4" }, + { name = "pandas", marker = "extra == 'pandas'", specifier = ">=2.2.3" }, { name = "pdfplumber", marker = "extra == 'pdfplumber'", specifier = ">=0.11.4" }, { name = "pydantic", specifier = ">=2.4.2" }, { name = "python-dotenv", specifier = ">=1.0.0" }, @@ -931,6 +935,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/51/0c78d26da4afbe68370306669556b274f1021cac02f3155d8da2be407763/embedchain-0.1.123-py3-none-any.whl", hash = "sha256:1210e993b6364d7c702b6bd44b053fc244dd77f2a65ea4b90b62709114ea6c25", size = 210909 }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -2671,6 +2684,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ad/31/28a83e124e9f9dd04c83b5aeb6f8b1770f45addde4dd3d34d9a9091590ad/openai-1.52.1-py3-none-any.whl", hash = "sha256:f23e83df5ba04ee0e82c8562571e8cb596cd88f9a84ab783e6c6259e5ffbfb4a", size = 386945 }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, +] + [[package]] name = "opentelemetry-api" version = "1.27.0"
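The new `OllamaEmbedder` is not exercised by the tests above. A hypothetical usage sketch, assuming a local Ollama server on the default port and an already-pulled embedding model (`nomic-embed-text` is only an example name, not something this diff ships):

```python
# Hypothetical usage of the new OllamaEmbedder; not taken from the tests.
# Requires a running Ollama server at http://localhost:11434 and an
# embedding model pulled locally (model name here is an example).
from crewai.knowledge.embedder.ollama_embedder import OllamaEmbedder

embedder = OllamaEmbedder(model_name="nomic-embed-text")
vector = embedder.embed_text("Brandon's favorite color is blue.")
print(vector.shape)        # e.g. (768,) -- dimension depends on the model
print(embedder.dimension)  # probed dynamically via a throwaway embedding
```

Note that `dimension` issues a real embedding call each time it is read, so callers may want to cache it.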