From f1a91c506b4628145984e68e904daa35870ea954 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 22 May 2025 10:15:58 +0000 Subject: [PATCH] Fix CI: Make pgvector an optional dependency, fix SQL injection and type errors Co-Authored-By: Joe Moura --- pyproject.toml | 5 +++ src/crewai/knowledge/storage/__init__.py | 6 ++- .../storage/pgvector_knowledge_storage.py | 44 +++++++++++++------ .../pgvector_knowledge_storage_test.py | 2 - 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6ccea2a1b..67e7ae742 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,6 +67,11 @@ docling = [ aisuite = [ "aisuite>=0.1.10", ] +pgvector = [ + "pgvector>=0.2.0", + "sqlalchemy>=2.0.0", + "psycopg2-binary>=2.9.0", +] [tool.uv] dev-dependencies = [ diff --git a/src/crewai/knowledge/storage/__init__.py b/src/crewai/knowledge/storage/__init__.py index 68ec7285c..e3a143a9f 100644 --- a/src/crewai/knowledge/storage/__init__.py +++ b/src/crewai/knowledge/storage/__init__.py @@ -1 +1,5 @@ -from crewai.knowledge.storage.pgvector_knowledge_storage import PGVectorKnowledgeStorage +try: + from crewai.knowledge.storage.pgvector_knowledge_storage import PGVectorKnowledgeStorage + __all__ = ["PGVectorKnowledgeStorage"] +except ImportError: + __all__ = [] diff --git a/src/crewai/knowledge/storage/pgvector_knowledge_storage.py b/src/crewai/knowledge/storage/pgvector_knowledge_storage.py index 1842b2987..07295d774 100644 --- a/src/crewai/knowledge/storage/pgvector_knowledge_storage.py +++ b/src/crewai/knowledge/storage/pgvector_knowledge_storage.py @@ -2,24 +2,34 @@ from typing import Any, Dict, List, Optional import hashlib import logging import os -from sqlalchemy import create_engine, Column, String, Text, Float +from sqlalchemy import create_engine, Column, String, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker -from pgvector.sqlalchemy import Vector +from sqlalchemy.sql import text from crewai.knowledge.storage.base_knowledge_storage import BaseKnowledgeStorage from crewai.utilities import EmbeddingConfigurator +try: + from pgvector.sqlalchemy import Vector + HAS_PGVECTOR = True +except ImportError: + HAS_PGVECTOR = False + class VectorType: + def __init__(self, dimensions: int): + self.dimensions = dimensions + Vector = VectorType # type: ignore + Base = declarative_base() -class Document(Base): +class Document(Base): # type: ignore """SQLAlchemy model for document storage with pgvector.""" __tablename__ = "documents" id = Column(String, primary_key=True) content = Column(Text) metadata = Column(Text) # JSON serialized metadata - embedding = Column(Vector(1536)) # Adjust dimension based on embedding model + embedding: Column = Column(Vector(1536)) # Adjust dimension based on embedding model class PGVectorKnowledgeStorage(BaseKnowledgeStorage): """ @@ -45,6 +55,11 @@ class PGVectorKnowledgeStorage(BaseKnowledgeStorage): table_name: Name of the table to store documents embedding_dimension: Dimension of the embedding vectors """ + if not HAS_PGVECTOR: + raise ImportError( + "pgvector is not installed. Please install it with: pip install pgvector" + ) + self.connection_string = connection_string self.table_name = table_name self.embedding_dimension = embedding_dimension @@ -94,14 +109,17 @@ class PGVectorKnowledgeStorage(BaseKnowledgeStorage): try: query_embedding = self.embedder([query[0]])[0] - sql_query = f""" - SELECT id, content, metadata, 1 - (embedding <=> '{query_embedding}') as similarity + sql_query = text(f""" + SELECT id, content, metadata, 1 - (embedding <=> :query_embedding) as similarity FROM {self.table_name} - ORDER BY embedding <=> '{query_embedding}' - LIMIT {limit} - """ + ORDER BY embedding <=> :query_embedding + LIMIT :limit + """) - results = session.execute(sql_query).fetchall() + results = session.execute( + sql_query, + {"query_embedding": query_embedding, "limit": limit} + ).fetchall() formatted_results = [] for row in results: @@ -154,9 +172,9 @@ class PGVectorKnowledgeStorage(BaseKnowledgeStorage): existing = session.query(Document).filter(Document.id == doc_id).first() if existing: - existing.content = doc - existing.metadata = str(meta) if meta else None - existing.embedding = embedding + setattr(existing, "content", doc) + setattr(existing, "metadata", str(meta) if meta else None) + setattr(existing, "embedding", embedding) else: new_doc = Document( id=doc_id, diff --git a/tests/knowledge/pgvector_knowledge_storage_test.py b/tests/knowledge/pgvector_knowledge_storage_test.py index 95bb2472d..54f547d5d 100644 --- a/tests/knowledge/pgvector_knowledge_storage_test.py +++ b/tests/knowledge/pgvector_knowledge_storage_test.py @@ -1,6 +1,4 @@ -import os import pytest -from typing import Dict, Any, List from unittest.mock import patch, MagicMock from crewai.knowledge.storage.pgvector_knowledge_storage import PGVectorKnowledgeStorage