Files
crewAI/tests/knowledge/knowledge_test.py
Lucas Gomide 2902201bfa
Some checks failed
Notify Downstream / notify-downstream (push) Has been cancelled
Mark stale issues and pull requests / stale (push) Has been cancelled
pytest improvements to handle flaky test (#2726)
* build(dev): add pytest-randomly dependency

By randomizing the test execution order, this helps identify tests
that unintentionally depend on shared state or specific execution
order, which can lead to flaky or unreliable test behavior.

* build(dev): add pytest-timeout

This will prevent a test from running indefinitely

* test: block external requests in CI and set default 10s timeout per test

* test: adding missing cassettes

We notice that those cassettes are missing after enabling block-network on CI

* test: increase tests timeout on CI

* test: fix flaky test ValueError: Circular reference detected (id repeated)

* fix: prevent crash when event handler raises exception

Previously, if a registered event handler raised an exception during execution,
it could crash the entire application or interrupt the event dispatch process.
This change wraps handler execution in a try/except block within the `emit` method,
ensuring that exceptions are caught and logged without affecting other handlers or flow.

This improves the resilience of the event bus, especially when handling third-party
or temporary listeners.
2025-05-01 15:48:29 -04:00

606 lines
27 KiB
Python

"""Test Knowledge creation and querying functionality."""
from pathlib import Path
from typing import List, Union
from unittest.mock import patch
import pytest
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
@pytest.fixture(autouse=True)
def mock_vector_db():
"""Mock vector database operations."""
with patch("crewai.knowledge.storage.knowledge_storage.KnowledgeStorage") as mock:
# Mock the query method to return a predefined response
instance = mock.return_value
instance.query.return_value = [
{
"context": "Brandon's favorite color is blue and he likes Mexican food.",
"score": 0.9,
}
]
instance.reset.return_value = None
yield instance
@pytest.fixture(autouse=True)
def reset_knowledge_storage(mock_vector_db):
"""Fixture to reset knowledge storage before each test."""
yield
def test_single_short_string(mock_vector_db):
# Create a knowledge base with a single short string
content = "Brandon's favorite color is blue and he likes Mexican food."
string_source = StringKnowledgeSource(
content=content, metadata={"preference": "personal"}
)
mock_vector_db.sources = [string_source]
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
# Perform a query
query = "What is Brandon's favorite color?"
results = mock_vector_db.query(query)
# Assert that the results contain the expected information
assert any("blue" in result["context"].lower() for result in results)
# Verify the mock was called
mock_vector_db.query.assert_called_once()
# @pytest.mark.vcr(filter_headers=["authorization"])
def test_single_2k_character_string(mock_vector_db):
# Create a 2k character string with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
string_source = StringKnowledgeSource(
content=content, metadata={"preference": "personal"}
)
mock_vector_db.sources = [string_source]
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
# Perform a query
query = "What is Brandon's favorite movie?"
results = mock_vector_db.query(query)
# Assert that the results contain the expected information
assert any("inception" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_multiple_short_strings(mock_vector_db):
# Create multiple short string sources
contents = [
"Brandon loves hiking.",
"Brandon has a dog named Max.",
"Brandon enjoys painting landscapes.",
]
string_sources = [
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
for content in contents
]
# Mock the vector db query response
mock_vector_db.query.return_value = [
{"context": "Brandon has a dog named Max.", "score": 0.9}
]
mock_vector_db.sources = string_sources
# Perform a query
query = "What is the name of Brandon's pet?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("max" in result["context"].lower() for result in results)
# Verify the mock was called
mock_vector_db.query.assert_called_once()
def test_multiple_2k_character_strings(mock_vector_db):
# Create multiple 2k character strings with various facts about Brandon
contents = [
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
]
string_sources = [
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
for content in contents
]
mock_vector_db.sources = string_sources
mock_vector_db.query.return_value = [{"context": contents[1], "score": 0.9}]
# Perform a query
query = "What is Brandon's favorite book?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result["context"].lower()
for result in results
)
mock_vector_db.query.assert_called_once()
def test_single_short_file(mock_vector_db, tmpdir):
# Create a single short text file
content = "Brandon's favorite sport is basketball."
file_path = Path(tmpdir.join("short_file.txt"))
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(
file_paths=[file_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [file_source]
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
# Perform a query
query = "What sport does Brandon like?"
results = mock_vector_db.query(query)
# Assert that the results contain the expected information
assert any("basketball" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_single_2k_character_file(mock_vector_db, tmpdir):
# Create a single 2k character text file with various facts about Brandon
content = (
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
) * 2 # Repeat to ensure it's 2k characters
file_path = Path(tmpdir.join("long_file.txt"))
with open(file_path, "w") as f:
f.write(content)
file_source = TextFileKnowledgeSource(
file_paths=[file_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [file_source]
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
# Perform a query
query = "What is Brandon's favorite movie?"
results = mock_vector_db.query(query)
# Assert that the results contain the expected information
assert any("inception" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_multiple_short_files(mock_vector_db, tmpdir):
# Create multiple short text files
contents = [
{
"content": "Brandon works as a software engineer.",
"metadata": {"category": "profession", "source": "occupation"},
},
{
"content": "Brandon lives in New York.",
"metadata": {"category": "city", "source": "personal"},
},
{
"content": "Brandon enjoys cooking Italian food.",
"metadata": {"category": "hobby", "source": "personal"},
},
]
file_paths = []
for i, item in enumerate(contents):
file_path = Path(tmpdir.join(f"file_{i}.txt"))
with open(file_path, "w") as f:
f.write(item["content"])
file_paths.append((file_path, item["metadata"]))
file_sources = [
TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
for path, metadata in file_paths
]
mock_vector_db.sources = file_sources
mock_vector_db.query.return_value = [
{"context": "Brandon lives in New York.", "score": 0.9}
]
# Perform a query
query = "What city does he reside in?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("new york" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_multiple_2k_character_files(mock_vector_db, tmpdir):
# Create multiple 2k character text files with various facts about Brandon
contents = [
(
"Brandon loves traveling and has visited over 20 countries. "
"He is fluent in Spanish and often practices with his friends. "
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
"Brandon is an avid cyclist and participates in local cycling events. "
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
"He is also a fan of virtual reality and owns a VR headset. "
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
"He enjoys watching documentaries and learning about history and science. "
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
"He is also a fan of jazz music and often attends live performances. "
"Brandon is a member of a local running club and participates in marathons. "
"He is also a volunteer at a local animal shelter and helps with dog walking. "
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
"He is also a fan of classic movies and has a collection of DVDs. "
"Brandon is a mentor for young professionals and enjoys giving career advice. "
"He is also a fan of puzzles and enjoys solving them in his free time. "
"Brandon's favorite sport is soccer, and he often plays with his friends. "
"He is also a fan of FC Barcelona and enjoys watching their matches. "
)
* 2, # Repeat to ensure it's 2k characters
(
"Brandon is a software engineer who lives in San Francisco. "
"He enjoys hiking and often visits the trails in the Bay Area. "
"Brandon has a pet dog named Max, who is a golden retriever. "
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
"Brandon plays the guitar and often performs at local open mic nights. "
"He is learning French and plans to visit Paris next year. "
"Brandon is passionate about technology and often attends tech meetups in the city. "
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
"Brandon's favorite color is blue, and he often wears blue shirts. "
"He enjoys cooking and often tries new recipes on weekends. "
"Brandon is a morning person and likes to start his day with a run in the park. "
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
"He is also a fan of board games and often hosts game nights at his place. "
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
)
* 2, # Repeat to ensure it's 2k characters
]
file_paths = []
for i, content in enumerate(contents):
file_path = Path(tmpdir.join(f"long_file_{i}.txt"))
with open(file_path, "w") as f:
f.write(content)
file_paths.append(file_path)
file_sources = [
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
for path in file_paths
]
mock_vector_db.sources = file_sources
mock_vector_db.query.return_value = [
{
"context": "Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'.",
"score": 0.9,
}
]
# Perform a query
query = "What is Brandon's favorite book?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any(
"the hitchhiker's guide to the galaxy" in result["context"].lower()
for result in results
)
mock_vector_db.query.assert_called_once()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_hybrid_string_and_files(mock_vector_db, tmpdir):
# Create string sources
string_contents = [
"Brandon is learning French.",
"Brandon visited Paris last summer.",
]
string_sources = [
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
for content in string_contents
]
# Create file sources
file_contents = [
"Brandon prefers tea over coffee.",
"Brandon's favorite book is 'The Alchemist'.",
]
file_paths = []
for i, content in enumerate(file_contents):
file_path = Path(tmpdir.join(f"file_{i}.txt"))
with open(file_path, "w") as f:
f.write(content)
file_paths.append(file_path)
file_sources = [
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
for path in file_paths
]
# Combine string and file sources
mock_vector_db.sources = string_sources + file_sources
mock_vector_db.query.return_value = [{"context": file_contents[1], "score": 0.9}]
# Perform a query
query = "What is Brandon's favorite book?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("the alchemist" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_pdf_knowledge_source(mock_vector_db):
# Get the directory of the current file
current_dir = Path(__file__).parent
# Construct the path to the PDF file
pdf_path = current_dir / "crewai_quickstart.pdf"
# Create a PDFKnowledgeSource
pdf_source = PDFKnowledgeSource(
file_paths=[pdf_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [pdf_source]
mock_vector_db.query.return_value = [
{"context": "crewai create crew latest-ai-development", "score": 0.9}
]
# Perform a query
query = "How do you create a crew?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any(
"crewai create crew latest-ai-development" in result["context"].lower()
for result in results
)
mock_vector_db.query.assert_called_once()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_csv_knowledge_source(mock_vector_db, tmpdir):
"""Test CSVKnowledgeSource with a simple CSV file."""
# Create a CSV file with sample data
csv_content = [
["Name", "Age", "City"],
["Brandon", "30", "New York"],
["Alice", "25", "Los Angeles"],
["Bob", "35", "Chicago"],
]
csv_path = Path(tmpdir.join("data.csv"))
with open(csv_path, "w", encoding="utf-8") as f:
for row in csv_content:
f.write(",".join(row) + "\n")
# Create a CSVKnowledgeSource
csv_source = CSVKnowledgeSource(
file_paths=[csv_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [csv_source]
mock_vector_db.query.return_value = [
{"context": "Brandon is 30 years old.", "score": 0.9}
]
# Perform a query
query = "How old is Brandon?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("30" in result["context"] for result in results)
mock_vector_db.query.assert_called_once()
def test_json_knowledge_source(mock_vector_db, tmpdir):
"""Test JSONKnowledgeSource with a simple JSON file."""
# Create a JSON file with sample data
json_data = {
"people": [
{"name": "Brandon", "age": 30, "city": "New York"},
{"name": "Alice", "age": 25, "city": "Los Angeles"},
{"name": "Bob", "age": 35, "city": "Chicago"},
]
}
json_path = Path(tmpdir.join("data.json"))
with open(json_path, "w", encoding="utf-8") as f:
import json
json.dump(json_data, f)
# Create a JSONKnowledgeSource
json_source = JSONKnowledgeSource(
file_paths=[json_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [json_source]
mock_vector_db.query.return_value = [
{"context": "Alice lives in Los Angeles.", "score": 0.9}
]
# Perform a query
query = "Where does Alice reside?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("los angeles" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
def test_excel_knowledge_source(mock_vector_db, tmpdir):
"""Test ExcelKnowledgeSource with a simple Excel file."""
# Create an Excel file with sample data
import pandas as pd
excel_data = {
"Name": ["Brandon", "Alice", "Bob"],
"Age": [30, 25, 35],
"City": ["New York", "Los Angeles", "Chicago"],
}
df = pd.DataFrame(excel_data)
excel_path = Path(tmpdir.join("data.xlsx"))
df.to_excel(excel_path, index=False)
# Create an ExcelKnowledgeSource
excel_source = ExcelKnowledgeSource(
file_paths=[excel_path], metadata={"preference": "personal"}
)
mock_vector_db.sources = [excel_source]
mock_vector_db.query.return_value = [
{"context": "Brandon is 30 years old.", "score": 0.9}
]
# Perform a query
query = "What is Brandon's age?"
results = mock_vector_db.query(query)
# Assert that the correct information is retrieved
assert any("30" in result["context"] for result in results)
mock_vector_db.query.assert_called_once()
@pytest.mark.vcr
def test_docling_source(mock_vector_db):
docling_source = CrewDoclingSource(
file_paths=[
"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
],
)
mock_vector_db.sources = [docling_source]
mock_vector_db.query.return_value = [
{
"context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
"score": 0.9,
}
]
# Perform a query
query = "What is reward hacking?"
results = mock_vector_db.query(query)
assert any("reward hacking" in result["context"].lower() for result in results)
mock_vector_db.query.assert_called_once()
@pytest.mark.vcr
def test_multiple_docling_sources():
urls: List[Union[Path, str]] = [
"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"https://lilianweng.github.io/posts/2024-07-07-hallucination/",
]
docling_source = CrewDoclingSource(file_paths=urls)
assert docling_source.file_paths == urls
assert docling_source.content is not None
def test_file_path_validation():
"""Test file path validation for knowledge sources."""
current_dir = Path(__file__).parent
pdf_path = current_dir / "crewai_quickstart.pdf"
# Test valid single file_path
source = PDFKnowledgeSource(file_path=pdf_path)
assert source.safe_file_paths == [pdf_path]
# Test valid file_paths list
source = PDFKnowledgeSource(file_paths=[pdf_path])
assert source.safe_file_paths == [pdf_path]
# Test both file_path and file_paths provided (should use file_paths)
source = PDFKnowledgeSource(file_path=pdf_path, file_paths=[pdf_path])
assert source.safe_file_paths == [pdf_path]
# Test neither file_path nor file_paths provided
with pytest.raises(
ValueError,
match="file_path/file_paths must be a Path, str, or a list of these types",
):
PDFKnowledgeSource()