mirror of
https://github.com/crewAIInc/crewAI.git
synced 2025-12-16 04:18:35 +00:00
* build(dev): add pytest-randomly dependency By randomizing the test execution order, this helps identify tests that unintentionally depend on shared state or specific execution order, which can lead to flaky or unreliable test behavior. * build(dev): add pytest-timeout This will prevent a test from running indefinitely * test: block external requests in CI and set default 10s timeout per test * test: adding missing cassettes We notice that those cassettes are missing after enabling block-network on CI * test: increase tests timeout on CI * test: fix flaky test ValueError: Circular reference detected (id repeated) * fix: prevent crash when event handler raises exception Previously, if a registered event handler raised an exception during execution, it could crash the entire application or interrupt the event dispatch process. This change wraps handler execution in a try/except block within the `emit` method, ensuring that exceptions are caught and logged without affecting other handlers or flow. This improves the resilience of the event bus, especially when handling third-party or temporary listeners.
606 lines
27 KiB
Python
606 lines
27 KiB
Python
"""Test Knowledge creation and querying functionality."""
|
|
|
|
from pathlib import Path
|
|
from typing import List, Union
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
|
|
from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
|
|
from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
|
|
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
|
|
from crewai.knowledge.source.pdf_knowledge_source import PDFKnowledgeSource
|
|
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
|
|
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def mock_vector_db():
|
|
"""Mock vector database operations."""
|
|
with patch("crewai.knowledge.storage.knowledge_storage.KnowledgeStorage") as mock:
|
|
# Mock the query method to return a predefined response
|
|
instance = mock.return_value
|
|
instance.query.return_value = [
|
|
{
|
|
"context": "Brandon's favorite color is blue and he likes Mexican food.",
|
|
"score": 0.9,
|
|
}
|
|
]
|
|
instance.reset.return_value = None
|
|
yield instance
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def reset_knowledge_storage(mock_vector_db):
|
|
"""Fixture to reset knowledge storage before each test."""
|
|
yield
|
|
|
|
|
|
def test_single_short_string(mock_vector_db):
|
|
# Create a knowledge base with a single short string
|
|
content = "Brandon's favorite color is blue and he likes Mexican food."
|
|
string_source = StringKnowledgeSource(
|
|
content=content, metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [string_source]
|
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
|
# Perform a query
|
|
query = "What is Brandon's favorite color?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the results contain the expected information
|
|
assert any("blue" in result["context"].lower() for result in results)
|
|
# Verify the mock was called
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
# @pytest.mark.vcr(filter_headers=["authorization"])
|
|
def test_single_2k_character_string(mock_vector_db):
|
|
# Create a 2k character string with various facts about Brandon
|
|
content = (
|
|
"Brandon is a software engineer who lives in San Francisco. "
|
|
"He enjoys hiking and often visits the trails in the Bay Area. "
|
|
"Brandon has a pet dog named Max, who is a golden retriever. "
|
|
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
|
|
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
|
|
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
|
|
"Brandon plays the guitar and often performs at local open mic nights. "
|
|
"He is learning French and plans to visit Paris next year. "
|
|
"Brandon is passionate about technology and often attends tech meetups in the city. "
|
|
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
|
|
"Brandon's favorite color is blue, and he often wears blue shirts. "
|
|
"He enjoys cooking and often tries new recipes on weekends. "
|
|
"Brandon is a morning person and likes to start his day with a run in the park. "
|
|
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
|
|
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
|
|
"He is also a fan of board games and often hosts game nights at his place. "
|
|
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
|
|
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
|
|
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
|
|
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
|
|
)
|
|
string_source = StringKnowledgeSource(
|
|
content=content, metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [string_source]
|
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
|
|
|
# Perform a query
|
|
query = "What is Brandon's favorite movie?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the results contain the expected information
|
|
assert any("inception" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_multiple_short_strings(mock_vector_db):
|
|
# Create multiple short string sources
|
|
contents = [
|
|
"Brandon loves hiking.",
|
|
"Brandon has a dog named Max.",
|
|
"Brandon enjoys painting landscapes.",
|
|
]
|
|
string_sources = [
|
|
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
|
|
for content in contents
|
|
]
|
|
|
|
# Mock the vector db query response
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "Brandon has a dog named Max.", "score": 0.9}
|
|
]
|
|
|
|
mock_vector_db.sources = string_sources
|
|
|
|
# Perform a query
|
|
query = "What is the name of Brandon's pet?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any("max" in result["context"].lower() for result in results)
|
|
# Verify the mock was called
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_multiple_2k_character_strings(mock_vector_db):
|
|
# Create multiple 2k character strings with various facts about Brandon
|
|
contents = [
|
|
(
|
|
"Brandon is a software engineer who lives in San Francisco. "
|
|
"He enjoys hiking and often visits the trails in the Bay Area. "
|
|
"Brandon has a pet dog named Max, who is a golden retriever. "
|
|
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
|
|
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
|
|
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
|
|
"Brandon plays the guitar and often performs at local open mic nights. "
|
|
"He is learning French and plans to visit Paris next year. "
|
|
"Brandon is passionate about technology and often attends tech meetups in the city. "
|
|
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
|
|
"Brandon's favorite color is blue, and he often wears blue shirts. "
|
|
"He enjoys cooking and often tries new recipes on weekends. "
|
|
"Brandon is a morning person and likes to start his day with a run in the park. "
|
|
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
|
|
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
|
|
"He is also a fan of board games and often hosts game nights at his place. "
|
|
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
|
|
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
|
|
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
|
|
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
|
|
)
|
|
* 2, # Repeat to ensure it's 2k characters
|
|
(
|
|
"Brandon loves traveling and has visited over 20 countries. "
|
|
"He is fluent in Spanish and often practices with his friends. "
|
|
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
|
|
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
|
|
"Brandon is an avid cyclist and participates in local cycling events. "
|
|
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
|
|
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
|
|
"He is also a fan of virtual reality and owns a VR headset. "
|
|
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
|
|
"He enjoys watching documentaries and learning about history and science. "
|
|
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
|
|
"He is also a fan of jazz music and often attends live performances. "
|
|
"Brandon is a member of a local running club and participates in marathons. "
|
|
"He is also a volunteer at a local animal shelter and helps with dog walking. "
|
|
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
|
|
"He is also a fan of classic movies and has a collection of DVDs. "
|
|
"Brandon is a mentor for young professionals and enjoys giving career advice. "
|
|
"He is also a fan of puzzles and enjoys solving them in his free time. "
|
|
"Brandon's favorite sport is soccer, and he often plays with his friends. "
|
|
"He is also a fan of FC Barcelona and enjoys watching their matches. "
|
|
)
|
|
* 2, # Repeat to ensure it's 2k characters
|
|
]
|
|
string_sources = [
|
|
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
|
|
for content in contents
|
|
]
|
|
|
|
mock_vector_db.sources = string_sources
|
|
mock_vector_db.query.return_value = [{"context": contents[1], "score": 0.9}]
|
|
|
|
# Perform a query
|
|
query = "What is Brandon's favorite book?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any(
|
|
"the hitchhiker's guide to the galaxy" in result["context"].lower()
|
|
for result in results
|
|
)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_single_short_file(mock_vector_db, tmpdir):
|
|
# Create a single short text file
|
|
content = "Brandon's favorite sport is basketball."
|
|
file_path = Path(tmpdir.join("short_file.txt"))
|
|
with open(file_path, "w") as f:
|
|
f.write(content)
|
|
|
|
file_source = TextFileKnowledgeSource(
|
|
file_paths=[file_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [file_source]
|
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
|
# Perform a query
|
|
query = "What sport does Brandon like?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the results contain the expected information
|
|
assert any("basketball" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_single_2k_character_file(mock_vector_db, tmpdir):
|
|
# Create a single 2k character text file with various facts about Brandon
|
|
content = (
|
|
"Brandon is a software engineer who lives in San Francisco. "
|
|
"He enjoys hiking and often visits the trails in the Bay Area. "
|
|
"Brandon has a pet dog named Max, who is a golden retriever. "
|
|
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
|
|
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
|
|
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
|
|
"Brandon plays the guitar and often performs at local open mic nights. "
|
|
"He is learning French and plans to visit Paris next year. "
|
|
"Brandon is passionate about technology and often attends tech meetups in the city. "
|
|
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
|
|
"Brandon's favorite color is blue, and he often wears blue shirts. "
|
|
"He enjoys cooking and often tries new recipes on weekends. "
|
|
"Brandon is a morning person and likes to start his day with a run in the park. "
|
|
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
|
|
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
|
|
"He is also a fan of board games and often hosts game nights at his place. "
|
|
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
|
|
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
|
|
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
|
|
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
|
|
) * 2 # Repeat to ensure it's 2k characters
|
|
file_path = Path(tmpdir.join("long_file.txt"))
|
|
with open(file_path, "w") as f:
|
|
f.write(content)
|
|
|
|
file_source = TextFileKnowledgeSource(
|
|
file_paths=[file_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [file_source]
|
|
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
|
# Perform a query
|
|
query = "What is Brandon's favorite movie?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the results contain the expected information
|
|
assert any("inception" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_multiple_short_files(mock_vector_db, tmpdir):
|
|
# Create multiple short text files
|
|
contents = [
|
|
{
|
|
"content": "Brandon works as a software engineer.",
|
|
"metadata": {"category": "profession", "source": "occupation"},
|
|
},
|
|
{
|
|
"content": "Brandon lives in New York.",
|
|
"metadata": {"category": "city", "source": "personal"},
|
|
},
|
|
{
|
|
"content": "Brandon enjoys cooking Italian food.",
|
|
"metadata": {"category": "hobby", "source": "personal"},
|
|
},
|
|
]
|
|
file_paths = []
|
|
for i, item in enumerate(contents):
|
|
file_path = Path(tmpdir.join(f"file_{i}.txt"))
|
|
with open(file_path, "w") as f:
|
|
f.write(item["content"])
|
|
file_paths.append((file_path, item["metadata"]))
|
|
|
|
file_sources = [
|
|
TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
|
|
for path, metadata in file_paths
|
|
]
|
|
mock_vector_db.sources = file_sources
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "Brandon lives in New York.", "score": 0.9}
|
|
]
|
|
# Perform a query
|
|
query = "What city does he reside in?"
|
|
results = mock_vector_db.query(query)
|
|
# Assert that the correct information is retrieved
|
|
assert any("new york" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_multiple_2k_character_files(mock_vector_db, tmpdir):
|
|
# Create multiple 2k character text files with various facts about Brandon
|
|
contents = [
|
|
(
|
|
"Brandon loves traveling and has visited over 20 countries. "
|
|
"He is fluent in Spanish and often practices with his friends. "
|
|
"Brandon's favorite city is Barcelona, where he enjoys the architecture and culture. "
|
|
"He is a foodie and loves trying new cuisines, with a particular fondness for sushi. "
|
|
"Brandon is an avid cyclist and participates in local cycling events. "
|
|
"He is also a photographer and enjoys capturing landscapes and cityscapes. "
|
|
"Brandon is a tech enthusiast and follows the latest trends in gadgets and software. "
|
|
"He is also a fan of virtual reality and owns a VR headset. "
|
|
"Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'. "
|
|
"He enjoys watching documentaries and learning about history and science. "
|
|
"Brandon is a coffee lover and has a collection of coffee mugs from different countries. "
|
|
"He is also a fan of jazz music and often attends live performances. "
|
|
"Brandon is a member of a local running club and participates in marathons. "
|
|
"He is also a volunteer at a local animal shelter and helps with dog walking. "
|
|
"Brandon's favorite holiday is Christmas, and he enjoys decorating his home. "
|
|
"He is also a fan of classic movies and has a collection of DVDs. "
|
|
"Brandon is a mentor for young professionals and enjoys giving career advice. "
|
|
"He is also a fan of puzzles and enjoys solving them in his free time. "
|
|
"Brandon's favorite sport is soccer, and he often plays with his friends. "
|
|
"He is also a fan of FC Barcelona and enjoys watching their matches. "
|
|
)
|
|
* 2, # Repeat to ensure it's 2k characters
|
|
(
|
|
"Brandon is a software engineer who lives in San Francisco. "
|
|
"He enjoys hiking and often visits the trails in the Bay Area. "
|
|
"Brandon has a pet dog named Max, who is a golden retriever. "
|
|
"He loves reading science fiction books, and his favorite author is Isaac Asimov. "
|
|
"Brandon's favorite movie is Inception, and he enjoys watching it with his friends. "
|
|
"He is also a fan of Mexican cuisine, especially tacos and burritos. "
|
|
"Brandon plays the guitar and often performs at local open mic nights. "
|
|
"He is learning French and plans to visit Paris next year. "
|
|
"Brandon is passionate about technology and often attends tech meetups in the city. "
|
|
"He is also interested in AI and machine learning, and he is currently working on a project related to natural language processing. "
|
|
"Brandon's favorite color is blue, and he often wears blue shirts. "
|
|
"He enjoys cooking and often tries new recipes on weekends. "
|
|
"Brandon is a morning person and likes to start his day with a run in the park. "
|
|
"He is also a coffee enthusiast and enjoys trying different coffee blends. "
|
|
"Brandon is a member of a local book club and enjoys discussing books with fellow members. "
|
|
"He is also a fan of board games and often hosts game nights at his place. "
|
|
"Brandon is an advocate for environmental conservation and volunteers for local clean-up drives. "
|
|
"He is also a mentor for aspiring software developers and enjoys sharing his knowledge with others. "
|
|
"Brandon's favorite sport is basketball, and he often plays with his friends on weekends. "
|
|
"He is also a fan of the Golden State Warriors and enjoys watching their games. "
|
|
)
|
|
* 2, # Repeat to ensure it's 2k characters
|
|
]
|
|
file_paths = []
|
|
for i, content in enumerate(contents):
|
|
file_path = Path(tmpdir.join(f"long_file_{i}.txt"))
|
|
with open(file_path, "w") as f:
|
|
f.write(content)
|
|
file_paths.append(file_path)
|
|
|
|
file_sources = [
|
|
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
|
for path in file_paths
|
|
]
|
|
mock_vector_db.sources = file_sources
|
|
mock_vector_db.query.return_value = [
|
|
{
|
|
"context": "Brandon's favorite book is 'The Hitchhiker's Guide to the Galaxy'.",
|
|
"score": 0.9,
|
|
}
|
|
]
|
|
# Perform a query
|
|
query = "What is Brandon's favorite book?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any(
|
|
"the hitchhiker's guide to the galaxy" in result["context"].lower()
|
|
for result in results
|
|
)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
@pytest.mark.vcr(filter_headers=["authorization"])
|
|
def test_hybrid_string_and_files(mock_vector_db, tmpdir):
|
|
# Create string sources
|
|
string_contents = [
|
|
"Brandon is learning French.",
|
|
"Brandon visited Paris last summer.",
|
|
]
|
|
string_sources = [
|
|
StringKnowledgeSource(content=content, metadata={"preference": "personal"})
|
|
for content in string_contents
|
|
]
|
|
|
|
# Create file sources
|
|
file_contents = [
|
|
"Brandon prefers tea over coffee.",
|
|
"Brandon's favorite book is 'The Alchemist'.",
|
|
]
|
|
file_paths = []
|
|
for i, content in enumerate(file_contents):
|
|
file_path = Path(tmpdir.join(f"file_{i}.txt"))
|
|
with open(file_path, "w") as f:
|
|
f.write(content)
|
|
file_paths.append(file_path)
|
|
|
|
file_sources = [
|
|
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
|
for path in file_paths
|
|
]
|
|
|
|
# Combine string and file sources
|
|
mock_vector_db.sources = string_sources + file_sources
|
|
mock_vector_db.query.return_value = [{"context": file_contents[1], "score": 0.9}]
|
|
|
|
# Perform a query
|
|
query = "What is Brandon's favorite book?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any("the alchemist" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_pdf_knowledge_source(mock_vector_db):
|
|
# Get the directory of the current file
|
|
current_dir = Path(__file__).parent
|
|
# Construct the path to the PDF file
|
|
pdf_path = current_dir / "crewai_quickstart.pdf"
|
|
|
|
# Create a PDFKnowledgeSource
|
|
pdf_source = PDFKnowledgeSource(
|
|
file_paths=[pdf_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [pdf_source]
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "crewai create crew latest-ai-development", "score": 0.9}
|
|
]
|
|
|
|
# Perform a query
|
|
query = "How do you create a crew?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any(
|
|
"crewai create crew latest-ai-development" in result["context"].lower()
|
|
for result in results
|
|
)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
@pytest.mark.vcr(filter_headers=["authorization"])
|
|
def test_csv_knowledge_source(mock_vector_db, tmpdir):
|
|
"""Test CSVKnowledgeSource with a simple CSV file."""
|
|
|
|
# Create a CSV file with sample data
|
|
csv_content = [
|
|
["Name", "Age", "City"],
|
|
["Brandon", "30", "New York"],
|
|
["Alice", "25", "Los Angeles"],
|
|
["Bob", "35", "Chicago"],
|
|
]
|
|
csv_path = Path(tmpdir.join("data.csv"))
|
|
with open(csv_path, "w", encoding="utf-8") as f:
|
|
for row in csv_content:
|
|
f.write(",".join(row) + "\n")
|
|
|
|
# Create a CSVKnowledgeSource
|
|
csv_source = CSVKnowledgeSource(
|
|
file_paths=[csv_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [csv_source]
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "Brandon is 30 years old.", "score": 0.9}
|
|
]
|
|
|
|
# Perform a query
|
|
query = "How old is Brandon?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any("30" in result["context"] for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_json_knowledge_source(mock_vector_db, tmpdir):
|
|
"""Test JSONKnowledgeSource with a simple JSON file."""
|
|
|
|
# Create a JSON file with sample data
|
|
json_data = {
|
|
"people": [
|
|
{"name": "Brandon", "age": 30, "city": "New York"},
|
|
{"name": "Alice", "age": 25, "city": "Los Angeles"},
|
|
{"name": "Bob", "age": 35, "city": "Chicago"},
|
|
]
|
|
}
|
|
json_path = Path(tmpdir.join("data.json"))
|
|
with open(json_path, "w", encoding="utf-8") as f:
|
|
import json
|
|
|
|
json.dump(json_data, f)
|
|
|
|
# Create a JSONKnowledgeSource
|
|
json_source = JSONKnowledgeSource(
|
|
file_paths=[json_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [json_source]
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "Alice lives in Los Angeles.", "score": 0.9}
|
|
]
|
|
|
|
# Perform a query
|
|
query = "Where does Alice reside?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any("los angeles" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
def test_excel_knowledge_source(mock_vector_db, tmpdir):
|
|
"""Test ExcelKnowledgeSource with a simple Excel file."""
|
|
|
|
# Create an Excel file with sample data
|
|
import pandas as pd
|
|
|
|
excel_data = {
|
|
"Name": ["Brandon", "Alice", "Bob"],
|
|
"Age": [30, 25, 35],
|
|
"City": ["New York", "Los Angeles", "Chicago"],
|
|
}
|
|
df = pd.DataFrame(excel_data)
|
|
excel_path = Path(tmpdir.join("data.xlsx"))
|
|
df.to_excel(excel_path, index=False)
|
|
|
|
# Create an ExcelKnowledgeSource
|
|
excel_source = ExcelKnowledgeSource(
|
|
file_paths=[excel_path], metadata={"preference": "personal"}
|
|
)
|
|
mock_vector_db.sources = [excel_source]
|
|
mock_vector_db.query.return_value = [
|
|
{"context": "Brandon is 30 years old.", "score": 0.9}
|
|
]
|
|
|
|
# Perform a query
|
|
query = "What is Brandon's age?"
|
|
results = mock_vector_db.query(query)
|
|
|
|
# Assert that the correct information is retrieved
|
|
assert any("30" in result["context"] for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
@pytest.mark.vcr
|
|
def test_docling_source(mock_vector_db):
|
|
docling_source = CrewDoclingSource(
|
|
file_paths=[
|
|
"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
|
|
],
|
|
)
|
|
mock_vector_db.sources = [docling_source]
|
|
mock_vector_db.query.return_value = [
|
|
{
|
|
"context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
|
|
"score": 0.9,
|
|
}
|
|
]
|
|
# Perform a query
|
|
query = "What is reward hacking?"
|
|
results = mock_vector_db.query(query)
|
|
assert any("reward hacking" in result["context"].lower() for result in results)
|
|
mock_vector_db.query.assert_called_once()
|
|
|
|
|
|
@pytest.mark.vcr
|
|
def test_multiple_docling_sources():
|
|
urls: List[Union[Path, str]] = [
|
|
"https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
|
|
"https://lilianweng.github.io/posts/2024-07-07-hallucination/",
|
|
]
|
|
docling_source = CrewDoclingSource(file_paths=urls)
|
|
|
|
assert docling_source.file_paths == urls
|
|
assert docling_source.content is not None
|
|
|
|
|
|
def test_file_path_validation():
|
|
"""Test file path validation for knowledge sources."""
|
|
current_dir = Path(__file__).parent
|
|
pdf_path = current_dir / "crewai_quickstart.pdf"
|
|
|
|
# Test valid single file_path
|
|
source = PDFKnowledgeSource(file_path=pdf_path)
|
|
assert source.safe_file_paths == [pdf_path]
|
|
|
|
# Test valid file_paths list
|
|
source = PDFKnowledgeSource(file_paths=[pdf_path])
|
|
assert source.safe_file_paths == [pdf_path]
|
|
|
|
# Test both file_path and file_paths provided (should use file_paths)
|
|
source = PDFKnowledgeSource(file_path=pdf_path, file_paths=[pdf_path])
|
|
assert source.safe_file_paths == [pdf_path]
|
|
|
|
# Test neither file_path nor file_paths provided
|
|
with pytest.raises(
|
|
ValueError,
|
|
match="file_path/file_paths must be a Path, str, or a list of these types",
|
|
):
|
|
PDFKnowledgeSource()
|