fix: Remove kwargs from all RagTools (#285)

* fix: remove kwargs from all (except mysql & pg) RagTools

The agent uses the tool description to decide what to propagate when a tool with **kwargs is found, but this often leads to failures during the tool invocation step.

This happens because the final description ends up like this:

```
CrewStructuredTool(name='Knowledge base', description='Tool Name: Knowledge base
Tool Arguments: {'query': {'description': None, 'type': 'str'}, 'kwargs': {'description': None, 'type': 'Any'}}
Tool Description: A knowledge base that can be used to answer questions.')
```

The agent then tries to infer and pass a kwargs parameter, which isn’t supported by the schema at all.

* feat: adding test to search tools

* feat: add db (chromadb folder) to .gitignore

* fix: fix github search integration

A few attributes were missing when calling the .add method: data_type and loader.

Also, update the search query to follow the EmbedChain documentation: the query must include the `type` and `repo` keys.

* fix: rollback YoutubeChannel parameter

* chore: fix type hinting for CodeDocs search

* fix: ensure proper configuration when calling `add`

According to the documentation, some search methods must be defined as either a loader or a data_type. This commit ensures that.

* build: add optional-dependencies for github and xml search

* test: mocking external requests from search_tool tests

* build: add pytest-recording as a dev dependency
This commit is contained in:
Lucas Gomide
2025-05-05 15:15:50 -03:00
committed by GitHub
parent 93d043bcd4
commit fd4ef4f47a
23 changed files with 2051 additions and 279 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,309 @@
import os
import tempfile
from pathlib import Path
from unittest.mock import ANY, MagicMock
import pytest
from embedchain.models.data_type import DataType
from crewai_tools.tools import (
CodeDocsSearchTool,
CSVSearchTool,
DirectorySearchTool,
DOCXSearchTool,
GithubSearchTool,
JSONSearchTool,
MDXSearchTool,
PDFSearchTool,
TXTSearchTool,
WebsiteSearchTool,
XMLSearchTool,
YoutubeChannelSearchTool,
YoutubeVideoSearchTool,
)
from crewai_tools.tools.rag.rag_tool import Adapter
# Module-level pytest marks: every test in this file gets the ``vcr`` marker.
# NOTE(review): ``filter_headers`` presumably keeps the "authorization" header
# out of recorded cassettes — confirm against the recording plugin's docs.
pytestmark = [pytest.mark.vcr(filter_headers=["authorization"])]
@pytest.fixture
def mock_adapter():
    """Provide a ``MagicMock`` whose attributes are restricted to the ``Adapter`` spec."""
    return MagicMock(spec=Adapter)
def test_directory_search_tool():
    """Run ``DirectorySearchTool`` over a temporary directory holding one text file.

    The test passes when the lower-cased result contains ``"test file"``.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Seed the directory with a single file for the tool to search.
        sample = Path(workdir) / "test.txt"
        sample.write_text("This is a test file for directory search")

        searcher = DirectorySearchTool(directory=workdir)
        answer = searcher._run(search_query="test file")

        assert "test file" in answer.lower()
def test_pdf_search_tool(mock_adapter):
    """Check ``PDFSearchTool`` forwards the PDF and the query to the adapter.

    Two scenarios are covered: the PDF supplied at construction time, and the
    PDF supplied in the ``_run`` call. In both, ``adapter.add`` must be called
    once with the path and ``DataType.PDF_FILE``, and ``adapter.query`` once
    with the query text.
    """
    pdf_path = "test.pdf"
    question = "test content"
    mock_adapter.query.return_value = "this is a test"

    # Scenario 1: the PDF is bound when the tool is created.
    preconfigured = PDFSearchTool(pdf=pdf_path, adapter=mock_adapter)
    answer = preconfigured._run(query=question)
    assert "this is a test" in answer.lower()
    mock_adapter.add.assert_called_once_with(pdf_path, data_type=DataType.PDF_FILE)
    mock_adapter.query.assert_called_once_with(question)

    # Clear recorded calls so the "called once" checks start fresh.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the PDF is passed alongside the query at run time.
    on_demand = PDFSearchTool(adapter=mock_adapter)
    answer = on_demand._run(pdf=pdf_path, query=question)
    assert "this is a test" in answer.lower()
    mock_adapter.add.assert_called_once_with(pdf_path, data_type=DataType.PDF_FILE)
    mock_adapter.query.assert_called_once_with(question)
def test_txt_search_tool():
    """Add a temporary ``.txt`` file to ``TXTSearchTool`` and query it.

    The file is created with ``delete=False`` so it outlives the ``with``
    block; it is removed in ``finally`` whether or not the assertion passes.
    """
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as handle:
        handle.write(b"This is a test file for txt search")
        txt_path = handle.name

    try:
        searcher = TXTSearchTool()
        searcher.add(txt_path)
        answer = searcher._run(search_query="test file")
        assert "test file" in answer.lower()
    finally:
        os.unlink(txt_path)
def test_docx_search_tool(mock_adapter):
    """Check ``DOCXSearchTool`` forwards the document and the query to the adapter.

    Covers the document supplied at construction time and at run time. Each
    scenario expects one ``adapter.add`` call with the path and
    ``DataType.DOCX``, and one ``adapter.query`` call with the search text.
    """
    docx_path = "test.docx"
    question = "test content"
    mock_adapter.query.return_value = "this is a test"

    # Scenario 1: the document is bound when the tool is created.
    preconfigured = DOCXSearchTool(docx=docx_path, adapter=mock_adapter)
    answer = preconfigured._run(search_query=question)
    assert "this is a test" in answer.lower()
    mock_adapter.add.assert_called_once_with(docx_path, data_type=DataType.DOCX)
    mock_adapter.query.assert_called_once_with(question)

    # Clear recorded calls before the second scenario.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the document is passed in the run call.
    on_demand = DOCXSearchTool(adapter=mock_adapter)
    answer = on_demand._run(docx=docx_path, search_query=question)
    assert "this is a test" in answer.lower()
    mock_adapter.add.assert_called_once_with(docx_path, data_type=DataType.DOCX)
    mock_adapter.query.assert_called_once_with(question)
def test_json_search_tool():
    """Query ``JSONSearchTool`` with a temporary JSON file passed at run time.

    The temporary file is always removed in ``finally``.
    """
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as handle:
        handle.write(b'{"test": "This is a test JSON file"}')
        json_file = handle.name

    try:
        searcher = JSONSearchTool()
        answer = searcher._run(search_query="test JSON", json_path=json_file)
        assert "test json" in answer.lower()
    finally:
        os.unlink(json_file)
def test_xml_search_tool(mock_adapter):
    """Check ``XMLSearchTool`` adds the run-time XML path and queries the adapter.

    ``adapter.add`` is expected to receive only the path, with no extra
    keyword arguments.
    """
    xml_path = "test.xml"
    question = "test XML"
    mock_adapter.query.return_value = "this is a test"

    searcher = XMLSearchTool(adapter=mock_adapter)
    answer = searcher._run(search_query=question, xml=xml_path)

    assert "this is a test" in answer.lower()
    mock_adapter.add.assert_called_once_with(xml_path)
    mock_adapter.query.assert_called_once_with(question)
def test_csv_search_tool():
    """Add a temporary CSV file to ``CSVSearchTool`` and query it.

    The temporary file is always removed in ``finally``.
    """
    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as handle:
        handle.write(b"name,description\ntest,This is a test CSV file")
        csv_path = handle.name

    try:
        searcher = CSVSearchTool()
        searcher.add(csv_path)
        answer = searcher._run(search_query="test CSV")
        assert "test csv" in answer.lower()
    finally:
        os.unlink(csv_path)
def test_mdx_search_tool():
    """Add a temporary MDX file to ``MDXSearchTool`` and query it.

    The temporary file is always removed in ``finally``.
    """
    with tempfile.NamedTemporaryFile(suffix=".mdx", delete=False) as handle:
        handle.write(b"# Test MDX\nThis is a test MDX file")
        mdx_path = handle.name

    try:
        searcher = MDXSearchTool()
        searcher.add(mdx_path)
        answer = searcher._run(search_query="test MDX")
        assert "test mdx" in answer.lower()
    finally:
        os.unlink(mdx_path)
def test_website_search_tool(mock_adapter):
    """Check ``WebsiteSearchTool`` forwards the site URL and query to the adapter.

    Covers the website supplied at construction time and at run time. Each
    scenario expects one ``adapter.query`` call with the question and one
    ``adapter.add`` call with the URL and ``DataType.WEB_PAGE``.
    """
    website = "https://crewai.com"
    search_query = "what is crewai?"
    mock_adapter.query.return_value = "this is a test"

    def verify(answer):
        # Same checks, in the same order, for both scenarios.
        mock_adapter.query.assert_called_once_with("what is crewai?")
        mock_adapter.add.assert_called_once_with(website, data_type=DataType.WEB_PAGE)
        assert "this is a test" in answer.lower()

    # Scenario 1: the website is bound when the tool is created.
    preconfigured = WebsiteSearchTool(website=website, adapter=mock_adapter)
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls before the second scenario.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the website is passed in the run call.
    on_demand = WebsiteSearchTool(adapter=mock_adapter)
    verify(on_demand._run(website=website, search_query=search_query))
def test_youtube_video_search_tool(mock_adapter):
    """Check ``YoutubeVideoSearchTool`` forwards the video URL and query to the adapter.

    Covers the URL supplied at construction time and at run time. Each
    scenario expects one ``adapter.add`` call with the URL and
    ``DataType.YOUTUBE_VIDEO``, and one ``adapter.query`` call with the question.
    """
    youtube_video_url = "https://www.youtube.com/watch?v=sample-video-id"
    search_query = "what is the video about?"
    mock_adapter.query.return_value = "some video description"

    def verify(answer):
        # Same checks, in the same order, for both scenarios.
        assert "some video description" in answer
        mock_adapter.add.assert_called_once_with(
            youtube_video_url, data_type=DataType.YOUTUBE_VIDEO
        )
        mock_adapter.query.assert_called_once_with(search_query)

    # Scenario 1: the video URL is bound when the tool is created.
    preconfigured = YoutubeVideoSearchTool(
        youtube_video_url=youtube_video_url,
        adapter=mock_adapter,
    )
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls before the second scenario.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the video URL is passed in the run call.
    on_demand = YoutubeVideoSearchTool(adapter=mock_adapter)
    verify(
        on_demand._run(youtube_video_url=youtube_video_url, search_query=search_query)
    )
def test_youtube_channel_search_tool(mock_adapter):
    """Check ``YoutubeChannelSearchTool`` forwards the channel handle and query.

    Covers the handle supplied at construction time and at run time. Each
    scenario expects one ``adapter.add`` call with the handle and
    ``DataType.YOUTUBE_CHANNEL``, and one ``adapter.query`` call with the question.
    """
    youtube_channel_handle = "@crewai"
    search_query = "what is the channel about?"
    mock_adapter.query.return_value = "channel description"

    def verify(answer):
        # Same checks, in the same order, for both scenarios.
        assert "channel description" in answer
        mock_adapter.add.assert_called_once_with(
            youtube_channel_handle, data_type=DataType.YOUTUBE_CHANNEL
        )
        mock_adapter.query.assert_called_once_with(search_query)

    # Scenario 1: the handle is bound when the tool is created.
    preconfigured = YoutubeChannelSearchTool(
        youtube_channel_handle=youtube_channel_handle, adapter=mock_adapter
    )
    verify(preconfigured._run(search_query=search_query))

    # Clear recorded calls before the second scenario.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the handle is passed in the run call.
    on_demand = YoutubeChannelSearchTool(adapter=mock_adapter)
    verify(
        on_demand._run(
            youtube_channel_handle=youtube_channel_handle, search_query=search_query
        )
    )
def test_code_docs_search_tool(mock_adapter):
    """Check ``CodeDocsSearchTool`` forwards the docs URL and query to the adapter.

    Covers the URL supplied at construction time and at run time. Each
    scenario expects one ``adapter.add`` call with the URL and
    ``DataType.DOCS_SITE``, and one ``adapter.query`` call with the question.
    """
    docs_url = "https://crewai.com/any-docs-url"
    search_query = "test documentation"
    mock_adapter.query.return_value = "test documentation"

    # Scenario 1: the docs URL is bound when the tool is created.
    preconfigured = CodeDocsSearchTool(docs_url=docs_url, adapter=mock_adapter)
    answer = preconfigured._run(search_query=search_query)
    assert "test documentation" in answer
    mock_adapter.add.assert_called_once_with(docs_url, data_type=DataType.DOCS_SITE)
    mock_adapter.query.assert_called_once_with(search_query)

    # Clear recorded calls before the second scenario.
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    # Scenario 2: the docs URL is passed in the run call.
    on_demand = CodeDocsSearchTool(adapter=mock_adapter)
    answer = on_demand._run(docs_url=docs_url, search_query=search_query)
    assert "test documentation" in answer
    mock_adapter.add.assert_called_once_with(docs_url, data_type=DataType.DOCS_SITE)
    mock_adapter.query.assert_called_once_with(search_query)
def test_github_search_tool(mock_adapter):
    """Check how ``GithubSearchTool`` builds the source string it adds to the adapter.

    Four scenarios are exercised against the same mocked adapter:

    1. repo and content types given at construction time;
    2. repo and content types given in the ``_run`` call;
    3. repo given in the ``_run`` call with no content types (defaults apply);
    4. no repo at all, in which case nothing is added and only the query runs.

    Every scenario also checks that the adapter's query result is present in
    the returned text.
    """
    mock_adapter.query.return_value = "repo description"

    # ensure the provided repo and content types are used after initialization
    tool = GithubSearchTool(
        gh_token="test_token",
        github_repo="crewai/crewai",
        content_types=["code"],
        adapter=mock_adapter,
    )
    result = tool._run(search_query="tell me about crewai repo")
    assert "repo description" in result
    mock_adapter.add.assert_called_once_with(
        "repo:crewai/crewai type:code", data_type="github", loader=ANY
    )
    mock_adapter.query.assert_called_once_with("tell me about crewai repo")

    # ensure content types provided by run call is used
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    result = tool._run(
        github_repo="crewai/crewai",
        content_types=["code", "issue"],
        search_query="tell me about crewai repo",
    )
    assert "repo description" in result
    mock_adapter.add.assert_called_once_with(
        "repo:crewai/crewai type:code,issue", data_type="github", loader=ANY
    )
    mock_adapter.query.assert_called_once_with("tell me about crewai repo")

    # ensure default content types are used if not provided
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    result = tool._run(
        github_repo="crewai/crewai",
        search_query="tell me about crewai repo",
    )
    assert "repo description" in result
    mock_adapter.add.assert_called_once_with(
        "repo:crewai/crewai type:code,repo,pr,issue", data_type="github", loader=ANY
    )
    mock_adapter.query.assert_called_once_with("tell me about crewai repo")

    # ensure nothing is added if no repo is provided
    mock_adapter.query.reset_mock()
    mock_adapter.add.reset_mock()

    tool = GithubSearchTool(gh_token="test_token", adapter=mock_adapter)
    result = tool._run(search_query="tell me about crewai repo")
    # The returned text was previously computed but never checked here.
    assert "repo description" in result
    mock_adapter.add.assert_not_called()
    mock_adapter.query.assert_called_once_with("tell me about crewai repo")