mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-09 08:08:32 +00:00
Feat/docling-support (#1763)
* added tool for docling support * docling support installation * use file_paths instead of file_path * fix import * organized imports * run_type docs * needs to be list * fixed logic * logged but file_path is backwards compatible * use file_paths instead of file_path 2 * added test for multiple sources for file_paths * fix run-types * enabling local files to work and type cleanup * linted * fix test and types * fixed run types * fix types * renamed to CrewDoclingSource * linted * added docs * resolve conflicts --------- Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com> Co-authored-by: Brandon Hancock <brandon@brandonhancock.io>
This commit is contained in:
@@ -1,10 +1,12 @@
|
||||
"""Test Knowledge creation and querying functionality."""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
|
||||
from crewai.knowledge.source.csv_knowledge_source import CSVKnowledgeSource
|
||||
from crewai.knowledge.source.excel_knowledge_source import ExcelKnowledgeSource
|
||||
from crewai.knowledge.source.json_knowledge_source import JSONKnowledgeSource
|
||||
@@ -200,7 +202,7 @@ def test_single_short_file(mock_vector_db, tmpdir):
|
||||
f.write(content)
|
||||
|
||||
file_source = TextFileKnowledgeSource(
|
||||
file_path=file_path, metadata={"preference": "personal"}
|
||||
file_paths=[file_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [file_source]
|
||||
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
||||
@@ -242,7 +244,7 @@ def test_single_2k_character_file(mock_vector_db, tmpdir):
|
||||
f.write(content)
|
||||
|
||||
file_source = TextFileKnowledgeSource(
|
||||
file_path=file_path, metadata={"preference": "personal"}
|
||||
file_paths=[file_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [file_source]
|
||||
mock_vector_db.query.return_value = [{"context": content, "score": 0.9}]
|
||||
@@ -279,7 +281,7 @@ def test_multiple_short_files(mock_vector_db, tmpdir):
|
||||
file_paths.append((file_path, item["metadata"]))
|
||||
|
||||
file_sources = [
|
||||
TextFileKnowledgeSource(file_path=path, metadata=metadata)
|
||||
TextFileKnowledgeSource(file_paths=[path], metadata=metadata)
|
||||
for path, metadata in file_paths
|
||||
]
|
||||
mock_vector_db.sources = file_sources
|
||||
@@ -352,7 +354,7 @@ def test_multiple_2k_character_files(mock_vector_db, tmpdir):
|
||||
file_paths.append(file_path)
|
||||
|
||||
file_sources = [
|
||||
TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
|
||||
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
||||
for path in file_paths
|
||||
]
|
||||
mock_vector_db.sources = file_sources
|
||||
@@ -399,7 +401,7 @@ def test_hybrid_string_and_files(mock_vector_db, tmpdir):
|
||||
file_paths.append(file_path)
|
||||
|
||||
file_sources = [
|
||||
TextFileKnowledgeSource(file_path=path, metadata={"preference": "personal"})
|
||||
TextFileKnowledgeSource(file_paths=[path], metadata={"preference": "personal"})
|
||||
for path in file_paths
|
||||
]
|
||||
|
||||
@@ -424,7 +426,7 @@ def test_pdf_knowledge_source(mock_vector_db):
|
||||
|
||||
# Create a PDFKnowledgeSource
|
||||
pdf_source = PDFKnowledgeSource(
|
||||
file_path=pdf_path, metadata={"preference": "personal"}
|
||||
file_paths=[pdf_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [pdf_source]
|
||||
mock_vector_db.query.return_value = [
|
||||
@@ -461,7 +463,7 @@ def test_csv_knowledge_source(mock_vector_db, tmpdir):
|
||||
|
||||
# Create a CSVKnowledgeSource
|
||||
csv_source = CSVKnowledgeSource(
|
||||
file_path=csv_path, metadata={"preference": "personal"}
|
||||
file_paths=[csv_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [csv_source]
|
||||
mock_vector_db.query.return_value = [
|
||||
@@ -496,7 +498,7 @@ def test_json_knowledge_source(mock_vector_db, tmpdir):
|
||||
|
||||
# Create a JSONKnowledgeSource
|
||||
json_source = JSONKnowledgeSource(
|
||||
file_path=json_path, metadata={"preference": "personal"}
|
||||
file_paths=[json_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [json_source]
|
||||
mock_vector_db.query.return_value = [
|
||||
@@ -529,7 +531,7 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
|
||||
|
||||
# Create an ExcelKnowledgeSource
|
||||
excel_source = ExcelKnowledgeSource(
|
||||
file_path=excel_path, metadata={"preference": "personal"}
|
||||
file_paths=[excel_path], metadata={"preference": "personal"}
|
||||
)
|
||||
mock_vector_db.sources = [excel_source]
|
||||
mock_vector_db.query.return_value = [
|
||||
@@ -543,3 +545,42 @@ def test_excel_knowledge_source(mock_vector_db, tmpdir):
|
||||
# Assert that the correct information is retrieved
|
||||
assert any("30" in result["context"] for result in results)
|
||||
mock_vector_db.query.assert_called_once()
|
||||
|
||||
|
||||
def test_docling_source(mock_vector_db):
    """Register a docling source on the vector DB double and query it.

    `mock_vector_db` is supplied from outside this view (presumably a pytest
    fixture wrapping a mock -- confirm); its `query` return value is canned
    below, so the assertions only exercise the mocked call.
    """
    source = CrewDoclingSource(
        file_paths=[
            "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
        ],
    )
    mock_vector_db.sources = [source]

    # Canned hit that the mocked query call will hand back.
    canned_hit = {
        "context": "Reward hacking is a technique used to improve the performance of reinforcement learning agents.",
        "score": 0.9,
    }
    mock_vector_db.query.return_value = [canned_hit]

    # Perform a query
    results = mock_vector_db.query("What is reward hacking?")

    lowered_contexts = [hit["context"].lower() for hit in results]
    assert any("reward hacking" in text for text in lowered_contexts)
    mock_vector_db.query.assert_called_once()
|
||||
|
||||
|
||||
def test_multiple_docling_sources():
    """Build a single CrewDoclingSource from two URLs and check its fields.

    NOTE(review): no vector DB double is used here, so constructing the
    source presumably fetches both URLs -- confirm whether network access
    is expected when this test runs.
    """
    expected_paths: List[Union[Path, str]] = [
        "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
        "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    ]

    source = CrewDoclingSource(file_paths=expected_paths)

    # The source must keep the given paths and expose some content.
    assert source.file_paths == expected_paths
    assert source.content is not None
|
||||
|
||||
|
||||
def test_docling_source_with_local_file():
    """Build a CrewDoclingSource from a PDF located next to this test file."""
    # Resolve the PDF relative to this module's own directory.
    local_pdf = Path(__file__).parent / "crewai_quickstart.pdf"

    source = CrewDoclingSource(file_paths=[local_pdf])

    # The source must keep the given path and expose some content.
    assert source.file_paths == [local_pdf]
    assert source.content is not None
|
||||
|
||||
Reference in New Issue
Block a user