mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-01 23:32:39 +00:00
Compare commits
4 Commits
cleaning-u
...
devin/1739
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c79700874c | ||
|
|
639e5342de | ||
|
|
257780ff6a | ||
|
|
22d886be11 |
@@ -91,7 +91,7 @@ result = crew.kickoff(inputs={"question": "What city does John live in and how o
|
||||
```
|
||||
|
||||
|
||||
Here's another example with the `CrewDoclingSource`. The CrewDoclingSource is actually quite versatile and can handle multiple file formats including MD, PDF, DOCX, HTML, and more.
|
||||
Here's another example with the `CrewDoclingSource`. The CrewDoclingSource is actually quite versatile and can handle multiple file formats including TXT, PDF, DOCX, HTML, and more.
|
||||
|
||||
<Note>
|
||||
You need to install `docling` for the following example to work: `uv add docling`
|
||||
@@ -152,10 +152,10 @@ Here are examples of how to use different types of knowledge sources:
|
||||
|
||||
### Text File Knowledge Source
|
||||
```python
|
||||
from crewai.knowledge.source.text_file_knowledge_source import TextFileKnowledgeSource
|
||||
from crewai.knowledge.source.crew_docling_source import CrewDoclingSource
|
||||
|
||||
# Create a text file knowledge source
|
||||
text_source = TextFileKnowledgeSource(
|
||||
text_source = CrewDoclingSource(
|
||||
file_paths=["document.txt", "another.txt"]
|
||||
)
|
||||
|
||||
|
||||
@@ -268,7 +268,7 @@ analysis_task = Task(
|
||||
|
||||
Task guardrails provide a way to validate and transform task outputs before they
|
||||
are passed to the next task. This feature helps ensure data quality and provides
|
||||
feedback to agents when their output doesn't meet specific criteria.
|
||||
efeedback to agents when their output doesn't meet specific criteria.
|
||||
|
||||
### Using Task Guardrails
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import warnings
|
||||
from concurrent.futures import Future
|
||||
from copy import copy as shallow_copy
|
||||
from hashlib import md5
|
||||
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
|
||||
|
||||
from pydantic import (
|
||||
UUID4,
|
||||
@@ -37,7 +37,7 @@ from crewai.tasks.conditional_task import ConditionalTask
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.telemetry import Telemetry
|
||||
from crewai.tools.agent_tools.agent_tools import AgentTools
|
||||
from crewai.tools.base_tool import Tool
|
||||
from crewai.tools.base_tool import BaseTool, Tool
|
||||
from crewai.types.usage_metrics import UsageMetrics
|
||||
from crewai.utilities import I18N, FileHandler, Logger, RPMController
|
||||
from crewai.utilities.constants import TRAINING_DATA_FILE
|
||||
@@ -179,7 +179,7 @@ class Crew(BaseModel):
|
||||
default=None,
|
||||
description="Maximum number of requests per minute for the crew execution to be respected.",
|
||||
)
|
||||
prompt_file: str = Field(
|
||||
prompt_file: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Path to the prompt json file to be used for the crew.",
|
||||
)
|
||||
@@ -473,7 +473,16 @@ class Crew(BaseModel):
|
||||
"missing_keys_in_config", "Config should have 'agents' and 'tasks'.", {}
|
||||
)
|
||||
|
||||
self.process = self.config.get("process", self.process)
|
||||
# Get process from config with proper type handling
|
||||
process_value = self.config.get("process")
|
||||
if process_value is not None:
|
||||
if not isinstance(process_value, Process):
|
||||
try:
|
||||
process_value = Process(process_value)
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid process value: {process_value}")
|
||||
self.process = process_value
|
||||
|
||||
self.agents = [Agent(**agent) for agent in self.config["agents"]]
|
||||
self.tasks = [self._create_task(task) for task in self.config["tasks"]]
|
||||
|
||||
@@ -749,8 +758,12 @@ class Crew(BaseModel):
|
||||
)
|
||||
|
||||
# Determine which tools to use - task tools take precedence over agent tools
|
||||
tools_for_task = task.tools or agent_to_use.tools or []
|
||||
tools_for_task = self._prepare_tools(agent_to_use, task, tools_for_task)
|
||||
initial_tools: List[BaseTool] = []
|
||||
if task.tools:
|
||||
initial_tools = list(task.tools)
|
||||
elif agent_to_use.tools:
|
||||
initial_tools = list(agent_to_use.tools)
|
||||
tools_for_task = self._prepare_tools(agent_to_use, task, initial_tools)
|
||||
|
||||
self._log_task_start(task, agent_to_use.role)
|
||||
|
||||
@@ -766,10 +779,12 @@ class Crew(BaseModel):
|
||||
context = self._get_context(
|
||||
task, [last_sync_output] if last_sync_output else []
|
||||
)
|
||||
# Convert Sequence to List for execute_async
|
||||
tools_list = list(tools_for_task) if tools_for_task else None
|
||||
future = task.execute_async(
|
||||
agent=agent_to_use,
|
||||
context=context,
|
||||
tools=tools_for_task,
|
||||
tools=tools_list,
|
||||
)
|
||||
futures.append((task, future, task_index))
|
||||
else:
|
||||
@@ -778,10 +793,12 @@ class Crew(BaseModel):
|
||||
futures.clear()
|
||||
|
||||
context = self._get_context(task, task_outputs)
|
||||
# Convert Sequence to List for execute_sync
|
||||
tools_list = list(tools_for_task) if tools_for_task else None
|
||||
task_output = task.execute_sync(
|
||||
agent=agent_to_use,
|
||||
context=context,
|
||||
tools=tools_for_task,
|
||||
tools=tools_list,
|
||||
)
|
||||
task_outputs.append(task_output)
|
||||
self._process_task_result(task, task_output)
|
||||
@@ -819,27 +836,37 @@ class Crew(BaseModel):
|
||||
return None
|
||||
|
||||
def _prepare_tools(
|
||||
self, agent: BaseAgent, task: Task, tools: List[Tool]
|
||||
) -> List[Tool]:
|
||||
self, agent: BaseAgent, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Prepare tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: The agent to prepare tools for
|
||||
task: The task being executed
|
||||
tools: Initial set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with additional capabilities based on agent configuration
|
||||
"""
|
||||
# Add delegation tools if agent allows delegation
|
||||
if agent.allow_delegation:
|
||||
if getattr(agent, "allow_delegation", False):
|
||||
if self.process == Process.hierarchical:
|
||||
if self.manager_agent:
|
||||
tools = self._update_manager_tools(task, tools)
|
||||
tools = self._update_manager_tools(task, tools) # type: ignore[arg-type]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Manager agent is required for hierarchical process."
|
||||
)
|
||||
|
||||
elif agent and agent.allow_delegation:
|
||||
tools = self._add_delegation_tools(task, tools)
|
||||
elif agent:
|
||||
tools = self._add_delegation_tools(task, tools) # type: ignore[arg-type]
|
||||
|
||||
# Add code execution tools if agent allows code execution
|
||||
if agent.allow_code_execution:
|
||||
tools = self._add_code_execution_tools(agent, tools)
|
||||
if getattr(agent, "allow_code_execution", False):
|
||||
tools = self._add_code_execution_tools(agent, tools) # type: ignore[arg-type]
|
||||
|
||||
if agent and agent.multimodal:
|
||||
tools = self._add_multimodal_tools(agent, tools)
|
||||
if agent and getattr(agent, "multimodal", False):
|
||||
tools = self._add_multimodal_tools(agent, tools) # type: ignore[arg-type]
|
||||
|
||||
return tools
|
||||
|
||||
@@ -849,9 +876,17 @@ class Crew(BaseModel):
|
||||
return task.agent
|
||||
|
||||
def _merge_tools(
|
||||
self, existing_tools: List[Tool], new_tools: List[Tool]
|
||||
) -> List[Tool]:
|
||||
"""Merge new tools into existing tools list, avoiding duplicates by tool name."""
|
||||
self, existing_tools: Sequence[BaseTool], new_tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Merge new tools into existing tools list, avoiding duplicates by tool name.
|
||||
|
||||
Args:
|
||||
existing_tools: Current sequence of tools
|
||||
new_tools: New tools to merge in
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with duplicates removed
|
||||
"""
|
||||
if not new_tools:
|
||||
return existing_tools
|
||||
|
||||
@@ -867,24 +902,67 @@ class Crew(BaseModel):
|
||||
return tools
|
||||
|
||||
def _inject_delegation_tools(
|
||||
self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]
|
||||
):
|
||||
delegation_tools = task_agent.get_delegation_tools(agents)
|
||||
self, tools: Sequence[BaseTool], task_agent: BaseAgent, agents: List[BaseAgent]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add delegation tools for the agent.
|
||||
|
||||
Args:
|
||||
tools: Current set of tools
|
||||
task_agent: Agent that will use the tools
|
||||
agents: List of agents that can be delegated to
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with delegation capabilities
|
||||
"""
|
||||
delegation_tools = task_agent.get_delegation_tools(agents) # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, delegation_tools)
|
||||
|
||||
def _add_multimodal_tools(self, agent: BaseAgent, tools: List[Tool]):
|
||||
multimodal_tools = agent.get_multimodal_tools()
|
||||
def _add_multimodal_tools(
|
||||
self, agent: BaseAgent, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add multimodal tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: Agent that will use the tools
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with multimodal capabilities
|
||||
"""
|
||||
multimodal_tools = agent.get_multimodal_tools() # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, multimodal_tools)
|
||||
|
||||
def _add_code_execution_tools(self, agent: BaseAgent, tools: List[Tool]):
|
||||
code_tools = agent.get_code_execution_tools()
|
||||
def _add_code_execution_tools(
|
||||
self, agent: BaseAgent, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add code execution tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: Agent that will use the tools
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with code execution capabilities
|
||||
"""
|
||||
code_tools = agent.get_code_execution_tools() # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, code_tools)
|
||||
|
||||
def _add_delegation_tools(self, task: Task, tools: List[Tool]):
|
||||
def _add_delegation_tools(
|
||||
self, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add delegation tools for the task's agent.
|
||||
|
||||
Args:
|
||||
task: Task being executed
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with delegation capabilities
|
||||
"""
|
||||
agents_for_delegation = [agent for agent in self.agents if agent != task.agent]
|
||||
if len(self.agents) > 1 and len(agents_for_delegation) > 0 and task.agent:
|
||||
if not tools:
|
||||
tools = []
|
||||
tools = [] # type: ignore[assignment]
|
||||
tools = self._inject_delegation_tools(
|
||||
tools, task.agent, agents_for_delegation
|
||||
)
|
||||
@@ -896,7 +974,18 @@ class Crew(BaseModel):
|
||||
task_name=task.name, task=task.description, agent=role, status="started"
|
||||
)
|
||||
|
||||
def _update_manager_tools(self, task: Task, tools: List[Tool]):
|
||||
def _update_manager_tools(
|
||||
self, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Update tools for manager agent.
|
||||
|
||||
Args:
|
||||
task: Task being executed
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with manager capabilities
|
||||
"""
|
||||
if self.manager_agent:
|
||||
if task.agent:
|
||||
tools = self._inject_delegation_tools(tools, task.agent, [task.agent])
|
||||
@@ -1148,19 +1237,42 @@ class Crew(BaseModel):
|
||||
def test(
|
||||
self,
|
||||
n_iterations: int,
|
||||
llm: Optional[Union[str, LLM]] = None,
|
||||
openai_model_name: Optional[str] = None,
|
||||
inputs: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
|
||||
"""Test and evaluate the Crew with the given inputs for n iterations.
|
||||
|
||||
Args:
|
||||
n_iterations: Number of test iterations to run
|
||||
llm: LLM instance or model name to use for evaluation
|
||||
openai_model_name: (Deprecated) OpenAI model name to use for evaluation
|
||||
inputs: Optional dictionary of inputs for the crew
|
||||
|
||||
Raises:
|
||||
ValueError: If inputs is not a dictionary or if LLM configuration is invalid
|
||||
TypeError: If n_iterations is not a positive integer
|
||||
"""
|
||||
if n_iterations < 1:
|
||||
raise TypeError("n_iterations must be a positive integer")
|
||||
|
||||
if inputs is not None and not isinstance(inputs, dict):
|
||||
raise ValueError("inputs must be a dictionary")
|
||||
|
||||
# Validate LLM configuration
|
||||
if isinstance(llm, str) and not llm.strip():
|
||||
raise ValueError("LLM model name cannot be empty")
|
||||
|
||||
test_llm: Union[str, LLM, None] = llm if llm is not None else openai_model_name
|
||||
test_crew = self.copy()
|
||||
|
||||
self._test_execution_span = test_crew._telemetry.test_execution_span(
|
||||
test_crew,
|
||||
n_iterations,
|
||||
inputs,
|
||||
openai_model_name, # type: ignore[arg-type]
|
||||
) # type: ignore[arg-type]
|
||||
evaluator = CrewEvaluator(test_crew, openai_model_name) # type: ignore[arg-type]
|
||||
test_llm,
|
||||
)
|
||||
evaluator = CrewEvaluator(test_crew, test_llm)
|
||||
|
||||
for i in range(1, n_iterations + 1):
|
||||
evaluator.set_iteration(i)
|
||||
|
||||
@@ -1,138 +1,28 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterator, List, Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
from typing import Dict, List
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
|
||||
from crewai.knowledge.source.base_knowledge_source import BaseKnowledgeSource
|
||||
from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
|
||||
from crewai.utilities.logger import Logger
|
||||
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
|
||||
|
||||
|
||||
class ExcelKnowledgeSource(BaseKnowledgeSource):
|
||||
class ExcelKnowledgeSource(BaseFileKnowledgeSource):
|
||||
"""A knowledge source that stores and queries Excel file content using embeddings."""
|
||||
|
||||
# override content to be a dict of file paths to sheet names to csv content
|
||||
|
||||
_logger: Logger = Logger(verbose=True)
|
||||
|
||||
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
||||
default=None,
|
||||
description="[Deprecated] The path to the file. Use file_paths instead.",
|
||||
)
|
||||
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
|
||||
default_factory=list, description="The path to the file"
|
||||
)
|
||||
chunks: List[str] = Field(default_factory=list)
|
||||
content: Dict[Path, Dict[str, str]] = Field(default_factory=dict)
|
||||
safe_file_paths: List[Path] = Field(default_factory=list)
|
||||
|
||||
@field_validator("file_path", "file_paths", mode="before")
|
||||
def validate_file_path(cls, v, info):
|
||||
"""Validate that at least one of file_path or file_paths is provided."""
|
||||
# Single check if both are None, O(1) instead of nested conditions
|
||||
if (
|
||||
v is None
|
||||
and info.data.get(
|
||||
"file_path" if info.field_name == "file_paths" else "file_paths"
|
||||
)
|
||||
is None
|
||||
):
|
||||
raise ValueError("Either file_path or file_paths must be provided")
|
||||
return v
|
||||
|
||||
def _process_file_paths(self) -> List[Path]:
|
||||
"""Convert file_path to a list of Path objects."""
|
||||
|
||||
if hasattr(self, "file_path") and self.file_path is not None:
|
||||
self._logger.log(
|
||||
"warning",
|
||||
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
|
||||
color="yellow",
|
||||
)
|
||||
self.file_paths = self.file_path
|
||||
|
||||
if self.file_paths is None:
|
||||
raise ValueError("Your source must be provided with a file_paths: []")
|
||||
|
||||
# Convert single path to list
|
||||
path_list: List[Union[Path, str]] = (
|
||||
[self.file_paths]
|
||||
if isinstance(self.file_paths, (str, Path))
|
||||
else list(self.file_paths)
|
||||
if isinstance(self.file_paths, list)
|
||||
else []
|
||||
)
|
||||
|
||||
if not path_list:
|
||||
raise ValueError(
|
||||
"file_path/file_paths must be a Path, str, or a list of these types"
|
||||
)
|
||||
|
||||
return [self.convert_to_path(path) for path in path_list]
|
||||
|
||||
def validate_content(self):
|
||||
"""Validate the paths."""
|
||||
for path in self.safe_file_paths:
|
||||
if not path.exists():
|
||||
self._logger.log(
|
||||
"error",
|
||||
f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
|
||||
color="red",
|
||||
)
|
||||
raise FileNotFoundError(f"File not found: {path}")
|
||||
if not path.is_file():
|
||||
self._logger.log(
|
||||
"error",
|
||||
f"Path is not a file: {path}",
|
||||
color="red",
|
||||
)
|
||||
|
||||
def model_post_init(self, _) -> None:
|
||||
if self.file_path:
|
||||
self._logger.log(
|
||||
"warning",
|
||||
"The 'file_path' attribute is deprecated and will be removed in a future version. Please use 'file_paths' instead.",
|
||||
color="yellow",
|
||||
)
|
||||
self.file_paths = self.file_path
|
||||
self.safe_file_paths = self._process_file_paths()
|
||||
self.validate_content()
|
||||
self.content = self._load_content()
|
||||
|
||||
def _load_content(self) -> Dict[Path, Dict[str, str]]:
|
||||
"""Load and preprocess Excel file content from multiple sheets.
|
||||
|
||||
Each sheet's content is converted to CSV format and stored.
|
||||
|
||||
Returns:
|
||||
Dict[Path, Dict[str, str]]: A mapping of file paths to their respective sheet contents.
|
||||
|
||||
Raises:
|
||||
ImportError: If required dependencies are missing.
|
||||
FileNotFoundError: If the specified Excel file cannot be opened.
|
||||
"""
|
||||
def load_content(self) -> Dict[Path, str]:
|
||||
"""Load and preprocess Excel file content."""
|
||||
pd = self._import_dependencies()
|
||||
|
||||
content_dict = {}
|
||||
for file_path in self.safe_file_paths:
|
||||
file_path = self.convert_to_path(file_path)
|
||||
with pd.ExcelFile(file_path) as xl:
|
||||
sheet_dict = {
|
||||
str(sheet_name): str(
|
||||
pd.read_excel(xl, sheet_name).to_csv(index=False)
|
||||
)
|
||||
for sheet_name in xl.sheet_names
|
||||
}
|
||||
content_dict[file_path] = sheet_dict
|
||||
df = pd.read_excel(file_path)
|
||||
content = df.to_csv(index=False)
|
||||
content_dict[file_path] = content
|
||||
return content_dict
|
||||
|
||||
def convert_to_path(self, path: Union[Path, str]) -> Path:
|
||||
"""Convert a path to a Path object."""
|
||||
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
|
||||
|
||||
def _import_dependencies(self):
|
||||
"""Dynamically import dependencies."""
|
||||
try:
|
||||
import openpyxl # noqa
|
||||
import pandas as pd
|
||||
|
||||
return pd
|
||||
@@ -148,14 +38,10 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
|
||||
and save the embeddings.
|
||||
"""
|
||||
# Convert dictionary values to a single string if content is a dictionary
|
||||
# Updated to account for .xlsx workbooks with multiple tabs/sheets
|
||||
content_str = ""
|
||||
for value in self.content.values():
|
||||
if isinstance(value, dict):
|
||||
for sheet_value in value.values():
|
||||
content_str += str(sheet_value) + "\n"
|
||||
else:
|
||||
content_str += str(value) + "\n"
|
||||
if isinstance(self.content, dict):
|
||||
content_str = "\n".join(str(value) for value in self.content.values())
|
||||
else:
|
||||
content_str = str(self.content)
|
||||
|
||||
new_chunks = self._chunk_text(content_str)
|
||||
self.chunks.extend(new_chunks)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from collections import defaultdict
|
||||
from typing import Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from rich.box import HEAVY_EDGE
|
||||
@@ -6,6 +7,7 @@ from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.llm import LLM
|
||||
from crewai.task import Task
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.telemetry import Telemetry
|
||||
@@ -32,9 +34,27 @@ class CrewEvaluator:
|
||||
run_execution_times: defaultdict = defaultdict(list)
|
||||
iteration: int = 0
|
||||
|
||||
def __init__(self, crew, openai_model_name: str):
|
||||
def __init__(self, crew, llm: Union[str, LLM, None] = None):
|
||||
"""Initialize the CrewEvaluator.
|
||||
|
||||
Args:
|
||||
crew: The crew to evaluate
|
||||
llm: LLM instance or model name to use for evaluation
|
||||
|
||||
Raises:
|
||||
ValueError: If LLM model name is empty or invalid
|
||||
RuntimeError: If evaluator agent initialization fails
|
||||
"""
|
||||
self.crew = crew
|
||||
self.openai_model_name = openai_model_name
|
||||
|
||||
if isinstance(llm, str) and not llm.strip():
|
||||
raise ValueError("LLM model name cannot be empty")
|
||||
|
||||
try:
|
||||
self._llm = llm if isinstance(llm, LLM) else LLM(model=llm) if llm else None
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize LLM: {str(e)}")
|
||||
|
||||
self._telemetry = Telemetry()
|
||||
self._setup_for_evaluating()
|
||||
|
||||
@@ -51,7 +71,7 @@ class CrewEvaluator:
|
||||
),
|
||||
backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
|
||||
verbose=False,
|
||||
llm=self.openai_model_name,
|
||||
llm=self._llm,
|
||||
)
|
||||
|
||||
def _evaluation_task(
|
||||
@@ -181,7 +201,7 @@ class CrewEvaluator:
|
||||
self.crew,
|
||||
evaluation_result.pydantic.quality,
|
||||
current_task.execution_duration,
|
||||
self.openai_model_name,
|
||||
self._llm.model if self._llm else "default",
|
||||
)
|
||||
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
|
||||
self.run_execution_times[self.iteration].append(
|
||||
|
||||
@@ -15,6 +15,7 @@ from crewai.agents.cache import CacheHandler
|
||||
from crewai.crew import Crew
|
||||
from crewai.crews.crew_output import CrewOutput
|
||||
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
|
||||
from crewai.llm import LLM
|
||||
from crewai.memory.contextual.contextual_memory import ContextualMemory
|
||||
from crewai.process import Process
|
||||
from crewai.project import crew
|
||||
@@ -24,6 +25,9 @@ from crewai.tasks.output_format import OutputFormat
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.types.usage_metrics import UsageMetrics
|
||||
from crewai.utilities import Logger
|
||||
from crewai.utilities.evaluators.crew_evaluator_handler import (
|
||||
TaskEvaluationPydanticOutput,
|
||||
)
|
||||
from crewai.utilities.rpm_controller import RPMController
|
||||
from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
|
||||
|
||||
@@ -51,7 +55,6 @@ writer = Agent(
|
||||
|
||||
def test_crew_with_only_conditional_tasks_raises_error():
|
||||
"""Test that creating a crew with only conditional tasks raises an error."""
|
||||
|
||||
def condition_func(task_output: TaskOutput) -> bool:
|
||||
return True
|
||||
|
||||
@@ -83,7 +86,6 @@ def test_crew_with_only_conditional_tasks_raises_error():
|
||||
tasks=[conditional1, conditional2, conditional3],
|
||||
)
|
||||
|
||||
|
||||
def test_crew_config_conditional_requirement():
|
||||
with pytest.raises(ValueError):
|
||||
Crew(process=Process.sequential)
|
||||
@@ -591,12 +593,12 @@ def test_crew_with_delegating_agents_should_not_override_task_tools():
|
||||
_, kwargs = mock_execute_sync.call_args
|
||||
tools = kwargs["tools"]
|
||||
|
||||
assert any(
|
||||
isinstance(tool, TestTool) for tool in tools
|
||||
), "TestTool should be present"
|
||||
assert any(
|
||||
"delegate" in tool.name.lower() for tool in tools
|
||||
), "Delegation tool should be present"
|
||||
assert any(isinstance(tool, TestTool) for tool in tools), (
|
||||
"TestTool should be present"
|
||||
)
|
||||
assert any("delegate" in tool.name.lower() for tool in tools), (
|
||||
"Delegation tool should be present"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
@@ -655,12 +657,12 @@ def test_crew_with_delegating_agents_should_not_override_agent_tools():
|
||||
_, kwargs = mock_execute_sync.call_args
|
||||
tools = kwargs["tools"]
|
||||
|
||||
assert any(
|
||||
isinstance(tool, TestTool) for tool in new_ceo.tools
|
||||
), "TestTool should be present"
|
||||
assert any(
|
||||
"delegate" in tool.name.lower() for tool in tools
|
||||
), "Delegation tool should be present"
|
||||
assert any(isinstance(tool, TestTool) for tool in new_ceo.tools), (
|
||||
"TestTool should be present"
|
||||
)
|
||||
assert any("delegate" in tool.name.lower() for tool in tools), (
|
||||
"Delegation tool should be present"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
@@ -784,17 +786,17 @@ def test_task_tools_override_agent_tools_with_allow_delegation():
|
||||
used_tools = kwargs["tools"]
|
||||
|
||||
# Confirm AnotherTestTool is present but TestTool is not
|
||||
assert any(
|
||||
isinstance(tool, AnotherTestTool) for tool in used_tools
|
||||
), "AnotherTestTool should be present"
|
||||
assert not any(
|
||||
isinstance(tool, TestTool) for tool in used_tools
|
||||
), "TestTool should not be present among used tools"
|
||||
assert any(isinstance(tool, AnotherTestTool) for tool in used_tools), (
|
||||
"AnotherTestTool should be present"
|
||||
)
|
||||
assert not any(isinstance(tool, TestTool) for tool in used_tools), (
|
||||
"TestTool should not be present among used tools"
|
||||
)
|
||||
|
||||
# Confirm delegation tool(s) are present
|
||||
assert any(
|
||||
"delegate" in tool.name.lower() for tool in used_tools
|
||||
), "Delegation tool should be present"
|
||||
assert any("delegate" in tool.name.lower() for tool in used_tools), (
|
||||
"Delegation tool should be present"
|
||||
)
|
||||
|
||||
# Finally, make sure the agent's original tools remain unchanged
|
||||
assert len(researcher_with_delegation.tools) == 1
|
||||
@@ -1595,9 +1597,9 @@ def test_code_execution_flag_adds_code_tool_upon_kickoff():
|
||||
|
||||
# Verify that exactly one tool was used and it was a CodeInterpreterTool
|
||||
assert len(used_tools) == 1, "Should have exactly one tool"
|
||||
assert isinstance(
|
||||
used_tools[0], CodeInterpreterTool
|
||||
), "Tool should be CodeInterpreterTool"
|
||||
assert isinstance(used_tools[0], CodeInterpreterTool), (
|
||||
"Tool should be CodeInterpreterTool"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
@@ -1954,7 +1956,6 @@ def test_task_callback_on_crew():
|
||||
|
||||
def test_task_callback_both_on_task_and_crew():
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
mock_callback_on_task = MagicMock()
|
||||
mock_callback_on_crew = MagicMock()
|
||||
|
||||
@@ -2104,22 +2105,21 @@ def test_conditional_task_uses_last_output():
|
||||
expected_output="First output",
|
||||
agent=researcher,
|
||||
)
|
||||
|
||||
def condition_fails(task_output: TaskOutput) -> bool:
|
||||
# This condition will never be met
|
||||
return "never matches" in task_output.raw.lower()
|
||||
|
||||
|
||||
def condition_succeeds(task_output: TaskOutput) -> bool:
|
||||
# This condition will match first task's output
|
||||
return "first success" in task_output.raw.lower()
|
||||
|
||||
|
||||
conditional_task1 = ConditionalTask(
|
||||
description="Second task - conditional that fails condition",
|
||||
expected_output="Second output",
|
||||
agent=researcher,
|
||||
condition=condition_fails,
|
||||
)
|
||||
|
||||
|
||||
conditional_task2 = ConditionalTask(
|
||||
description="Third task - conditional that succeeds using first task output",
|
||||
expected_output="Third output",
|
||||
@@ -2138,37 +2138,35 @@ def test_conditional_task_uses_last_output():
|
||||
raw="First success output", # Will be used by third task's condition
|
||||
agent=researcher.role,
|
||||
)
|
||||
mock_skipped = TaskOutput(
|
||||
description="Second task output",
|
||||
raw="", # Empty output since condition fails
|
||||
agent=researcher.role,
|
||||
)
|
||||
mock_third = TaskOutput(
|
||||
description="Third task output",
|
||||
raw="Third task executed", # Output when condition succeeds using first task output
|
||||
agent=writer.role,
|
||||
)
|
||||
|
||||
|
||||
# Set up mocks for task execution and conditional logic
|
||||
with patch.object(ConditionalTask, "should_execute") as mock_should_execute:
|
||||
# First conditional fails, second succeeds
|
||||
mock_should_execute.side_effect = [False, True]
|
||||
|
||||
with patch.object(Task, "execute_sync") as mock_execute:
|
||||
mock_execute.side_effect = [mock_first, mock_third]
|
||||
result = crew.kickoff()
|
||||
|
||||
|
||||
# Verify execution behavior
|
||||
assert mock_execute.call_count == 2 # Only first and third tasks execute
|
||||
assert mock_should_execute.call_count == 2 # Both conditionals checked
|
||||
|
||||
# Verify outputs collection:
|
||||
# First executed task output, followed by an automatically generated (skipped) output, then the conditional execution
|
||||
|
||||
# Verify outputs collection
|
||||
assert len(result.tasks_output) == 3
|
||||
assert (
|
||||
result.tasks_output[0].raw == "First success output"
|
||||
) # First task succeeded
|
||||
assert (
|
||||
result.tasks_output[1].raw == ""
|
||||
) # Second task skipped (condition failed)
|
||||
assert (
|
||||
result.tasks_output[2].raw == "Third task executed"
|
||||
) # Third task used first task's output
|
||||
|
||||
assert result.tasks_output[0].raw == "First success output" # First task succeeded
|
||||
assert result.tasks_output[1].raw == "" # Second task skipped (condition failed)
|
||||
assert result.tasks_output[2].raw == "Third task executed" # Third task used first task's output
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_conditional_tasks_result_collection():
|
||||
@@ -2178,20 +2176,20 @@ def test_conditional_tasks_result_collection():
|
||||
expected_output="First output",
|
||||
agent=researcher,
|
||||
)
|
||||
|
||||
|
||||
def condition_never_met(task_output: TaskOutput) -> bool:
|
||||
return "never matches" in task_output.raw.lower()
|
||||
|
||||
|
||||
def condition_always_met(task_output: TaskOutput) -> bool:
|
||||
return "success" in task_output.raw.lower()
|
||||
|
||||
|
||||
task2 = ConditionalTask(
|
||||
description="Conditional task that never executes",
|
||||
expected_output="Second output",
|
||||
agent=researcher,
|
||||
condition=condition_never_met,
|
||||
)
|
||||
|
||||
|
||||
task3 = ConditionalTask(
|
||||
description="Conditional task that always executes",
|
||||
expected_output="Third output",
|
||||
@@ -2210,46 +2208,35 @@ def test_conditional_tasks_result_collection():
|
||||
raw="Success output", # Triggers third task's condition
|
||||
agent=researcher.role,
|
||||
)
|
||||
mock_skipped = TaskOutput(
|
||||
description="Skipped output",
|
||||
raw="", # Empty output for skipped task
|
||||
agent=researcher.role,
|
||||
)
|
||||
mock_conditional = TaskOutput(
|
||||
description="Conditional output",
|
||||
raw="Conditional task executed",
|
||||
agent=writer.role,
|
||||
)
|
||||
|
||||
|
||||
# Set up mocks for task execution and conditional logic
|
||||
with patch.object(ConditionalTask, "should_execute") as mock_should_execute:
|
||||
# First conditional fails, second succeeds
|
||||
mock_should_execute.side_effect = [False, True]
|
||||
|
||||
with patch.object(Task, "execute_sync") as mock_execute:
|
||||
mock_execute.side_effect = [mock_success, mock_conditional]
|
||||
result = crew.kickoff()
|
||||
|
||||
|
||||
# Verify execution behavior
|
||||
assert mock_execute.call_count == 2 # Only first and third tasks execute
|
||||
assert mock_should_execute.call_count == 2 # Both conditionals checked
|
||||
|
||||
# Verify task output collection:
|
||||
# There should be three outputs: normal task, skipped conditional task (empty output),
|
||||
# and the conditional task that executed.
|
||||
assert len(result.tasks_output) == 3
|
||||
assert (
|
||||
result.tasks_output[0].raw == "Success output"
|
||||
) # Normal task executed
|
||||
assert result.tasks_output[1].raw == "" # Second task skipped
|
||||
assert (
|
||||
result.tasks_output[2].raw == "Conditional task executed"
|
||||
) # Third task executed
|
||||
|
||||
|
||||
# Verify task output collection
|
||||
assert len(result.tasks_output) == 3
|
||||
assert (
|
||||
result.tasks_output[0].raw == "Success output"
|
||||
) # Normal task executed
|
||||
assert result.tasks_output[1].raw == "" # Second task skipped
|
||||
assert (
|
||||
result.tasks_output[2].raw == "Conditional task executed"
|
||||
) # Third task executed
|
||||
|
||||
assert result.tasks_output[0].raw == "Success output" # Normal task executed
|
||||
assert result.tasks_output[1].raw == "" # Second task skipped
|
||||
assert result.tasks_output[2].raw == "Conditional task executed" # Third task executed
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_multiple_conditional_tasks():
|
||||
@@ -2259,20 +2246,20 @@ def test_multiple_conditional_tasks():
|
||||
expected_output="Research output",
|
||||
agent=researcher,
|
||||
)
|
||||
|
||||
|
||||
def condition1(task_output: TaskOutput) -> bool:
|
||||
return "success" in task_output.raw.lower()
|
||||
|
||||
|
||||
def condition2(task_output: TaskOutput) -> bool:
|
||||
return "proceed" in task_output.raw.lower()
|
||||
|
||||
|
||||
task2 = ConditionalTask(
|
||||
description="First conditional task",
|
||||
expected_output="Conditional output 1",
|
||||
agent=writer,
|
||||
condition=condition1,
|
||||
)
|
||||
|
||||
|
||||
task3 = ConditionalTask(
|
||||
description="Second conditional task",
|
||||
expected_output="Conditional output 2",
|
||||
@@ -2291,7 +2278,7 @@ def test_multiple_conditional_tasks():
|
||||
raw="Success and proceed output",
|
||||
agent=researcher.role,
|
||||
)
|
||||
|
||||
|
||||
# Set up mocks for task execution
|
||||
with patch.object(Task, "execute_sync", return_value=mock_success) as mock_execute:
|
||||
result = crew.kickoff()
|
||||
@@ -2299,7 +2286,6 @@ def test_multiple_conditional_tasks():
|
||||
assert mock_execute.call_count == 3
|
||||
assert len(result.tasks_output) == 3
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_using_contextual_memory():
|
||||
from unittest.mock import patch
|
||||
@@ -3323,39 +3309,95 @@ def test_conditional_should_execute():
|
||||
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch("crewai.crew.Crew.copy")
|
||||
@mock.patch("crewai.crew.Crew.kickoff")
|
||||
def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_with_custom_llm(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() works with a custom LLM implementation."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
llm = LLM(model="gpt-4")
|
||||
crew.test(n_iterations=1, llm=llm)
|
||||
|
||||
# Verify CrewEvaluator was called with the LLM instance
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] == llm
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_backward_compatibility(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() maintains backward compatibility with openai_model_name."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
crew.test(n_iterations=1, openai_model_name="gpt-4")
|
||||
|
||||
# Verify CrewEvaluator was called with the model name
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] == "gpt-4"
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_with_invalid_inputs(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() validates inputs properly."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
crew.test(n_iterations=0) # Invalid iterations
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
crew.test(n_iterations=1, inputs="invalid") # Invalid inputs type
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
crew.test(n_iterations=1, llm="") # Empty LLM name
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_concurrent_execution(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() handles concurrent execution properly."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
n_iterations = 3
|
||||
|
||||
crew.test(n_iterations=n_iterations)
|
||||
assert mock_evaluator.return_value.set_iteration.call_count == n_iterations
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_testing_function(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() works with basic functionality."""
|
||||
task = Task(
|
||||
description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
|
||||
expected_output="5 bullet points with a paragraph for each idea.",
|
||||
description="Test task",
|
||||
expected_output="Test output",
|
||||
agent=researcher,
|
||||
)
|
||||
|
||||
crew = Crew(
|
||||
agents=[researcher],
|
||||
tasks=[task],
|
||||
)
|
||||
|
||||
# Create a mock for the copied crew
|
||||
copy_mock.return_value = crew
|
||||
|
||||
n_iterations = 2
|
||||
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
|
||||
|
||||
# Ensure kickoff is called on the copied crew
|
||||
kickoff_mock.assert_has_calls(
|
||||
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
|
||||
)
|
||||
|
||||
crew_evaluator.assert_has_calls(
|
||||
[
|
||||
mock.call(crew, "gpt-4o-mini"),
|
||||
mock.call().set_iteration(1),
|
||||
mock.call().set_iteration(2),
|
||||
mock.call().print_crew_evaluation_result(),
|
||||
]
|
||||
)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
crew.test(n_iterations=1)
|
||||
|
||||
# Verify CrewEvaluator was called with None as llm (default behavior)
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] is None
|
||||
|
||||
# Verify kickoff was called
|
||||
mock_kickoff.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
@@ -3418,9 +3460,9 @@ def test_fetch_inputs():
|
||||
expected_placeholders = {"role_detail", "topic", "field"}
|
||||
actual_placeholders = crew.fetch_inputs()
|
||||
|
||||
assert (
|
||||
actual_placeholders == expected_placeholders
|
||||
), f"Expected {expected_placeholders}, but got {actual_placeholders}"
|
||||
assert actual_placeholders == expected_placeholders, (
|
||||
f"Expected {expected_placeholders}, but got {actual_placeholders}"
|
||||
)
|
||||
|
||||
|
||||
def test_task_tools_preserve_code_execution_tools():
|
||||
@@ -3493,20 +3535,20 @@ def test_task_tools_preserve_code_execution_tools():
|
||||
used_tools = kwargs["tools"]
|
||||
|
||||
# Verify all expected tools are present
|
||||
assert any(
|
||||
isinstance(tool, TestTool) for tool in used_tools
|
||||
), "Task's TestTool should be present"
|
||||
assert any(
|
||||
isinstance(tool, CodeInterpreterTool) for tool in used_tools
|
||||
), "CodeInterpreterTool should be present"
|
||||
assert any(
|
||||
"delegate" in tool.name.lower() for tool in used_tools
|
||||
), "Delegation tool should be present"
|
||||
assert any(isinstance(tool, TestTool) for tool in used_tools), (
|
||||
"Task's TestTool should be present"
|
||||
)
|
||||
assert any(isinstance(tool, CodeInterpreterTool) for tool in used_tools), (
|
||||
"CodeInterpreterTool should be present"
|
||||
)
|
||||
assert any("delegate" in tool.name.lower() for tool in used_tools), (
|
||||
"Delegation tool should be present"
|
||||
)
|
||||
|
||||
# Verify the total number of tools (TestTool + CodeInterpreter + 2 delegation tools)
|
||||
assert (
|
||||
len(used_tools) == 4
|
||||
), "Should have TestTool, CodeInterpreter, and 2 delegation tools"
|
||||
assert len(used_tools) == 4, (
|
||||
"Should have TestTool, CodeInterpreter, and 2 delegation tools"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
@@ -3550,9 +3592,9 @@ def test_multimodal_flag_adds_multimodal_tools():
|
||||
used_tools = kwargs["tools"]
|
||||
|
||||
# Check that the multimodal tool was added
|
||||
assert any(
|
||||
isinstance(tool, AddImageTool) for tool in used_tools
|
||||
), "AddImageTool should be present when agent is multimodal"
|
||||
assert any(isinstance(tool, AddImageTool) for tool in used_tools), (
|
||||
"AddImageTool should be present when agent is multimodal"
|
||||
)
|
||||
|
||||
# Verify we have exactly one tool (just the AddImageTool)
|
||||
assert len(used_tools) == 1, "Should only have the AddImageTool"
|
||||
@@ -3778,9 +3820,9 @@ def test_crew_guardrail_feedback_in_context():
|
||||
assert len(execution_contexts) > 1, "Task should have been executed multiple times"
|
||||
|
||||
# Verify that the second execution included the guardrail feedback
|
||||
assert (
|
||||
"Output must contain the keyword 'IMPORTANT'" in execution_contexts[1]
|
||||
), "Guardrail feedback should be included in retry context"
|
||||
assert "Output must contain the keyword 'IMPORTANT'" in execution_contexts[1], (
|
||||
"Guardrail feedback should be included in retry context"
|
||||
)
|
||||
|
||||
# Verify final output meets guardrail requirements
|
||||
assert "IMPORTANT" in result.raw, "Final output should contain required keyword"
|
||||
|
||||
@@ -4,6 +4,7 @@ import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.crew import Crew
|
||||
from crewai.llm import LLM
|
||||
from crewai.task import Task
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.utilities.evaluators.crew_evaluator_handler import (
|
||||
@@ -140,3 +141,23 @@ class InternalCrewEvaluator:
|
||||
execute().pydantic = TaskEvaluationPydanticOutput(quality=9.5)
|
||||
crew_planner.evaluate(task_output)
|
||||
assert crew_planner.tasks_scores[0] == [9.5]
|
||||
|
||||
def test_crew_evaluator_with_llm_instance(self):
|
||||
"""Test that CrewEvaluator works with an LLM instance."""
|
||||
agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
|
||||
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
llm = LLM(model="gpt-4")
|
||||
evaluator = CrewEvaluator(crew, llm)
|
||||
assert evaluator._llm == llm
|
||||
|
||||
def test_crew_evaluator_with_model_name(self):
|
||||
"""Test that CrewEvaluator works with a model name string."""
|
||||
agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
|
||||
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
evaluator = CrewEvaluator(crew, "gpt-4")
|
||||
assert isinstance(evaluator._llm, LLM)
|
||||
assert evaluator._llm.model == "gpt-4"
|
||||
|
||||
Reference in New Issue
Block a user