mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-30 23:02:50 +00:00
Compare commits
4 Commits
devin/1739
...
devin/1739
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c79700874c | ||
|
|
639e5342de | ||
|
|
257780ff6a | ||
|
|
22d886be11 |
@@ -6,7 +6,7 @@ import warnings
|
||||
from concurrent.futures import Future
|
||||
from copy import copy as shallow_copy
|
||||
from hashlib import md5
|
||||
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
|
||||
|
||||
from pydantic import (
|
||||
UUID4,
|
||||
@@ -37,7 +37,7 @@ from crewai.tasks.conditional_task import ConditionalTask
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.telemetry import Telemetry
|
||||
from crewai.tools.agent_tools.agent_tools import AgentTools
|
||||
from crewai.tools.base_tool import Tool
|
||||
from crewai.tools.base_tool import BaseTool, Tool
|
||||
from crewai.types.usage_metrics import UsageMetrics
|
||||
from crewai.utilities import I18N, FileHandler, Logger, RPMController
|
||||
from crewai.utilities.constants import TRAINING_DATA_FILE
|
||||
@@ -179,7 +179,7 @@ class Crew(BaseModel):
|
||||
default=None,
|
||||
description="Maximum number of requests per minute for the crew execution to be respected.",
|
||||
)
|
||||
prompt_file: str = Field(
|
||||
prompt_file: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Path to the prompt json file to be used for the crew.",
|
||||
)
|
||||
@@ -473,7 +473,16 @@ class Crew(BaseModel):
|
||||
"missing_keys_in_config", "Config should have 'agents' and 'tasks'.", {}
|
||||
)
|
||||
|
||||
self.process = self.config.get("process", self.process)
|
||||
# Get process from config with proper type handling
|
||||
process_value = self.config.get("process")
|
||||
if process_value is not None:
|
||||
if not isinstance(process_value, Process):
|
||||
try:
|
||||
process_value = Process(process_value)
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid process value: {process_value}")
|
||||
self.process = process_value
|
||||
|
||||
self.agents = [Agent(**agent) for agent in self.config["agents"]]
|
||||
self.tasks = [self._create_task(task) for task in self.config["tasks"]]
|
||||
|
||||
@@ -749,8 +758,12 @@ class Crew(BaseModel):
|
||||
)
|
||||
|
||||
# Determine which tools to use - task tools take precedence over agent tools
|
||||
tools_for_task = task.tools or agent_to_use.tools or []
|
||||
tools_for_task = self._prepare_tools(agent_to_use, task, tools_for_task)
|
||||
initial_tools: List[BaseTool] = []
|
||||
if task.tools:
|
||||
initial_tools = list(task.tools)
|
||||
elif agent_to_use.tools:
|
||||
initial_tools = list(agent_to_use.tools)
|
||||
tools_for_task = self._prepare_tools(agent_to_use, task, initial_tools)
|
||||
|
||||
self._log_task_start(task, agent_to_use.role)
|
||||
|
||||
@@ -766,10 +779,12 @@ class Crew(BaseModel):
|
||||
context = self._get_context(
|
||||
task, [last_sync_output] if last_sync_output else []
|
||||
)
|
||||
# Convert Sequence to List for execute_async
|
||||
tools_list = list(tools_for_task) if tools_for_task else None
|
||||
future = task.execute_async(
|
||||
agent=agent_to_use,
|
||||
context=context,
|
||||
tools=tools_for_task,
|
||||
tools=tools_list,
|
||||
)
|
||||
futures.append((task, future, task_index))
|
||||
else:
|
||||
@@ -778,10 +793,12 @@ class Crew(BaseModel):
|
||||
futures.clear()
|
||||
|
||||
context = self._get_context(task, task_outputs)
|
||||
# Convert Sequence to List for execute_sync
|
||||
tools_list = list(tools_for_task) if tools_for_task else None
|
||||
task_output = task.execute_sync(
|
||||
agent=agent_to_use,
|
||||
context=context,
|
||||
tools=tools_for_task,
|
||||
tools=tools_list,
|
||||
)
|
||||
task_outputs.append(task_output)
|
||||
self._process_task_result(task, task_output)
|
||||
@@ -819,27 +836,37 @@ class Crew(BaseModel):
|
||||
return None
|
||||
|
||||
def _prepare_tools(
|
||||
self, agent: BaseAgent, task: Task, tools: List[Tool]
|
||||
) -> List[Tool]:
|
||||
self, agent: BaseAgent, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Prepare tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: The agent to prepare tools for
|
||||
task: The task being executed
|
||||
tools: Initial set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with additional capabilities based on agent configuration
|
||||
"""
|
||||
# Add delegation tools if agent allows delegation
|
||||
if agent.allow_delegation:
|
||||
if getattr(agent, "allow_delegation", False):
|
||||
if self.process == Process.hierarchical:
|
||||
if self.manager_agent:
|
||||
tools = self._update_manager_tools(task, tools)
|
||||
tools = self._update_manager_tools(task, tools) # type: ignore[arg-type]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Manager agent is required for hierarchical process."
|
||||
)
|
||||
|
||||
elif agent and agent.allow_delegation:
|
||||
tools = self._add_delegation_tools(task, tools)
|
||||
elif agent:
|
||||
tools = self._add_delegation_tools(task, tools) # type: ignore[arg-type]
|
||||
|
||||
# Add code execution tools if agent allows code execution
|
||||
if agent.allow_code_execution:
|
||||
tools = self._add_code_execution_tools(agent, tools)
|
||||
if getattr(agent, "allow_code_execution", False):
|
||||
tools = self._add_code_execution_tools(agent, tools) # type: ignore[arg-type]
|
||||
|
||||
if agent and agent.multimodal:
|
||||
tools = self._add_multimodal_tools(agent, tools)
|
||||
if agent and getattr(agent, "multimodal", False):
|
||||
tools = self._add_multimodal_tools(agent, tools) # type: ignore[arg-type]
|
||||
|
||||
return tools
|
||||
|
||||
@@ -849,9 +876,17 @@ class Crew(BaseModel):
|
||||
return task.agent
|
||||
|
||||
def _merge_tools(
|
||||
self, existing_tools: List[Tool], new_tools: List[Tool]
|
||||
) -> List[Tool]:
|
||||
"""Merge new tools into existing tools list, avoiding duplicates by tool name."""
|
||||
self, existing_tools: Sequence[BaseTool], new_tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Merge new tools into existing tools list, avoiding duplicates by tool name.
|
||||
|
||||
Args:
|
||||
existing_tools: Current sequence of tools
|
||||
new_tools: New tools to merge in
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with duplicates removed
|
||||
"""
|
||||
if not new_tools:
|
||||
return existing_tools
|
||||
|
||||
@@ -867,24 +902,67 @@ class Crew(BaseModel):
|
||||
return tools
|
||||
|
||||
def _inject_delegation_tools(
|
||||
self, tools: List[Tool], task_agent: BaseAgent, agents: List[BaseAgent]
|
||||
):
|
||||
delegation_tools = task_agent.get_delegation_tools(agents)
|
||||
self, tools: Sequence[BaseTool], task_agent: BaseAgent, agents: List[BaseAgent]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add delegation tools for the agent.
|
||||
|
||||
Args:
|
||||
tools: Current set of tools
|
||||
task_agent: Agent that will use the tools
|
||||
agents: List of agents that can be delegated to
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with delegation capabilities
|
||||
"""
|
||||
delegation_tools = task_agent.get_delegation_tools(agents) # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, delegation_tools)
|
||||
|
||||
def _add_multimodal_tools(self, agent: BaseAgent, tools: List[Tool]):
|
||||
multimodal_tools = agent.get_multimodal_tools()
|
||||
def _add_multimodal_tools(
|
||||
self, agent: BaseAgent, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add multimodal tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: Agent that will use the tools
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with multimodal capabilities
|
||||
"""
|
||||
multimodal_tools = agent.get_multimodal_tools() # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, multimodal_tools)
|
||||
|
||||
def _add_code_execution_tools(self, agent: BaseAgent, tools: List[Tool]):
|
||||
code_tools = agent.get_code_execution_tools()
|
||||
def _add_code_execution_tools(
|
||||
self, agent: BaseAgent, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add code execution tools for the agent.
|
||||
|
||||
Args:
|
||||
agent: Agent that will use the tools
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with code execution capabilities
|
||||
"""
|
||||
code_tools = agent.get_code_execution_tools() # type: ignore[attr-defined]
|
||||
return self._merge_tools(tools, code_tools)
|
||||
|
||||
def _add_delegation_tools(self, task: Task, tools: List[Tool]):
|
||||
def _add_delegation_tools(
|
||||
self, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Add delegation tools for the task's agent.
|
||||
|
||||
Args:
|
||||
task: Task being executed
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with delegation capabilities
|
||||
"""
|
||||
agents_for_delegation = [agent for agent in self.agents if agent != task.agent]
|
||||
if len(self.agents) > 1 and len(agents_for_delegation) > 0 and task.agent:
|
||||
if not tools:
|
||||
tools = []
|
||||
tools = [] # type: ignore[assignment]
|
||||
tools = self._inject_delegation_tools(
|
||||
tools, task.agent, agents_for_delegation
|
||||
)
|
||||
@@ -896,7 +974,18 @@ class Crew(BaseModel):
|
||||
task_name=task.name, task=task.description, agent=role, status="started"
|
||||
)
|
||||
|
||||
def _update_manager_tools(self, task: Task, tools: List[Tool]):
|
||||
def _update_manager_tools(
|
||||
self, task: Task, tools: Sequence[BaseTool]
|
||||
) -> Sequence[BaseTool]:
|
||||
"""Update tools for manager agent.
|
||||
|
||||
Args:
|
||||
task: Task being executed
|
||||
tools: Current set of tools
|
||||
|
||||
Returns:
|
||||
Updated sequence of tools with manager capabilities
|
||||
"""
|
||||
if self.manager_agent:
|
||||
if task.agent:
|
||||
tools = self._inject_delegation_tools(tools, task.agent, [task.agent])
|
||||
@@ -1147,32 +1236,43 @@ class Crew(BaseModel):
|
||||
|
||||
def test(
|
||||
self,
|
||||
n_iterations: int = 1,
|
||||
openai_model_name: Optional[str] = None,
|
||||
n_iterations: int,
|
||||
llm: Optional[Union[str, LLM]] = None,
|
||||
openai_model_name: Optional[str] = None,
|
||||
inputs: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
"""Test and evaluate the Crew with the given inputs for n iterations.
|
||||
|
||||
Args:
|
||||
n_iterations: Number of iterations to run the test
|
||||
openai_model_name: OpenAI model name to use for evaluation (deprecated)
|
||||
llm: LLM instance or model name to use for evaluation
|
||||
inputs: Optional dictionary of inputs to pass to the crew
|
||||
"""
|
||||
if not llm and not openai_model_name:
|
||||
raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
|
||||
|
||||
model_to_use = self._get_llm_instance(llm, openai_model_name)
|
||||
Args:
|
||||
n_iterations: Number of test iterations to run
|
||||
llm: LLM instance or model name to use for evaluation
|
||||
openai_model_name: (Deprecated) OpenAI model name to use for evaluation
|
||||
inputs: Optional dictionary of inputs for the crew
|
||||
|
||||
Raises:
|
||||
ValueError: If inputs is not a dictionary or if LLM configuration is invalid
|
||||
TypeError: If n_iterations is not a positive integer
|
||||
"""
|
||||
if n_iterations < 1:
|
||||
raise TypeError("n_iterations must be a positive integer")
|
||||
|
||||
if inputs is not None and not isinstance(inputs, dict):
|
||||
raise ValueError("inputs must be a dictionary")
|
||||
|
||||
# Validate LLM configuration
|
||||
if isinstance(llm, str) and not llm.strip():
|
||||
raise ValueError("LLM model name cannot be empty")
|
||||
|
||||
test_llm: Union[str, LLM, None] = llm if llm is not None else openai_model_name
|
||||
test_crew = self.copy()
|
||||
|
||||
self._test_execution_span = test_crew._telemetry.test_execution_span(
|
||||
test_crew,
|
||||
n_iterations,
|
||||
inputs,
|
||||
str(model_to_use.model),
|
||||
test_llm,
|
||||
)
|
||||
evaluator = CrewEvaluator(test_crew, model_to_use)
|
||||
evaluator = CrewEvaluator(test_crew, test_llm)
|
||||
|
||||
for i in range(1, n_iterations + 1):
|
||||
evaluator.set_iteration(i)
|
||||
@@ -1180,28 +1280,6 @@ class Crew(BaseModel):
|
||||
|
||||
evaluator.print_crew_evaluation_result()
|
||||
|
||||
def _get_llm_instance(self, llm: Optional[Union[str, LLM]], openai_model_name: Optional[str]) -> LLM:
|
||||
"""Get an LLM instance from either llm or openai_model_name parameter.
|
||||
|
||||
Args:
|
||||
llm: LLM instance or model name
|
||||
openai_model_name: OpenAI model name (deprecated)
|
||||
|
||||
Returns:
|
||||
LLM instance
|
||||
|
||||
Raises:
|
||||
ValueError: If neither llm nor openai_model_name is provided
|
||||
"""
|
||||
model = llm if llm is not None else openai_model_name
|
||||
if model is None:
|
||||
raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
|
||||
if isinstance(model, str):
|
||||
return LLM(model=model)
|
||||
if not isinstance(model, LLM):
|
||||
raise ValueError("Model must be either a string or an LLM instance")
|
||||
return model
|
||||
|
||||
def __repr__(self):
|
||||
return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"
|
||||
|
||||
|
||||
@@ -34,9 +34,27 @@ class CrewEvaluator:
|
||||
run_execution_times: defaultdict = defaultdict(list)
|
||||
iteration: int = 0
|
||||
|
||||
def __init__(self, crew, llm: Union[str, LLM]):
|
||||
def __init__(self, crew, llm: Union[str, LLM, None] = None):
|
||||
"""Initialize the CrewEvaluator.
|
||||
|
||||
Args:
|
||||
crew: The crew to evaluate
|
||||
llm: LLM instance or model name to use for evaluation
|
||||
|
||||
Raises:
|
||||
ValueError: If LLM model name is empty or invalid
|
||||
RuntimeError: If evaluator agent initialization fails
|
||||
"""
|
||||
self.crew = crew
|
||||
self.llm = LLM(model=llm) if isinstance(llm, str) else llm
|
||||
|
||||
if isinstance(llm, str) and not llm.strip():
|
||||
raise ValueError("LLM model name cannot be empty")
|
||||
|
||||
try:
|
||||
self._llm = llm if isinstance(llm, LLM) else LLM(model=llm) if llm else None
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to initialize LLM: {str(e)}")
|
||||
|
||||
self._telemetry = Telemetry()
|
||||
self._setup_for_evaluating()
|
||||
|
||||
@@ -53,7 +71,7 @@ class CrewEvaluator:
|
||||
),
|
||||
backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
|
||||
verbose=False,
|
||||
llm=self.llm,
|
||||
llm=self._llm,
|
||||
)
|
||||
|
||||
def _evaluation_task(
|
||||
@@ -183,7 +201,7 @@ class CrewEvaluator:
|
||||
self.crew,
|
||||
evaluation_result.pydantic.quality,
|
||||
current_task.execution_duration,
|
||||
self.llm.model,
|
||||
self._llm.model if self._llm else "default",
|
||||
)
|
||||
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
|
||||
self.run_execution_times[self.iteration].append(
|
||||
|
||||
@@ -15,6 +15,7 @@ from crewai.agents.cache import CacheHandler
|
||||
from crewai.crew import Crew
|
||||
from crewai.crews.crew_output import CrewOutput
|
||||
from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
|
||||
from crewai.llm import LLM
|
||||
from crewai.memory.contextual.contextual_memory import ContextualMemory
|
||||
from crewai.process import Process
|
||||
from crewai.project import crew
|
||||
@@ -24,6 +25,9 @@ from crewai.tasks.output_format import OutputFormat
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.types.usage_metrics import UsageMetrics
|
||||
from crewai.utilities import Logger
|
||||
from crewai.utilities.evaluators.crew_evaluator_handler import (
|
||||
TaskEvaluationPydanticOutput,
|
||||
)
|
||||
from crewai.utilities.rpm_controller import RPMController
|
||||
from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
|
||||
|
||||
@@ -3305,41 +3309,95 @@ def test_conditional_should_execute():
|
||||
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch("crewai.crew.Crew.copy")
|
||||
def test_crew_testing_function(copy_mock, crew_evaluator_mock):
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_with_custom_llm(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() works with a custom LLM implementation."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
llm = LLM(model="gpt-4")
|
||||
crew.test(n_iterations=1, llm=llm)
|
||||
|
||||
# Verify CrewEvaluator was called with the LLM instance
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] == llm
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_backward_compatibility(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() maintains backward compatibility with openai_model_name."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
crew.test(n_iterations=1, openai_model_name="gpt-4")
|
||||
|
||||
# Verify CrewEvaluator was called with the model name
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] == "gpt-4"
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_with_invalid_inputs(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() validates inputs properly."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
crew.test(n_iterations=0) # Invalid iterations
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
crew.test(n_iterations=1, inputs="invalid") # Invalid inputs type
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
crew.test(n_iterations=1, llm="") # Empty LLM name
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_test_concurrent_execution(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() handles concurrent execution properly."""
|
||||
task = Task(description="Test task", expected_output="Test output", agent=researcher)
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
n_iterations = 3
|
||||
|
||||
crew.test(n_iterations=n_iterations)
|
||||
assert mock_evaluator.return_value.set_iteration.call_count == n_iterations
|
||||
|
||||
@mock.patch("crewai.crew.CrewEvaluator")
|
||||
@mock.patch.object(Crew, "copy")
|
||||
@mock.patch.object(Crew, "kickoff")
|
||||
def test_crew_testing_function(mock_kickoff, mock_copy, mock_evaluator):
|
||||
"""Test that Crew.test() works with basic functionality."""
|
||||
task = Task(
|
||||
description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
|
||||
expected_output="5 bullet points with a paragraph for each idea.",
|
||||
description="Test task",
|
||||
expected_output="Test output",
|
||||
agent=researcher,
|
||||
)
|
||||
|
||||
crew = Crew(
|
||||
agents=[researcher],
|
||||
tasks=[task],
|
||||
)
|
||||
|
||||
# Create a mock for the copied crew with a mock kickoff method
|
||||
copied_crew = MagicMock()
|
||||
copy_mock.return_value = copied_crew
|
||||
|
||||
# Create a mock for the CrewEvaluator instance
|
||||
evaluator_instance = MagicMock()
|
||||
crew_evaluator_mock.return_value = evaluator_instance
|
||||
|
||||
n_iterations = 2
|
||||
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
|
||||
|
||||
# Ensure kickoff is called on the copied crew
|
||||
copied_crew.kickoff.assert_has_calls(
|
||||
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
|
||||
)
|
||||
|
||||
# Verify CrewEvaluator interactions
|
||||
# We don't check the exact LLM object since it's created internally
|
||||
assert len(crew_evaluator_mock.mock_calls) == 4
|
||||
assert crew_evaluator_mock.mock_calls[1] == mock.call().set_iteration(1)
|
||||
assert crew_evaluator_mock.mock_calls[2] == mock.call().set_iteration(2)
|
||||
assert crew_evaluator_mock.mock_calls[3] == mock.call().print_crew_evaluation_result()
|
||||
crew = Crew(agents=[researcher], tasks=[task])
|
||||
mock_copy.return_value = crew
|
||||
mock_evaluator.return_value = mock.MagicMock()
|
||||
|
||||
crew.test(n_iterations=1)
|
||||
|
||||
# Verify CrewEvaluator was called with None as llm (default behavior)
|
||||
mock_evaluator.assert_called_once()
|
||||
args = mock_evaluator.call_args[0]
|
||||
assert args[1] is None
|
||||
|
||||
# Verify kickoff was called
|
||||
mock_kickoff.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
|
||||
@@ -1,942 +0,0 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||
goal is: test\nTo give my best complete final answer to the task respond using
|
||||
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||
Answer: Your final answer must be the great and the most complete as possible,
|
||||
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||
to you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '780'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzAMqMWFX8hC1szIKxWNSyXm0SPFi\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141224,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"I am prepared to conduct the test efficiently.\\n\\nFinal
|
||||
Answer: The test output that aligns with the given criteria is a detailed description
|
||||
of the testing process, providing a thorough understanding for anyone reviewing
|
||||
it. The output not only contains the raw data or results but also includes step-by-step
|
||||
documentation of the process employed, thoughts and reasoning behind each step,
|
||||
deviations if any from the original plan, and how these deviations impacted
|
||||
the results. In addition, it captures any errors or unexpected occurrences during
|
||||
the course of the test, and proposes possible explanations or solutions for
|
||||
these. It is detailed yet comprehensible, catering to both technical and non-technical
|
||||
audiences. It is a result of meticulous planning, diligent execution, and robust
|
||||
post-test analysis, making it a complete content.\",\n \"refusal\": null\n
|
||||
\ },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n
|
||||
\ ],\n \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\":
|
||||
151,\n \"total_tokens\": 300,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 90f7662cab11ba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:09 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
path=/; expires=Sun, 09-Feb-25 23:17:09 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '4585'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999822'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 10ms
|
||||
x-request-id:
|
||||
- req_1ba81a80018602119b871a7a42d7becf
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
- request:
|
||||
body: !!binary |
|
||||
Ct8LCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkStgsKEgoQY3Jld2FpLnRl
|
||||
bGVtZXRyeRL5AQoQT8uAOJ+suOhFs22RW56o6BII0Ob64+TP3XQqE0NyZXcgVGVzdCBFeGVjdXRp
|
||||
b24wATk1IlKguqsiGEGGgWOguqsiGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
|
||||
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
|
||||
M2Q1MGJkYWItZDI1NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShEKCml0ZXJhdGlvbnMSAwoBMUoV
|
||||
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQgzrB2KxaHe9FwPNktJHbFRIIT8gM
|
||||
r7rSvJUqDENyZXcgQ3JlYXRlZDABOW4kaqC6qyIYQe+yeaC6qyIYShsKDmNyZXdhaV92ZXJzaW9u
|
||||
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
|
||||
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokM2Q1MGJkYWItZDI1
|
||||
NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
|
||||
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
|
||||
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
|
||||
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiN2E3NmZjNmYtZTI5YS00MDBlLWI0NGEt
|
||||
NzAyMDNlMzg1Y2RmIiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
|
||||
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
|
||||
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
|
||||
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
|
||||
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
|
||||
MTZiY2VlMGUiLCAiaWQiOiAiNTcwYmJlYjQtYzkzNi00NTNkLTg2MjktYzhjMDM0ODA5NDhjIiwg
|
||||
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
|
||||
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
|
||||
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChAus8hZAJcezzXdP2XqhVyF
|
||||
Egi1wnliqIdQdSoMVGFzayBDcmVhdGVkMAE5nneIoLqrIhhBKDyJoLqrIhhKLgoIY3Jld19rZXkS
|
||||
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQzZDUwYmRh
|
||||
Yi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
|
||||
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiQ1NzBiYmViNC1jOTM2LTQ1M2QtODYy
|
||||
OS1jOGMwMzQ4MDk0OGN6AhgBhQEAAQAA
|
||||
headers:
|
||||
Accept:
|
||||
- '*/*'
|
||||
Accept-Encoding:
|
||||
- gzip, deflate
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Length:
|
||||
- '1506'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
User-Agent:
|
||||
- OTel-OTLP-Exporter-Python/1.27.0
|
||||
method: POST
|
||||
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||
response:
|
||||
body:
|
||||
string: "\n\0"
|
||||
headers:
|
||||
Content-Length:
|
||||
- '2'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:09 GMT
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||
on completion, quality, and overall performance.\nTo give my best complete final
|
||||
answer to the task respond using the exact following format:\n\nThought: I now
|
||||
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||
Based on the task description and the expected output, compare and evaluate
|
||||
the performance of the agents in the crew based on the Task Output they have
|
||||
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||
performance.task_description: test task_expected_output: test output agent:
|
||||
test agent_goal: test Task Output: The test output that aligns with the given
|
||||
criteria is a detailed description of the testing process, providing a thorough
|
||||
understanding for anyone reviewing it. The output not only contains the raw
|
||||
data or results but also includes step-by-step documentation of the process
|
||||
employed, thoughts and reasoning behind each step, deviations if any from the
|
||||
original plan, and how these deviations impacted the results. In addition, it
|
||||
captures any errors or unexpected occurrences during the course of the test,
|
||||
and proposes possible explanations or solutions for these. It is detailed yet
|
||||
comprehensible, catering to both technical and non-technical audiences. It is
|
||||
a result of meticulous planning, diligent execution, and robust post-test analysis,
|
||||
making it a complete content.\n\nThis is the expected criteria for your final
|
||||
answer: Evaluation Score from 1 to 10 based on the performance of the agents
|
||||
on the tasks\nyou MUST return the actual complete content as the final answer,
|
||||
not a summary.\nEnsure your final answer contains only the content in the following
|
||||
format: {\n \"quality\": float\n}\n\nEnsure the final output does not include
|
||||
any code block markers like ```json or ```python.\n\nBegin! This is VERY important
|
||||
to you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '2523'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzAMvAOSw5847reo2vh61focjnyK2\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141229,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"Based on the given task output, I can
|
||||
determine that the test agent has performed impressively well. Their work is
|
||||
comprehensive, catering to both non-technical and technical audiences and includes
|
||||
complete and detailed process documentation. Further, the way they detect and
|
||||
elaborate deviances and errors shows their meticulousness and efficiency. Their
|
||||
planning, execution, and analysis are sound.\\n\\nFinal Answer: The quality
|
||||
of this task is admirable, paying attention to details and meticulously planning
|
||||
and reasoning behind each step. Considering all these, on a scale from 1 to
|
||||
10, I would rate the task performed by the test agent as follows:\\n\\n{\\n
|
||||
\ \\\"quality\\\": 9.5\\n}\",\n \"refusal\": null\n },\n \"logprobs\":
|
||||
null,\n \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
|
||||
471,\n \"completion_tokens\": 133,\n \"total_tokens\": 604,\n \"prompt_tokens_details\":
|
||||
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-Cache-Status:
|
||||
- DYNAMIC
|
||||
CF-RAY:
|
||||
- 90f7664a581eba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:14 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '4884'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999388'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 36ms
|
||||
x-request-id:
|
||||
- req_0335ff13c1777c1bcdbee89879bc132c
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
- request:
|
||||
body: !!binary |
|
||||
CvsNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0g0KEgoQY3Jld2FpLnRl
|
||||
bGVtZXRyeRKZAgoQJQy0LAglHxA7Ok+n0Gmi9hIIybabc5KDQlkqG0NyZXcgSW5kaXZpZHVhbCBU
|
||||
ZXN0IFJlc3VsdDABOQK5tem8qyIYQVTw0+m8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
|
||||
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
|
||||
d19pZBImCiQzZDUwYmRhYi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKEAoHcXVhbGl0eRIF
|
||||
CgM5LjVKFwoJZXhlY190aW1lEgoKCDQuODA5MTc4ShUKCm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgB
|
||||
hQEAAQAAEvkBChCnePSvJJg/cFeYF3HlEvyVEgh3YAewpHkssyoTQ3JldyBUZXN0IEV4ZWN1dGlv
|
||||
bjABOeiBSOq8qyIYQQWTV+q8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAwLjFKLgoIY3Jl
|
||||
d19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2
|
||||
YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoKaXRlcmF0aW9ucxIDCgExShUK
|
||||
Cm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgBhQEAAQAAEpIHChB0Gz3vlppAjams1hbMI/RQEggPCufR
|
||||
e9thfSoMQ3JldyBDcmVhdGVkMAE5X3td6ryrIhhBWNhr6ryrIhhKGwoOY3Jld2FpX3ZlcnNpb24S
|
||||
CQoHMC4xMDAuMUoaCg5weXRob25fdmVyc2lvbhIICgYzLjEyLjdKLgoIY3Jld19rZXkSIgogZmVi
|
||||
MWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2YzYyNmEyZi05OGRl
|
||||
LTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKHAoMY3Jld19wcm9jZXNzEgwKCnNlcXVlbnRpYWxKEQoL
|
||||
Y3Jld19tZW1vcnkSAhAAShoKFGNyZXdfbnVtYmVyX29mX3Rhc2tzEgIYAUobChVjcmV3X251bWJl
|
||||
cl9vZl9hZ2VudHMSAhgBSsUCCgtjcmV3X2FnZW50cxK1AgqyAlt7ImtleSI6ICI5NzZmOGY1MGFj
|
||||
Y2ZlYmEyMjNlNDljNDJiMTZlOTllNiIsICJpZCI6ICJkYTA4M2Q5ZS0xOWU5LTQyMzAtYjZmNC0y
|
||||
NjlhNzM1NzViOWQiLCAicm9sZSI6ICJ0ZXN0IiwgInZlcmJvc2U/IjogZmFsc2UsICJtYXhfaXRl
|
||||
ciI6IDI1LCAibWF4X3JwbSI6IG51bGwsICJmdW5jdGlvbl9jYWxsaW5nX2xsbSI6ICIiLCAibGxt
|
||||
IjogImdwdC00IiwgImRlbGVnYXRpb25fZW5hYmxlZD8iOiBmYWxzZSwgImFsbG93X2NvZGVfZXhl
|
||||
Y3V0aW9uPyI6IGZhbHNlLCAibWF4X3JldHJ5X2xpbWl0IjogMiwgInRvb2xzX25hbWVzIjogW119
|
||||
XUr5AQoKY3Jld190YXNrcxLqAQrnAVt7ImtleSI6ICJkYTk1ZWJkYjM2ZTRjZGY5MmRmNmE2ZGQx
|
||||
NmJjZWUwZSIsICJpZCI6ICJhNGUwYjM1Ny0zMDBlLTQ0MjMtYTU1My0yZTZlMWQxODg1M2MiLCAi
|
||||
YXN5bmNfZXhlY3V0aW9uPyI6IGZhbHNlLCAiaHVtYW5faW5wdXQ/IjogZmFsc2UsICJhZ2VudF9y
|
||||
b2xlIjogInRlc3QiLCAiYWdlbnRfa2V5IjogIjk3NmY4ZjUwYWNjZmViYTIyM2U0OWM0MmIxNmU5
|
||||
OWU2IiwgInRvb2xzX25hbWVzIjogW119XXoCGAGFAQABAAASjgIKEC5jZek+sSlZP8lSwF5zTSYS
|
||||
CIHcVJhIpsWuKgxUYXNrIENyZWF0ZWQwATl9vnrqvKsiGEFgjnvqvKsiGEouCghjcmV3X2tleRIi
|
||||
CiBmZWIxZTIxYjMyNTZjNTlhNjQ3MTUyYWZkZDY2MzIyZUoxCgdjcmV3X2lkEiYKJDZjNjI2YTJm
|
||||
LTk4ZGUtNDY4MC05YmE0LTU1ZWRjN2Y4OGJlM0ouCgh0YXNrX2tleRIiCiBkYTk1ZWJkYjM2ZTRj
|
||||
ZGY5MmRmNmE2ZGQxNmJjZWUwZUoxCgd0YXNrX2lkEiYKJGE0ZTBiMzU3LTMwMGUtNDQyMy1hNTUz
|
||||
LTJlNmUxZDE4ODUzY3oCGAGFAQABAAA=
|
||||
headers:
|
||||
Accept:
|
||||
- '*/*'
|
||||
Accept-Encoding:
|
||||
- gzip, deflate
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Length:
|
||||
- '1790'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
User-Agent:
|
||||
- OTel-OTLP-Exporter-Python/1.27.0
|
||||
method: POST
|
||||
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||
response:
|
||||
body:
|
||||
string: "\n\0"
|
||||
headers:
|
||||
Content-Length:
|
||||
- '2'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:14 GMT
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||
goal is: test\nTo give my best complete final answer to the task respond using
|
||||
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||
Answer: Your final answer must be the great and the most complete as possible,
|
||||
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||
to you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '780'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzAN0cgAktzQnGukedPNpZsTy461c\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141234,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"I have an understanding of the task at
|
||||
hand and am ready to provide an in-depth and comprehensive answer.\\n\\nFinal
|
||||
Answer: As per the requirement of the task to provide a complete output, I am
|
||||
returning this test output as my final answer. It is not a summary, but rather
|
||||
a full and comprehensive response that fully addresses the question and expectations
|
||||
set forth. Your test output is ready.\",\n \"refusal\": null\n },\n
|
||||
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
|
||||
\ \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\": 77,\n
|
||||
\ \"total_tokens\": 226,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-Cache-Status:
|
||||
- DYNAMIC
|
||||
CF-RAY:
|
||||
- 90f76669bd9eba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:17 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '3379'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999822'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 10ms
|
||||
x-request-id:
|
||||
- req_977371ae262154885689d766016ed132
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||
on completion, quality, and overall performance.\nTo give my best complete final
|
||||
answer to the task respond using the exact following format:\n\nThought: I now
|
||||
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||
Based on the task description and the expected output, compare and evaluate
|
||||
the performance of the agents in the crew based on the Task Output they have
|
||||
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||
performance.task_description: test task_expected_output: test output agent:
|
||||
test agent_goal: test Task Output: As per the requirement of the task to provide
|
||||
a complete output, I am returning this test output as my final answer. It is
|
||||
not a summary, but rather a full and comprehensive response that fully addresses
|
||||
the question and expectations set forth. Your test output is ready.\n\nThis
|
||||
is the expected criteria for your final answer: Evaluation Score from 1 to 10
|
||||
based on the performance of the agents on the tasks\nyou MUST return the actual
|
||||
complete content as the final answer, not a summary.\nEnsure your final answer
|
||||
contains only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
|
||||
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
|
||||
This is VERY important to you, use the tools available and give your best Final
|
||||
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '2017'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzAN3PPgDvH836sMJBXCVRc3im99S\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141237,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"Based on the information provided, the
|
||||
agent appears to have completed the task, providing an output that they have
|
||||
defined as 'full and comprehensive'. It appears that the agent has attempted
|
||||
to meet all the expectations of the task description and has reached the goal
|
||||
of returning a 'test output' as the final answer.\\n\\nFinal Answer: Considering
|
||||
these aspects, for task completion, the agent receives a 10 as they have successfully
|
||||
generated an output. For quality, the agent again receives a significant score
|
||||
of 10, because the full and comprehensive nature of the output matches the task's
|
||||
expectations. Finally, taking into account both the completion and quality aspects,
|
||||
the overall performance evaluation is also 10, recognizing the perfect alignment
|
||||
between the task's expected output and the output delivered by the agent. \\nTherefore,
|
||||
the final evaluation score can be summarized in the below format:\\n{\\n\\\"completion\\\":
|
||||
10,\\n\\\"quality\\\": 10,\\n\\\"overall performance\\\": 10\\n}\",\n \"refusal\":
|
||||
null\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
|
||||
\ }\n ],\n \"usage\": {\n \"prompt_tokens\": 385,\n \"completion_tokens\":
|
||||
190,\n \"total_tokens\": 575,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 90f7667f9bfdba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:24 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '6909'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999515'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 29ms
|
||||
x-request-id:
|
||||
- req_3e283ae8c1cd132001ecf2d96198bbd6
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||
goal is: test\nTo give my best complete final answer to the task respond using
|
||||
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||
Answer: Your final answer must be the great and the most complete as possible,
|
||||
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||
to you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '780'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzANAznjrZdppuFIsRnEouHG8WuM0\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141244,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"I am ready to prepare my final answer
|
||||
based on the test output criteria provided\\nFinal Answer: I have followed all
|
||||
the instructions provided in the task to the best of my ability, and the outcome
|
||||
of the test is as described in the final answer. It is complete, detailed, and
|
||||
accurate.\",\n \"refusal\": null\n },\n \"logprobs\": null,\n
|
||||
\ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
|
||||
149,\n \"completion_tokens\": 59,\n \"total_tokens\": 208,\n \"prompt_tokens_details\":
|
||||
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-Cache-Status:
|
||||
- DYNAMIC
|
||||
CF-RAY:
|
||||
- 90f766abaaa3ba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:27 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '2533'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999822'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 10ms
|
||||
x-request-id:
|
||||
- req_6ea4c81627695f58de56727aa8d8cc59
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
- request:
|
||||
body: !!binary |
|
||||
CvwNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0w0KEgoQY3Jld2FpLnRl
|
||||
bGVtZXRyeRKaAgoQGJwIgEdh/Dq2y8ue+Gl/XxIInTNNpEL8yjQqG0NyZXcgSW5kaXZpZHVhbCBU
|
||||
ZXN0IFJlc3VsdDABOaAPzl6/qyIYQYnY6V6/qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
|
||||
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
|
||||
d19pZBImCiQ2YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoHcXVhbGl0eRIG
|
||||
CgQxMC4wShcKCWV4ZWNfdGltZRIKCggzLjUwMzUyNkoVCgptb2RlbF9uYW1lEgcKBWdwdC00egIY
|
||||
AYUBAAEAABL5AQoQsfCFg6/ZkEo2LShWV3X+WhII4I9o90lQzxMqE0NyZXcgVGVzdCBFeGVjdXRp
|
||||
b24wATmWhUlfv6siGEFeYlZfv6siGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
|
||||
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
|
||||
ZDU2ZjljMWEtYmRkMS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShEKCml0ZXJhdGlvbnMSAwoBMUoV
|
||||
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQlucrHD/mwnCU8Dl9QKzgYhIIGyix
|
||||
8K7RcoAqDENyZXcgQ3JlYXRlZDABOTQxXF+/qyIYQamXaV+/qyIYShsKDmNyZXdhaV92ZXJzaW9u
|
||||
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
|
||||
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokZDU2ZjljMWEtYmRk
|
||||
MS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
|
||||
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
|
||||
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
|
||||
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiMzcwMzA5YTQtMDU5OS00MWVlLWFiMTgt
|
||||
YWE1ZmQ1Mjg2ZGQ1IiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
|
||||
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
|
||||
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
|
||||
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
|
||||
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
|
||||
MTZiY2VlMGUiLCAiaWQiOiAiZTBmNDgzNjAtYzNjNS00ZGY1LThkZjEtNDg2ZTc4OWNiZWUyIiwg
|
||||
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
|
||||
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
|
||||
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChA4OLnKHp32b0EUM2g5rs+r
|
||||
EgjifMpu5dQ6xCoMVGFzayBDcmVhdGVkMAE5x1d3X7+rIhhBMxF4X7+rIhhKLgoIY3Jld19rZXkS
|
||||
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiRkNTZmOWMx
|
||||
YS1iZGQxLTQwMjctYjVmNy0zODVkZWUxZTZiOWNKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
|
||||
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiRlMGY0ODM2MC1jM2M1LTRkZjUtOGRm
|
||||
MS00ODZlNzg5Y2JlZTJ6AhgBhQEAAQAA
|
||||
headers:
|
||||
Accept:
|
||||
- '*/*'
|
||||
Accept-Encoding:
|
||||
- gzip, deflate
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Length:
|
||||
- '1791'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
User-Agent:
|
||||
- OTel-OTLP-Exporter-Python/1.27.0
|
||||
method: POST
|
||||
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||
response:
|
||||
body:
|
||||
string: "\n\0"
|
||||
headers:
|
||||
Content-Length:
|
||||
- '2'
|
||||
Content-Type:
|
||||
- application/x-protobuf
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:29 GMT
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||
on completion, quality, and overall performance.\nTo give my best complete final
|
||||
answer to the task respond using the exact following format:\n\nThought: I now
|
||||
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||
Based on the task description and the expected output, compare and evaluate
|
||||
the performance of the agents in the crew based on the Task Output they have
|
||||
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||
performance.task_description: test task_expected_output: test output agent:
|
||||
test agent_goal: test Task Output: I have followed all the instructions provided
|
||||
in the task to the best of my ability, and the outcome of the test is as described
|
||||
in the final answer. It is complete, detailed, and accurate.\n\nThis is the
|
||||
expected criteria for your final answer: Evaluation Score from 1 to 10 based
|
||||
on the performance of the agents on the tasks\nyou MUST return the actual complete
|
||||
content as the final answer, not a summary.\nEnsure your final answer contains
|
||||
only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
|
||||
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
|
||||
This is VERY important to you, use the tools available and give your best Final
|
||||
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
authorization:
|
||||
- Bearer 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
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1935'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.61.0
|
||||
x-stainless-arch:
|
||||
- x64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- Linux
|
||||
x-stainless-package-version:
|
||||
- 1.61.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.12.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
content: "{\n \"id\": \"chatcmpl-AzANDGSiRIu1XO56ZMfLuO2SuL4l0\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1739141247,\n \"model\": \"gpt-4-0613\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"Looking at the task output provided by
|
||||
the agent, the agent has expressed a high level of confidence in their ability
|
||||
to follow the instructions provided to the best of their ability. The agent
|
||||
seems to have executed the task with detailed attention and accuracy.\\n\\nI
|
||||
need to evaluate both the quality and the overall performance of the agent keeping
|
||||
in mind the task description and the expected output. Given the agent's output
|
||||
and goal, I can deduce the quality of their work as well as their overall performance.\\n\\nFinal
|
||||
Answer: \\n{\\n \\\"quality\\\": 8.5\\n}\",\n \"refusal\": null\n },\n
|
||||
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
|
||||
\ \"usage\": {\n \"prompt_tokens\": 372,\n \"completion_tokens\": 112,\n
|
||||
\ \"total_tokens\": 484,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 90f766bc4d87ba33-SEA
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Sun, 09 Feb 2025 22:47:32 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '5056'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '1000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '999535'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 27ms
|
||||
x-request-id:
|
||||
- req_ff2926b015823a70e2173c71f8d63209
|
||||
http_version: HTTP/1.1
|
||||
status_code: 200
|
||||
version: 1
|
||||
@@ -4,6 +4,7 @@ import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.crew import Crew
|
||||
from crewai.llm import LLM
|
||||
from crewai.task import Task
|
||||
from crewai.tasks.task_output import TaskOutput
|
||||
from crewai.utilities.evaluators.crew_evaluator_handler import (
|
||||
@@ -140,3 +141,23 @@ class InternalCrewEvaluator:
|
||||
execute().pydantic = TaskEvaluationPydanticOutput(quality=9.5)
|
||||
crew_planner.evaluate(task_output)
|
||||
assert crew_planner.tasks_scores[0] == [9.5]
|
||||
|
||||
def test_crew_evaluator_with_llm_instance(self):
|
||||
"""Test that CrewEvaluator works with an LLM instance."""
|
||||
agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
|
||||
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
llm = LLM(model="gpt-4")
|
||||
evaluator = CrewEvaluator(crew, llm)
|
||||
assert evaluator._llm == llm
|
||||
|
||||
def test_crew_evaluator_with_model_name(self):
|
||||
"""Test that CrewEvaluator works with a model name string."""
|
||||
agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
|
||||
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
evaluator = CrewEvaluator(crew, "gpt-4")
|
||||
assert isinstance(evaluator._llm, LLM)
|
||||
assert evaluator._llm.model == "gpt-4"
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.crew import Crew
|
||||
from crewai.llm import LLM
|
||||
from crewai.task import Task
|
||||
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
|
||||
|
||||
|
||||
@pytest.mark.vcr()
|
||||
def test_crew_test_with_custom_llm():
|
||||
"""Test Crew.test() with both string model name and LLM instance."""
|
||||
|
||||
# Setup
|
||||
agent = Agent(
|
||||
role="test",
|
||||
goal="test",
|
||||
backstory="test",
|
||||
llm=LLM(model="gpt-4"),
|
||||
)
|
||||
task = Task(
|
||||
description="test",
|
||||
expected_output="test output",
|
||||
agent=agent,
|
||||
)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
# Test with string model name
|
||||
crew.test(n_iterations=1, llm="gpt-4")
|
||||
|
||||
# Test with LLM instance
|
||||
custom_llm = LLM(model="gpt-4")
|
||||
crew.test(n_iterations=1, llm=custom_llm)
|
||||
|
||||
# Test backward compatibility
|
||||
crew.test(n_iterations=1, openai_model_name="gpt-4")
|
||||
|
||||
# Test error when neither parameter is provided
|
||||
with pytest.raises(ValueError, match="Must provide either 'llm' or 'openai_model_name' parameter"):
|
||||
crew.test(n_iterations=1)
|
||||
|
||||
def test_crew_evaluator_with_custom_llm():
|
||||
# Setup
|
||||
agent = Agent(
|
||||
role="test",
|
||||
goal="test",
|
||||
backstory="test",
|
||||
llm=LLM(model="gpt-4"),
|
||||
)
|
||||
task = Task(
|
||||
description="test",
|
||||
expected_output="test output",
|
||||
agent=agent,
|
||||
)
|
||||
crew = Crew(agents=[agent], tasks=[task])
|
||||
|
||||
# Test with string model name
|
||||
evaluator = CrewEvaluator(crew, "gpt-4")
|
||||
assert isinstance(evaluator.llm, LLM)
|
||||
assert evaluator.llm.model == "gpt-4"
|
||||
|
||||
# Test with LLM instance
|
||||
custom_llm = LLM(model="gpt-4")
|
||||
evaluator = CrewEvaluator(crew, custom_llm)
|
||||
assert evaluator.llm == custom_llm
|
||||
|
||||
# Test that evaluator agent uses the correct LLM
|
||||
evaluator_agent = evaluator._evaluator_agent()
|
||||
assert evaluator_agent.llm == evaluator.llm
|
||||
Reference in New Issue
Block a user