mirror of
https://github.com/crewAIInc/crewAI.git
synced 2025-12-16 04:18:35 +00:00
Supporting eval single Agent/LiteAgent (#3167)
* refactor: rely on task completion event to evaluate agents * feat: remove Crew dependency to evaluate agent * feat: drop execution_context in AgentEvaluator * chore: drop experimental Agent Eval feature from stable crew.test * feat: support eval LiteAgent * resolve linter issues
This commit is contained in:
@@ -1313,7 +1313,6 @@ class Crew(FlowTrackable, BaseModel):
|
||||
n_iterations: int,
|
||||
eval_llm: Union[str, InstanceOf[BaseLLM]],
|
||||
inputs: Optional[Dict[str, Any]] = None,
|
||||
include_agent_eval: Optional[bool] = False
|
||||
) -> None:
|
||||
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
|
||||
try:
|
||||
@@ -1333,28 +1332,13 @@ class Crew(FlowTrackable, BaseModel):
|
||||
)
|
||||
test_crew = self.copy()
|
||||
|
||||
# TODO: Refator to use a single Evaluator Manage class
|
||||
evaluator = CrewEvaluator(test_crew, llm_instance)
|
||||
|
||||
if include_agent_eval:
|
||||
from crewai.experimental.evaluation import create_default_evaluator
|
||||
agent_evaluator = create_default_evaluator(crew=test_crew)
|
||||
|
||||
for i in range(1, n_iterations + 1):
|
||||
evaluator.set_iteration(i)
|
||||
|
||||
if include_agent_eval:
|
||||
agent_evaluator.set_iteration(i)
|
||||
|
||||
test_crew.kickoff(inputs=inputs)
|
||||
|
||||
# TODO: Refactor to use ListenerEvents instead of trigger each iteration manually
|
||||
if include_agent_eval:
|
||||
agent_evaluator.evaluate_current_iteration()
|
||||
|
||||
evaluator.print_crew_evaluation_result()
|
||||
if include_agent_eval:
|
||||
agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)
|
||||
|
||||
crewai_event_bus.emit(
|
||||
self,
|
||||
|
||||
@@ -3,32 +3,32 @@ from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
|
||||
|
||||
from typing import Any, Dict
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
|
||||
from collections.abc import Sequence
|
||||
from crewai.crew import Crew
|
||||
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
|
||||
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
|
||||
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
|
||||
from contextlib import contextmanager
|
||||
from crewai.utilities.events.task_events import TaskCompletedEvent
|
||||
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
|
||||
import threading
|
||||
|
||||
class ExecutionState:
|
||||
def __init__(self):
|
||||
self.traces: dict[str, Any] = {}
|
||||
self.current_agent_id: str | None = None
|
||||
self.current_task_id: str | None = None
|
||||
self.iteration: int = 1
|
||||
self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}
|
||||
self.traces = {}
|
||||
self.current_agent_id = None
|
||||
self.current_task_id = None
|
||||
self.iteration = 1
|
||||
self.iterations_results = {}
|
||||
self.agent_evaluators = {}
|
||||
|
||||
class AgentEvaluator:
|
||||
def __init__(
|
||||
self,
|
||||
agents: list[Agent],
|
||||
evaluators: Sequence[BaseEvaluator] | None = None,
|
||||
crew: Crew | None = None,
|
||||
):
|
||||
self.crew: Crew | None = crew
|
||||
self.agents: list[Agent] = agents
|
||||
self.evaluators: Sequence[BaseEvaluator] | None = evaluators
|
||||
|
||||
self.callback = create_evaluation_callbacks()
|
||||
@@ -37,19 +37,10 @@ class AgentEvaluator:
|
||||
|
||||
self._thread_local: threading.local = threading.local()
|
||||
|
||||
self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
|
||||
if crew is not None:
|
||||
assert crew and crew.agents is not None
|
||||
for agent in crew.agents:
|
||||
self.agent_evaluators[str(agent.id)] = self.evaluators
|
||||
for agent in self.agents:
|
||||
self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators
|
||||
|
||||
@contextmanager
|
||||
def execution_context(self):
|
||||
state = ExecutionState()
|
||||
try:
|
||||
yield state
|
||||
finally:
|
||||
pass
|
||||
self._subscribe_to_events()
|
||||
|
||||
@property
|
||||
def _execution_state(self) -> ExecutionState:
|
||||
@@ -57,81 +48,95 @@ class AgentEvaluator:
|
||||
self._thread_local.execution_state = ExecutionState()
|
||||
return self._thread_local.execution_state
|
||||
|
||||
def _subscribe_to_events(self) -> None:
|
||||
crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
|
||||
crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)
|
||||
|
||||
def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
|
||||
assert event.task is not None
|
||||
agent = event.task.agent
|
||||
if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
|
||||
state = ExecutionState()
|
||||
state.current_agent_id = str(agent.id)
|
||||
state.current_task_id = str(event.task.id)
|
||||
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
|
||||
result = self.evaluate(
|
||||
agent=agent,
|
||||
task=event.task,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
if current_iteration not in self._execution_state.iterations_results:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
if agent.role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent.role] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent.role].append(result)
|
||||
|
||||
def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
|
||||
agent_info = event.agent_info
|
||||
agent_id = str(agent_info["id"])
|
||||
|
||||
if agent_id in self._execution_state.agent_evaluators:
|
||||
state = ExecutionState()
|
||||
state.current_agent_id = agent_id
|
||||
state.current_task_id = "lite_task"
|
||||
|
||||
target_agent = None
|
||||
for agent in self.agents:
|
||||
if str(agent.id) == agent_id:
|
||||
target_agent = agent
|
||||
break
|
||||
|
||||
if not target_agent:
|
||||
return
|
||||
|
||||
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
|
||||
|
||||
if not trace:
|
||||
return
|
||||
|
||||
result = self.evaluate(
|
||||
agent=target_agent,
|
||||
execution_trace=trace,
|
||||
final_output=event.output,
|
||||
state=state
|
||||
)
|
||||
|
||||
current_iteration = self._execution_state.iteration
|
||||
if current_iteration not in self._execution_state.iterations_results:
|
||||
self._execution_state.iterations_results[current_iteration] = {}
|
||||
|
||||
agent_role = target_agent.role
|
||||
if agent_role not in self._execution_state.iterations_results[current_iteration]:
|
||||
self._execution_state.iterations_results[current_iteration][agent_role] = []
|
||||
|
||||
self._execution_state.iterations_results[current_iteration][agent_role].append(result)
|
||||
|
||||
def set_iteration(self, iteration: int) -> None:
|
||||
self._execution_state.iteration = iteration
|
||||
|
||||
def reset_iterations_results(self) -> None:
|
||||
self._execution_state.iterations_results = {}
|
||||
|
||||
def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
|
||||
if not self.crew:
|
||||
raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
|
||||
|
||||
if not self.callback:
|
||||
raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")
|
||||
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
||||
evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)
|
||||
|
||||
total_evals = 0
|
||||
for agent in self.crew.agents:
|
||||
for task in self.crew.tasks:
|
||||
if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
|
||||
total_evals += 1
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[bold blue]{task.description}[/bold blue]"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.percentage:.0f}% completed"),
|
||||
console=self.console_formatter.console
|
||||
) as progress:
|
||||
eval_task = progress.add_task(f"Evaluating agents (iteration {self._execution_state.iteration})...", total=total_evals)
|
||||
|
||||
with self.execution_context() as state:
|
||||
state.iteration = self._execution_state.iteration
|
||||
|
||||
for agent in self.crew.agents:
|
||||
evaluator = self.agent_evaluators.get(str(agent.id))
|
||||
if not evaluator:
|
||||
continue
|
||||
|
||||
for task in self.crew.tasks:
|
||||
if task.agent and str(task.agent.id) != str(agent.id):
|
||||
continue
|
||||
|
||||
trace = self.callback.get_trace(str(agent.id), str(task.id))
|
||||
if not trace:
|
||||
self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
|
||||
progress.update(eval_task, advance=1)
|
||||
continue
|
||||
|
||||
state.current_agent_id = str(agent.id)
|
||||
state.current_task_id = str(task.id)
|
||||
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
result = self.evaluate(
|
||||
agent=agent,
|
||||
task=task,
|
||||
execution_trace=trace,
|
||||
final_output=task.output,
|
||||
state=state
|
||||
)
|
||||
evaluation_results[agent.role].append(result)
|
||||
progress.update(eval_task, advance=1)
|
||||
|
||||
self._execution_state.iterations_results[self._execution_state.iteration] = evaluation_results
|
||||
return evaluation_results
|
||||
|
||||
def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
|
||||
if self._execution_state.iteration in self._execution_state.iterations_results:
|
||||
if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
|
||||
return self._execution_state.iterations_results[self._execution_state.iteration]
|
||||
return self.evaluate_current_iteration()
|
||||
return {}
|
||||
|
||||
def display_results_with_iterations(self) -> None:
|
||||
self.display_formatter.display_summary_results(self._execution_state.iterations_results)
|
||||
|
||||
def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
|
||||
def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
|
||||
agent_results = {}
|
||||
with crewai_event_bus.scoped_handlers():
|
||||
task_results = self.get_evaluation_results()
|
||||
@@ -165,14 +170,14 @@ class AgentEvaluator:
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: dict[str, Any],
|
||||
final_output: Any,
|
||||
state: ExecutionState
|
||||
state: ExecutionState,
|
||||
task: Task | None = None,
|
||||
) -> AgentEvaluationResult:
|
||||
result = AgentEvaluationResult(
|
||||
agent_id=state.current_agent_id or str(agent.id),
|
||||
task_id=state.current_task_id or str(task.id)
|
||||
task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
|
||||
)
|
||||
|
||||
assert self.evaluators is not None
|
||||
@@ -190,7 +195,7 @@ class AgentEvaluator:
|
||||
|
||||
return result
|
||||
|
||||
def create_default_evaluator(crew, llm=None):
|
||||
def create_default_evaluator(agents: list[Agent], llm: None = None):
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
@@ -209,4 +214,4 @@ def create_default_evaluator(crew, llm=None):
|
||||
ReasoningEfficiencyEvaluator(llm=llm),
|
||||
]
|
||||
|
||||
return AgentEvaluator(evaluators=evaluators, crew=crew)
|
||||
return AgentEvaluator(evaluators=evaluators, agents=agents)
|
||||
|
||||
@@ -57,9 +57,9 @@ class BaseEvaluator(abc.ABC):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: Any,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
pass
|
||||
|
||||
|
||||
@@ -9,7 +9,9 @@ from crewai.utilities.events.base_event_listener import BaseEventListener
|
||||
from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
|
||||
from crewai.utilities.events.agent_events import (
|
||||
AgentExecutionStartedEvent,
|
||||
AgentExecutionCompletedEvent
|
||||
AgentExecutionCompletedEvent,
|
||||
LiteAgentExecutionStartedEvent,
|
||||
LiteAgentExecutionCompletedEvent
|
||||
)
|
||||
from crewai.utilities.events.tool_usage_events import (
|
||||
ToolUsageFinishedEvent,
|
||||
@@ -52,10 +54,18 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
def on_agent_started(source, event: AgentExecutionStartedEvent):
|
||||
self.on_agent_start(event.agent, event.task)
|
||||
|
||||
@event_bus.on(LiteAgentExecutionStartedEvent)
|
||||
def on_lite_agent_started(source, event: LiteAgentExecutionStartedEvent):
|
||||
self.on_lite_agent_start(event.agent_info)
|
||||
|
||||
@event_bus.on(AgentExecutionCompletedEvent)
|
||||
def on_agent_completed(source, event: AgentExecutionCompletedEvent):
|
||||
self.on_agent_finish(event.agent, event.task, event.output)
|
||||
|
||||
@event_bus.on(LiteAgentExecutionCompletedEvent)
|
||||
def on_lite_agent_completed(source, event: LiteAgentExecutionCompletedEvent):
|
||||
self.on_lite_agent_finish(event.output)
|
||||
|
||||
@event_bus.on(ToolUsageFinishedEvent)
|
||||
def on_tool_completed(source, event: ToolUsageFinishedEvent):
|
||||
self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
|
||||
@@ -88,19 +98,38 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
def on_llm_call_completed(source, event: LLMCallCompletedEvent):
|
||||
self.on_llm_call_end(event.messages, event.response)
|
||||
|
||||
def on_lite_agent_start(self, agent_info: dict[str, Any]):
|
||||
self.current_agent_id = agent_info['id']
|
||||
self.current_task_id = "lite_task"
|
||||
|
||||
trace_key = f"{self.current_agent_id}_{self.current_task_id}"
|
||||
self._init_trace(
|
||||
trace_key=trace_key,
|
||||
agent_id=self.current_agent_id,
|
||||
task_id=self.current_task_id,
|
||||
tool_uses=[],
|
||||
llm_calls=[],
|
||||
start_time=datetime.now(),
|
||||
final_output=None
|
||||
)
|
||||
|
||||
def _init_trace(self, trace_key: str, **kwargs: Any):
|
||||
self.traces[trace_key] = kwargs
|
||||
|
||||
def on_agent_start(self, agent: Agent, task: Task):
|
||||
self.current_agent_id = agent.id
|
||||
self.current_task_id = task.id
|
||||
|
||||
trace_key = f"{agent.id}_{task.id}"
|
||||
self.traces[trace_key] = {
|
||||
"agent_id": agent.id,
|
||||
"task_id": task.id,
|
||||
"tool_uses": [],
|
||||
"llm_calls": [],
|
||||
"start_time": datetime.now(),
|
||||
"final_output": None
|
||||
}
|
||||
self._init_trace(
|
||||
trace_key=trace_key,
|
||||
agent_id=agent.id,
|
||||
task_id=task.id,
|
||||
tool_uses=[],
|
||||
llm_calls=[],
|
||||
start_time=datetime.now(),
|
||||
final_output=None
|
||||
)
|
||||
|
||||
def on_agent_finish(self, agent: Agent, task: Task, output: Any):
|
||||
trace_key = f"{agent.id}_{task.id}"
|
||||
@@ -108,9 +137,20 @@ class EvaluationTraceCallback(BaseEventListener):
|
||||
self.traces[trace_key]["final_output"] = output
|
||||
self.traces[trace_key]["end_time"] = datetime.now()
|
||||
|
||||
self._reset_current()
|
||||
|
||||
def _reset_current(self):
|
||||
self.current_agent_id = None
|
||||
self.current_task_id = None
|
||||
|
||||
def on_lite_agent_finish(self, output: Any):
|
||||
trace_key = f"{self.current_agent_id}_lite_task"
|
||||
if trace_key in self.traces:
|
||||
self.traces[trace_key]["final_output"] = output
|
||||
self.traces[trace_key]["end_time"] = datetime.now()
|
||||
|
||||
self._reset_current()
|
||||
|
||||
def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
|
||||
success: bool = True, error_type: str | None = None):
|
||||
if not self.current_agent_id or not self.current_task_id:
|
||||
|
||||
@@ -2,7 +2,7 @@ from collections import defaultdict
|
||||
from hashlib import md5
|
||||
from typing import Any
|
||||
|
||||
from crewai import Crew
|
||||
from crewai import Crew, Agent
|
||||
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
|
||||
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
|
||||
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
|
||||
@@ -14,14 +14,18 @@ class ExperimentRunner:
|
||||
self.evaluator: AgentEvaluator | None = None
|
||||
self.display = ExperimentResultsDisplay()
|
||||
|
||||
def run(self, crew: Crew, print_summary: bool = False) -> ExperimentResults:
|
||||
self.evaluator = create_default_evaluator(crew=crew)
|
||||
def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
|
||||
if crew and not agents:
|
||||
agents = crew.agents
|
||||
|
||||
assert agents is not None
|
||||
self.evaluator = create_default_evaluator(agents=agents)
|
||||
|
||||
results = []
|
||||
|
||||
for test_case in self.dataset:
|
||||
self.evaluator.reset_iterations_results()
|
||||
result = self._run_test_case(test_case, crew)
|
||||
result = self._run_test_case(test_case=test_case, crew=crew, agents=agents)
|
||||
results.append(result)
|
||||
|
||||
experiment_results = ExperimentResults(results)
|
||||
@@ -31,7 +35,7 @@ class ExperimentRunner:
|
||||
|
||||
return experiment_results
|
||||
|
||||
def _run_test_case(self, test_case: dict[str, Any], crew: Crew) -> ExperimentResult:
|
||||
def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
|
||||
inputs = test_case["inputs"]
|
||||
expected_score = test_case["expected_score"]
|
||||
identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
|
||||
@@ -39,7 +43,11 @@ class ExperimentRunner:
|
||||
try:
|
||||
self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
|
||||
self.display.console.print("\n")
|
||||
crew.kickoff(inputs=inputs)
|
||||
if crew:
|
||||
crew.kickoff(inputs=inputs)
|
||||
else:
|
||||
for agent in agents:
|
||||
agent.kickoff(**inputs)
|
||||
|
||||
assert self.evaluator is not None
|
||||
agent_evaluations = self.evaluator.get_agent_evaluation()
|
||||
|
||||
@@ -14,10 +14,14 @@ class GoalAlignmentEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: Any,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
|
||||
|
||||
prompt = [
|
||||
{"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
|
||||
|
||||
@@ -37,8 +41,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Agent goal: {agent.goal}
|
||||
Task description: {task.description}
|
||||
Expected output: {task.expected_output}
|
||||
{task_context}
|
||||
|
||||
Agent's final output:
|
||||
{final_output}
|
||||
|
||||
@@ -36,10 +36,14 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: TaskOutput,
|
||||
final_output: TaskOutput | str,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
|
||||
|
||||
llm_calls = execution_trace.get("llm_calls", [])
|
||||
|
||||
if not llm_calls or len(llm_calls) < 2:
|
||||
@@ -83,6 +87,8 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
|
||||
|
||||
call_samples = self._get_call_samples(llm_calls)
|
||||
|
||||
final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output
|
||||
|
||||
prompt = [
|
||||
{"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.
|
||||
|
||||
@@ -117,7 +123,7 @@ Return your evaluation as JSON with the following structure:
|
||||
}"""},
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Task description: {task.description}
|
||||
{task_context}
|
||||
|
||||
Reasoning efficiency metrics:
|
||||
- Total LLM calls: {efficiency_metrics["total_llm_calls"]}
|
||||
@@ -130,7 +136,7 @@ Sample of agent reasoning flow (chronological sequence):
|
||||
{call_samples}
|
||||
|
||||
Agent's final output:
|
||||
{final_output.raw[:500]}... (truncated)
|
||||
{final_output[:500]}... (truncated)
|
||||
|
||||
Evaluate the reasoning efficiency of this agent based on these interaction patterns.
|
||||
Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
|
||||
|
||||
@@ -14,10 +14,13 @@ class SemanticQualityEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: Any,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}"
|
||||
prompt = [
|
||||
{"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
|
||||
|
||||
@@ -37,7 +40,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
|
||||
"""},
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Task description: {task.description}
|
||||
{task_context}
|
||||
|
||||
Agent's final output:
|
||||
{final_output}
|
||||
|
||||
@@ -16,10 +16,14 @@ class ToolSelectionEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: str,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}"
|
||||
|
||||
tool_uses = execution_trace.get("tool_uses", [])
|
||||
tool_count = len(tool_uses)
|
||||
unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
|
||||
@@ -72,7 +76,7 @@ Return your evaluation as JSON with these fields:
|
||||
"""},
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Task description: {task.description}
|
||||
{task_context}
|
||||
|
||||
Available tools for this agent:
|
||||
{available_tools_info}
|
||||
@@ -128,10 +132,13 @@ class ParameterExtractionEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: str,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}"
|
||||
tool_uses = execution_trace.get("tool_uses", [])
|
||||
tool_count = len(tool_uses)
|
||||
|
||||
@@ -212,7 +219,7 @@ Return your evaluation as JSON with these fields:
|
||||
"""},
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Task description: {task.description}
|
||||
{task_context}
|
||||
|
||||
Parameter extraction examples:
|
||||
{param_samples_text}
|
||||
@@ -267,10 +274,13 @@ class ToolInvocationEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: str,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}"
|
||||
tool_uses = execution_trace.get("tool_uses", [])
|
||||
tool_errors = []
|
||||
tool_count = len(tool_uses)
|
||||
@@ -352,7 +362,7 @@ Return your evaluation as JSON with these fields:
|
||||
"""},
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Task description: {task.description}
|
||||
{task_context}
|
||||
|
||||
Tool invocation examples:
|
||||
{invocation_samples_text}
|
||||
|
||||
@@ -3,7 +3,7 @@ import inspect
|
||||
from typing_extensions import Any
|
||||
import warnings
|
||||
from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
|
||||
from crewai import Crew
|
||||
from crewai import Crew, Agent
|
||||
|
||||
def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
|
||||
failed_tests = [result for result in experiment_results.results if not result.passed]
|
||||
@@ -35,10 +35,10 @@ def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) ->
|
||||
UserWarning
|
||||
)
|
||||
|
||||
def run_experiment(dataset: list[dict[str, Any]], crew: Crew, verbose: bool = False) -> ExperimentResults:
|
||||
def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
return runner.run(crew=crew, print_summary=verbose)
|
||||
return runner.run(agents=agents, crew=crew, print_summary=verbose)
|
||||
|
||||
def _get_baseline_filepath_fallback() -> str:
|
||||
test_func_name = "experiment_fallback"
|
||||
|
||||
@@ -305,6 +305,7 @@ class LiteAgent(FlowTrackable, BaseModel):
|
||||
"""
|
||||
# Create agent info for event emission
|
||||
agent_info = {
|
||||
"id": self.id,
|
||||
"role": self.role,
|
||||
"goal": self.goal,
|
||||
"backstory": self.backstory,
|
||||
|
||||
@@ -4,6 +4,7 @@ from .agent_events import (
|
||||
AgentExecutionCompletedEvent,
|
||||
AgentExecutionErrorEvent,
|
||||
AgentExecutionStartedEvent,
|
||||
LiteAgentExecutionCompletedEvent,
|
||||
)
|
||||
from .crew_events import (
|
||||
CrewKickoffCompletedEvent,
|
||||
@@ -80,6 +81,7 @@ EventTypes = Union[
|
||||
CrewTrainFailedEvent,
|
||||
AgentExecutionStartedEvent,
|
||||
AgentExecutionCompletedEvent,
|
||||
LiteAgentExecutionCompletedEvent,
|
||||
TaskStartedEvent,
|
||||
TaskCompletedEvent,
|
||||
TaskFailedEvent,
|
||||
|
||||
237
tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
Normal file
237
tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
Normal file
@@ -0,0 +1,237 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
|
||||
created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
|
||||
give my best complete final answer to the task respond using the exact following
|
||||
format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
|
||||
answer must be the great and the most complete as possible, it must be outcome
|
||||
described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
|
||||
"content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
|
||||
["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '583'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//jFNNb9swDL3nVxA6J0U+HKTNbd0woMAOw7Bu6LbCUCXa1iqLgkgnzYr8
|
||||
98FKWqdbB+wiQHx81OMj9TgCUM6qNSjTaDFt9JNL+TZ7N/dfrusPN01NyV6vPk3f/mrl5vLrXI17
|
||||
Bt39RCNPrDNDbfQojsIBNgm1YF91tlrOl+fzxXKWgZYs+p5WR5kUNGldcJP5dF5MpqvJ7PzIbsgZ
|
||||
ZLWG7yMAgMd89jqDxQe1hun4KdIis65RrZ+TAFQi30eUZnYsOogaD6ChIBiy9M8NdXUja7iCQFsw
|
||||
OkDtNgga6l4/6MBbTAA/wnsXtIc3+b6Gjx41I8REG2cRWoStkwakQeCIxlXOgEXRzjNQgvzigwBV
|
||||
OUU038OOOgiIFhr0MdPHoIOFK9g67wEDdwlBCI7OIjgB7oxB5qrzfpeznxRokIZS3wwk5EiB8ey0
|
||||
54RVx7r3PXTenwA6BBLdzy27fXtE9s/+eqpjojv+g6oqFxw3ZULNFHovWSiqjO5HALd5jt2L0aiY
|
||||
qI1SCt1jfu7i4lBODdszgEVxBIVE+yE+KxbjV8qVR79PFkEZbRq0A3XYGt1ZRyfA6KTpv9W8VvvQ
|
||||
uAv1/5QfAGMwCtoyJrTOvOx4SEvYf65/pT2bnAUrxrRxBktxmPpBWKx05w8rr3jHgm1ZuVBjiskd
|
||||
9r6K5aLQy0LjxcKo0X70GwAA//8DAMz2wVUFBAAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f93ea9af627e0b-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Tue, 15 Jul 2025 12:25:54 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
|
||||
path=/; expires=Tue, 15-Jul-25 12:55:54 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '4047'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '4440'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999885'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_5704c0f206a927ddc12aa1a19b612a75
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
|
||||
assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
|
||||
the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
|
||||
agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
|
||||
attempted the task but missed key requirements\n- 10: Perfect alignment, agent
|
||||
fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
|
||||
interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
|
||||
Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
|
||||
all requested information or deliverables?\n\nReturn your evaluation as JSON
|
||||
with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
|
||||
"content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\n\n\nAgent''s
|
||||
final output:\nPlease provide me with the specific details or context of the
|
||||
task you need help with, and I will ensure to complete it successfully and provide
|
||||
a thorough response.\n\nEvaluate how well the agent''s output aligns with the
|
||||
assigned task goal.\n"}], "model": "gpt-4o-mini", "stop": []}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1196'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
|
||||
_cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAA4xUy27bQAy8+yuIPdtGbMdN4FvbSxM0QIsEKNA6MJhdSmK82hWWVFwj8L8XKz/k
|
||||
9AH0ogOHnOFjVq8DAMPOLMDYCtXWjR990O+TT7dfZs/v5OtFy/ef7++mxfu7j83t/cONGeaK+PRM
|
||||
Vo9VYxvrxpNyDHvYJkKlzDq5mk/n19PZfN4BdXTkc1nZ6OgyjmoOPJpeTC9HF1ejyfWhuopsScwC
|
||||
fgwAAF67b+4zOPppFnAxPEZqEsGSzOKUBGBS9DliUIRFMagZ9qCNQSl0rb8uA8DSiI2JlmYB0+E+
|
||||
UBC5J7TrHFuah4oASwoKjh2EqOCojkE0oRIgWE+YoA2OUhZzHEqIBWhFoChrKCP6IWwqthWwgEY4
|
||||
bItASbRLEpDWWhIpWu+3Y7gJooRuCKyAsiYHRUxQx0TgSJG9DIGDY4ua5RA82nVW5cDKqPxCWYhC
|
||||
iSXBhrU69TOGbxV7ysxSxY0Awoa951AGkq69/do67QLZk8vBJsUXdgQYtoBWW/SQSJoYpFPq2Ptp
|
||||
MLjTttC51DFXVIPjRFb9drw0y7A7v0uiohXM3git92cAhhAVs7c6RzwekN3JAz6WTYpP8lupKTiw
|
||||
VKtEKDHke4vGxnTobgDw2HmtfWMf06RYN7rSuKZObjo7eM30Fu/R6yOoUdH38dnkCLzhWx1ud+ZW
|
||||
Y9FW5PrS3trYOo5nwOBs6j+7+Rv3fnIO5f/Q94C11Ci5VZPIsX07cZ+WKP8B/pV22nLXsBFKL2xp
|
||||
pUwpX8JRga3fv0sjW1GqVwWHklKTuHuc+ZKD3eAXAAAA//8DADksFsafBAAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f93ec73a1c7e0b-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Tue, 15 Jul 2025 12:25:57 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '1544'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '1546'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999732'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_44930ba12ad8d1e3f0beed1d5e3d8b0c
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
File diff suppressed because one or more lines are too long
@@ -427,4 +427,140 @@ interactions:
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
|
||||
assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
|
||||
the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
|
||||
agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
|
||||
attempted the task but missed key requirements\n- 10: Perfect alignment, agent
|
||||
fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
|
||||
interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
|
||||
Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
|
||||
all requested information or deliverables?\n\nReturn your evaluation as JSON
|
||||
with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
|
||||
"content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\nTask
|
||||
description: Test task description\nExpected output: Expected test output\n\nAgent''s
|
||||
final output:\nThe expected test output is a comprehensive document that outlines
|
||||
the specific parameters and criteria that define success for the task at hand.
|
||||
It should include detailed descriptions of the tasks, the goals that need to
|
||||
be achieved, and any specific formatting or structural requirements necessary
|
||||
for the output. Each component of the task must be analyzed and addressed, providing
|
||||
context as well as examples where applicable. Additionally, any tools or methodologies
|
||||
that are relevant to executing the tasks successfully should be outlined, including
|
||||
any potential risks or challenges that may arise during the process. This document
|
||||
serves as a guiding framework to ensure that all aspects of the task are thoroughly
|
||||
considered and executed to meet the high standards expected.\n\nEvaluate how
|
||||
well the agent''s output aligns with the assigned task goal.\n"}], "model":
|
||||
"gpt-4o-mini", "stop": []}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1893'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- _cfuvid=XwsgBfgvDGlKFQ4LiGYGIARIoSNTiwidqoo9UZcc.XY-1752087999227-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//jFRNbxs5DL37VxA6jwPHddrUxxwWi2BRtEAPRevCYCSOh41GUkWOnTTI
|
||||
fy8kf4zT5rCXOfCRT4+P5DxNAAw7swRjO1TbJz+90dvFxy//vX0za7dfr29+3eo/n75++Mh0O/za
|
||||
maZUxLsfZPVYdWFjnzwpx7CHbSZUKqyX767mV/PL2eKqAn105EvZJul0Eac9B57OZ/PFdPZuenl9
|
||||
qO4iWxKzhG8TAICn+i06g6MHs4RZc4z0JIIbMstTEoDJ0ZeIQREWxaCmGUEbg1Ko0p9WAWBlxMZM
|
||||
K7OEq2YfaIncHdr7EluZzx0BbigopBy37MgBgiNF9uTAkdjMqbQOsYVdhwraEdBDIqvkIA6aBgXp
|
||||
4uAdcLB+cNTArmPbAQfHFpUEJPYEQ3CUi2LHYVPoCpOi3EOmnwNn6imoXMC/cUdbyk3FWw7oj8+4
|
||||
SAIhKkgiyy1b9P4RHHneUn4pTEn0WIYC6YDX5866aqDH+yKHFRJm5cqInjeB3AWM7vQsUgzhTFb9
|
||||
48GtUlloSwMkZ4bEDMetOaSg1QH9XldVwSrk2wY4iBLWSs/hmG47zGiVMouylZP7WHkzdRSEtwQu
|
||||
2qH4dhyBjcWKHWsXhzJTEgpVAwagByySirgzRSfLDrtzsTKr8Hy+VJnaQbAsdhi8PwMwhKhYfKzr
|
||||
/P2APJ8W2MdNyvFO/ig1LQeWbp0JJYayrKIxmYo+TwC+10MZXuy+STn2Sdca76k+92ax2POZ8T5H
|
||||
9P31AdSo6Mf4YjFvXuFb71dezk7NWLQdubF0vEscHMczYHLW9d9qXuPed85h83/oR8BaSkpunTI5
|
||||
ti87HtMy/agTfT3t5HIVbITyli2tlSmXSThqcfD7n4qRR1Hq1y2HDeWUuf5ZyiQnz5PfAAAA//8D
|
||||
AEfUP8BcBQAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f365f1bfc87ded-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Mon, 14 Jul 2025 19:24:07 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=PcC3_3T8.MK_WpZlQLdZfwpNv9Pe45AIYmrXOSgJ65E-1752521047-1.0.1.1-eyqwSWfQC7ZV6.JwTsTihK1ZWCrEmxd52CtNcfe.fw1UjjBN9rdTU4G7hRZiNqHQYo4sVZMmgRgqM9k7HRSzN2zln0bKmMiOuSQTZh6xF_I;
|
||||
path=/; expires=Mon, 14-Jul-25 19:54:07 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=JvQ1c4qYZefNwOPoVNgAtX8ET7ObU.JKDvGc43LOR6g-1752521047741-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '2729'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '2789'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999559'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_74f6e8ff49db25dbea3d3525cc149e8e
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
|
||||
@@ -15,6 +15,7 @@ from crewai.experimental.evaluation import (
|
||||
)
|
||||
|
||||
from crewai.experimental.evaluation import create_default_evaluator
|
||||
|
||||
class TestAgentEvaluator:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
@@ -39,18 +40,18 @@ class TestAgentEvaluator:
|
||||
return crew
|
||||
|
||||
def test_set_iteration(self):
|
||||
agent_evaluator = AgentEvaluator()
|
||||
agent_evaluator = AgentEvaluator(agents=[])
|
||||
|
||||
agent_evaluator.set_iteration(3)
|
||||
assert agent_evaluator._execution_state.iteration == 3
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_evaluate_current_iteration(self, mock_crew):
|
||||
agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
|
||||
agent_evaluator = AgentEvaluator(agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.evaluate_current_iteration()
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
@@ -70,16 +71,16 @@ class TestAgentEvaluator:
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
def test_create_default_evaluator(self, mock_crew):
|
||||
agent_evaluator = create_default_evaluator(crew=mock_crew)
|
||||
agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
|
||||
assert isinstance(agent_evaluator, AgentEvaluator)
|
||||
assert agent_evaluator.crew == mock_crew
|
||||
assert agent_evaluator.agents == mock_crew.agents
|
||||
|
||||
expected_types = [
|
||||
GoalAlignmentEvaluator,
|
||||
@@ -93,3 +94,71 @@ class TestAgentEvaluator:
|
||||
assert len(agent_evaluator.evaluators) == len(expected_types)
|
||||
for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
|
||||
assert isinstance(evaluator, expected_type)
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_eval_lite_agent(self):
|
||||
agent = Agent(
|
||||
role="Test Agent",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
agent.kickoff(messages="Complete this task successfully")
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == "lite_task"
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 2.0
|
||||
|
||||
expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 2' in goal_alignment.raw_response
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_eval_specific_agents_from_crew(self, mock_crew):
|
||||
agent = Agent(
|
||||
role="Test Agent Eval",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output"
|
||||
)
|
||||
mock_crew.agents.append(agent)
|
||||
mock_crew.tasks.append(task)
|
||||
|
||||
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
assert len(results.keys()) == 1
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
Reference in New Issue
Block a user