chore: resolve all ruff and mypy issues in experimental module

resolve linting, typing, and import issues; update Okta test
Authored by Greyson LaLonde on 2025-09-22 12:56:28 -04:00 · committed by GitHub
parent aa8dc9d77f
commit 0e370593f1
17 changed files with 595 additions and 402 deletions

View File

@@ -1,40 +1,39 @@
 from crewai.experimental.evaluation import (
+    AgentEvaluationResult,
+    AgentEvaluator,
     BaseEvaluator,
     EvaluationScore,
-    MetricCategory,
-    AgentEvaluationResult,
-    SemanticQualityEvaluator,
-    GoalAlignmentEvaluator,
-    ReasoningEfficiencyEvaluator,
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator,
     EvaluationTraceCallback,
-    create_evaluation_callbacks,
-    AgentEvaluator,
-    create_default_evaluator,
-    ExperimentRunner,
-    ExperimentResults,
     ExperimentResult,
+    ExperimentResults,
+    ExperimentRunner,
+    GoalAlignmentEvaluator,
+    MetricCategory,
+    ParameterExtractionEvaluator,
+    ReasoningEfficiencyEvaluator,
+    SemanticQualityEvaluator,
+    ToolInvocationEvaluator,
+    ToolSelectionEvaluator,
+    create_default_evaluator,
+    create_evaluation_callbacks,
 )
 
 __all__ = [
+    "AgentEvaluationResult",
+    "AgentEvaluator",
     "BaseEvaluator",
     "EvaluationScore",
-    "MetricCategory",
-    "AgentEvaluationResult",
-    "SemanticQualityEvaluator",
-    "GoalAlignmentEvaluator",
-    "ReasoningEfficiencyEvaluator",
-    "ToolSelectionEvaluator",
-    "ParameterExtractionEvaluator",
-    "ToolInvocationEvaluator",
    "EvaluationTraceCallback",
-    "create_evaluation_callbacks",
-    "AgentEvaluator",
-    "create_default_evaluator",
-    "ExperimentRunner",
+    "ExperimentResult",
     "ExperimentResults",
-    "ExperimentResult"
-]
+    "ExperimentRunner",
+    "GoalAlignmentEvaluator",
+    "MetricCategory",
+    "ParameterExtractionEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "SemanticQualityEvaluator",
+    "ToolInvocationEvaluator",
+    "ToolSelectionEvaluator",
+    "create_default_evaluator",
+    "create_evaluation_callbacks",
+]

View File

@@ -1,51 +1,47 @@
+from crewai.experimental.evaluation.agent_evaluator import (
+    AgentEvaluator,
+    create_default_evaluator,
+)
 from crewai.experimental.evaluation.base_evaluator import (
+    AgentEvaluationResult,
     BaseEvaluator,
     EvaluationScore,
     MetricCategory,
-    AgentEvaluationResult
 )
-from crewai.experimental.evaluation.metrics import (
-    SemanticQualityEvaluator,
-    GoalAlignmentEvaluator,
-    ReasoningEfficiencyEvaluator,
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator
-)
 from crewai.experimental.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
-    create_evaluation_callbacks
+    create_evaluation_callbacks,
 )
-from crewai.experimental.evaluation.agent_evaluator import (
-    AgentEvaluator,
-    create_default_evaluator
-)
 from crewai.experimental.evaluation.experiment import (
-    ExperimentRunner,
+    ExperimentResult,
     ExperimentResults,
-    ExperimentResult
+    ExperimentRunner,
+)
+from crewai.experimental.evaluation.metrics import (
+    GoalAlignmentEvaluator,
+    ParameterExtractionEvaluator,
+    ReasoningEfficiencyEvaluator,
+    SemanticQualityEvaluator,
+    ToolInvocationEvaluator,
+    ToolSelectionEvaluator,
 )
 
 __all__ = [
+    "AgentEvaluationResult",
+    "AgentEvaluator",
     "BaseEvaluator",
     "EvaluationScore",
-    "MetricCategory",
-    "AgentEvaluationResult",
-    "SemanticQualityEvaluator",
-    "GoalAlignmentEvaluator",
-    "ReasoningEfficiencyEvaluator",
-    "ToolSelectionEvaluator",
-    "ParameterExtractionEvaluator",
-    "ToolInvocationEvaluator",
     "EvaluationTraceCallback",
-    "create_evaluation_callbacks",
-    "AgentEvaluator",
-    "create_default_evaluator",
-    "ExperimentRunner",
+    "ExperimentResult",
     "ExperimentResults",
-    "ExperimentResult"
+    "ExperimentRunner",
+    "GoalAlignmentEvaluator",
+    "MetricCategory",
+    "ParameterExtractionEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "SemanticQualityEvaluator",
+    "ToolInvocationEvaluator",
+    "ToolSelectionEvaluator",
+    "create_default_evaluator",
+    "create_evaluation_callbacks",
 ]

View File

@@ -1,34 +1,36 @@
 import threading
-from typing import Any, Optional
+from collections.abc import Sequence
+from typing import Any
 
-from crewai.experimental.evaluation.base_evaluator import (
-    AgentEvaluationResult,
-    AggregationStrategy,
-)
 from crewai.agent import Agent
-from crewai.task import Task
-from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.agents.agent_builder.base_agent import BaseAgent
+from crewai.events.event_bus import crewai_event_bus
 from crewai.events.types.agent_events import (
-    AgentEvaluationStartedEvent,
     AgentEvaluationCompletedEvent,
     AgentEvaluationFailedEvent,
+    AgentEvaluationStartedEvent,
+    LiteAgentExecutionCompletedEvent,
 )
-from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
-from collections.abc import Sequence
-from crewai.events.event_bus import crewai_event_bus
-from crewai.events.utils.console_formatter import ConsoleFormatter
 from crewai.events.types.task_events import TaskCompletedEvent
-from crewai.events.types.agent_events import LiteAgentExecutionCompletedEvent
+from crewai.events.utils.console_formatter import ConsoleFormatter
 from crewai.experimental.evaluation.base_evaluator import (
     AgentAggregatedEvaluationResult,
+    AgentEvaluationResult,
+    AggregationStrategy,
+    BaseEvaluator,
     EvaluationScore,
     MetricCategory,
 )
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.experimental.evaluation.evaluation_listener import (
+    create_evaluation_callbacks,
+)
+from crewai.task import Task
 
 
 class ExecutionState:
-    current_agent_id: Optional[str] = None
-    current_task_id: Optional[str] = None
+    current_agent_id: str | None = None
+    current_task_id: str | None = None
 
     def __init__(self):
         self.traces = {}
@@ -40,10 +42,10 @@ class ExecutionState:
 class AgentEvaluator:
     def __init__(
         self,
-        agents: list[Agent],
+        agents: list[Agent] | list[BaseAgent],
         evaluators: Sequence[BaseEvaluator] | None = None,
     ):
-        self.agents: list[Agent] = agents
+        self.agents: list[Agent] | list[BaseAgent] = agents
         self.evaluators: Sequence[BaseEvaluator] | None = evaluators
         self.callback = create_evaluation_callbacks()
@@ -75,7 +77,8 @@ class AgentEvaluator:
         )
 
     def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
-        assert event.task is not None
+        if event.task is None:
+            raise ValueError("TaskCompletedEvent must have a task")
         agent = event.task.agent
         if (
             agent
@@ -92,9 +95,8 @@ class AgentEvaluator:
            state.current_agent_id = str(agent.id)
            state.current_task_id = str(event.task.id)
 
-        assert (
-            state.current_agent_id is not None and state.current_task_id is not None
-        )
+        if state.current_agent_id is None or state.current_task_id is None:
+            raise ValueError("Agent ID and Task ID must not be None")
         trace = self.callback.get_trace(
             state.current_agent_id, state.current_task_id
         )
@@ -146,9 +148,8 @@ class AgentEvaluator:
         if not target_agent:
             return
 
-        assert (
-            state.current_agent_id is not None and state.current_task_id is not None
-        )
+        if state.current_agent_id is None or state.current_task_id is None:
+            raise ValueError("Agent ID and Task ID must not be None")
         trace = self.callback.get_trace(
             state.current_agent_id, state.current_task_id
         )
@@ -244,7 +245,7 @@ class AgentEvaluator:
 
     def evaluate(
         self,
-        agent: Agent,
+        agent: Agent | BaseAgent,
         execution_trace: dict[str, Any],
         final_output: Any,
         state: ExecutionState,
@@ -255,7 +256,8 @@ class AgentEvaluator:
            task_id=state.current_task_id or (str(task.id) if task else "unknown_task"),
        )
 
-        assert self.evaluators is not None
+        if self.evaluators is None:
+            raise ValueError("Evaluators must be initialized")
         task_id = str(task.id) if task else None
         for evaluator in self.evaluators:
             try:
@@ -276,7 +278,7 @@ class AgentEvaluator:
                    metric_category=evaluator.metric_category,
                    score=score,
                )
-            except Exception as e:
+            except Exception as e:  # noqa: PERF203
                self.emit_evaluation_failed_event(
                    agent_role=agent.role,
                    agent_id=str(agent.id),
@@ -284,7 +286,7 @@ class AgentEvaluator:
                    error=str(e),
                )
                self.console_formatter.print(
-                    f"Error in {evaluator.metric_category.value} evaluator: {str(e)}"
+                    f"Error in {evaluator.metric_category.value} evaluator: {e!s}"
                )
 
        return result
@@ -337,14 +339,14 @@ class AgentEvaluator:
         )
 
-def create_default_evaluator(agents: list[Agent], llm: None = None):
+def create_default_evaluator(agents: list[Agent] | list[BaseAgent], llm: None = None):
     from crewai.experimental.evaluation import (
         GoalAlignmentEvaluator,
-        SemanticQualityEvaluator,
-        ToolSelectionEvaluator,
         ParameterExtractionEvaluator,
-        ToolInvocationEvaluator,
         ReasoningEfficiencyEvaluator,
+        SemanticQualityEvaluator,
+        ToolInvocationEvaluator,
+        ToolSelectionEvaluator,
     )
 
     evaluators = [

View File

@@ -1,15 +1,17 @@
 import abc
 import enum
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from pydantic import BaseModel, Field
 
 from crewai.agent import Agent
-from crewai.task import Task
+from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.llm import BaseLLM
+from crewai.task import Task
 from crewai.utilities.llm_utils import create_llm
 
 
 class MetricCategory(enum.Enum):
     GOAL_ALIGNMENT = "goal_alignment"
     SEMANTIC_QUALITY = "semantic_quality"
@@ -19,7 +21,7 @@ class MetricCategory(enum.Enum):
     TOOL_INVOCATION = "tool_invocation"
 
     def title(self):
-        return self.value.replace('_', ' ').title()
+        return self.value.replace("_", " ").title()
 
 
 class EvaluationScore(BaseModel):
@@ -27,15 +29,13 @@ class EvaluationScore(BaseModel):
         default=5.0,
         description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
         ge=0.0,
-        le=10.0
+        le=10.0,
     )
     feedback: str = Field(
-        default="",
-        description="Detailed feedback explaining the evaluation score"
+        default="", description="Detailed feedback explaining the evaluation score"
     )
     raw_response: str | None = Field(
-        default=None,
-        description="Raw response from the evaluator (e.g., LLM)"
+        default=None, description="Raw response from the evaluator (e.g., LLM)"
     )
 
     def __str__(self) -> str:
@@ -56,8 +56,8 @@ class BaseEvaluator(abc.ABC):
     @abc.abstractmethod
     def evaluate(
         self,
-        agent: Agent,
-        execution_trace: Dict[str, Any],
+        agent: Agent | BaseAgent,
+        execution_trace: dict[str, Any],
         final_output: Any,
         task: Task | None = None,
     ) -> EvaluationScore:
@@ -67,9 +67,8 @@ class BaseEvaluator(abc.ABC):
 class AgentEvaluationResult(BaseModel):
     agent_id: str = Field(description="ID of the evaluated agent")
     task_id: str = Field(description="ID of the task that was executed")
-    metrics: Dict[MetricCategory, EvaluationScore] = Field(
-        default_factory=dict,
-        description="Evaluation scores for each metric category"
+    metrics: dict[MetricCategory, EvaluationScore] = Field(
+        default_factory=dict, description="Evaluation scores for each metric category"
     )
@@ -81,33 +80,23 @@ class AggregationStrategy(Enum):
 class AgentAggregatedEvaluationResult(BaseModel):
-    agent_id: str = Field(
-        default="",
-        description="ID of the agent"
-    )
-    agent_role: str = Field(
-        default="",
-        description="Role of the agent"
-    )
+    agent_id: str = Field(default="", description="ID of the agent")
+    agent_role: str = Field(default="", description="Role of the agent")
     task_count: int = Field(
-        default=0,
-        description="Number of tasks included in this aggregation"
+        default=0, description="Number of tasks included in this aggregation"
     )
     aggregation_strategy: AggregationStrategy = Field(
         default=AggregationStrategy.SIMPLE_AVERAGE,
-        description="Strategy used for aggregation"
+        description="Strategy used for aggregation",
     )
-    metrics: Dict[MetricCategory, EvaluationScore] = Field(
-        default_factory=dict,
-        description="Aggregated metrics across all tasks"
+    metrics: dict[MetricCategory, EvaluationScore] = Field(
+        default_factory=dict, description="Aggregated metrics across all tasks"
     )
-    task_results: List[str] = Field(
-        default_factory=list,
-        description="IDs of tasks included in this aggregation"
+    task_results: list[str] = Field(
+        default_factory=list, description="IDs of tasks included in this aggregation"
     )
-    overall_score: Optional[float] = Field(
-        default=None,
-        description="Overall score for this agent"
+    overall_score: float | None = Field(
+        default=None, description="Overall score for this agent"
     )
 
     def __str__(self) -> str:
@@ -119,7 +108,7 @@ class AgentAggregatedEvaluationResult(BaseModel):
             result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
             if score.feedback:
-                detailed_feedback = "\n  ".join(score.feedback.split('\n'))
+                detailed_feedback = "\n  ".join(score.feedback.split("\n"))
                 result += f"  {detailed_feedback}\n"
         return result

View File

@@ -1,16 +1,18 @@
 from collections import defaultdict
-from typing import Dict, Any, List
-from rich.table import Table
-from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
+from typing import Any
+
+from rich.box import HEAVY_EDGE, ROUNDED
+from rich.table import Table
 
+from crewai.events.utils.console_formatter import ConsoleFormatter
 from crewai.experimental.evaluation.base_evaluator import (
     AgentAggregatedEvaluationResult,
-    AggregationStrategy,
     AgentEvaluationResult,
+    AggregationStrategy,
+    EvaluationScore,
     MetricCategory,
 )
-from crewai.experimental.evaluation import EvaluationScore
-from crewai.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm
@@ -19,7 +21,7 @@ class EvaluationDisplayFormatter:
         self.console_formatter = ConsoleFormatter()
 
     def display_evaluation_with_feedback(
-        self, iterations_results: Dict[int, Dict[str, List[Any]]]
+        self, iterations_results: dict[int, dict[str, list[Any]]]
     ):
         if not iterations_results:
             self.console_formatter.print(
@@ -99,7 +101,7 @@ class EvaluationDisplayFormatter:
     def display_summary_results(
         self,
-        iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]],
+        iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]],
     ):
         if not iterations_results:
             self.console_formatter.print(
@@ -280,7 +282,7 @@ class EvaluationDisplayFormatter:
                 feedback_summary = feedbacks[0]
 
             aggregated_metrics[category] = EvaluationScore(
-                score=avg_score, feedback=feedback_summary
+                score=avg_score, feedback=feedback_summary or ""
             )
 
         overall_score = None
@@ -304,25 +306,25 @@ class EvaluationDisplayFormatter:
         self,
         agent_role: str,
         metric: str,
-        feedbacks: List[str],
-        scores: List[float | None],
+        feedbacks: list[str],
+        scores: list[float | None],
         strategy: AggregationStrategy,
     ) -> str:
         if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
             return "\n\n".join(
-                [f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]
+                [f"Feedback {i + 1}: {fb}" for i, fb in enumerate(feedbacks)]
             )
 
         try:
             llm = create_llm()
             formatted_feedbacks = []
-            for i, (feedback, score) in enumerate(zip(feedbacks, scores)):
+            for i, (feedback, score) in enumerate(zip(feedbacks, scores, strict=False)):
                 if len(feedback) > 500:
                     feedback = feedback[:500] + "..."
                 score_text = f"{score:.1f}" if score is not None else "N/A"
                 formatted_feedbacks.append(
-                    f"Feedback #{i+1} (Score: {score_text}):\n{feedback}"
+                    f"Feedback #{i + 1} (Score: {score_text}):\n{feedback}"
                 )
 
             all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks)
@@ -365,10 +367,9 @@ class EvaluationDisplayFormatter:
                 """,
                 },
             ]
 
-            assert llm is not None
-            response = llm.call(prompt)
-
-            return response
+            if llm is None:
+                raise ValueError("LLM must be initialized")
+            return llm.call(prompt)
         except Exception:
             return "Synthesized from multiple tasks: " + "\n\n".join(

View File

@@ -1,26 +1,25 @@
-from datetime import datetime
-from typing import Any, Dict, Optional
 from collections.abc import Sequence
+from datetime import datetime
+from typing import Any
 
-from crewai.agent import Agent
-from crewai.task import Task
+from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.events.base_event_listener import BaseEventListener
 from crewai.events.event_bus import CrewAIEventsBus
 from crewai.events.types.agent_events import (
-    AgentExecutionStartedEvent,
     AgentExecutionCompletedEvent,
-    LiteAgentExecutionStartedEvent,
+    AgentExecutionStartedEvent,
     LiteAgentExecutionCompletedEvent,
+    LiteAgentExecutionStartedEvent,
 )
+from crewai.events.types.llm_events import LLMCallCompletedEvent, LLMCallStartedEvent
 from crewai.events.types.tool_usage_events import (
-    ToolUsageFinishedEvent,
-    ToolUsageErrorEvent,
     ToolExecutionErrorEvent,
     ToolSelectionErrorEvent,
+    ToolUsageErrorEvent,
+    ToolUsageFinishedEvent,
     ToolValidateInputErrorEvent,
 )
-from crewai.events.types.llm_events import LLMCallStartedEvent, LLMCallCompletedEvent
+from crewai.task import Task
 
 
 class EvaluationTraceCallback(BaseEventListener):
@@ -136,7 +135,7 @@ class EvaluationTraceCallback(BaseEventListener):
     def _init_trace(self, trace_key: str, **kwargs: Any):
         self.traces[trace_key] = kwargs
 
-    def on_agent_start(self, agent: Agent, task: Task):
+    def on_agent_start(self, agent: BaseAgent, task: Task):
         self.current_agent_id = agent.id
         self.current_task_id = task.id
@@ -151,7 +150,7 @@ class EvaluationTraceCallback(BaseEventListener):
             final_output=None,
         )
 
-    def on_agent_finish(self, agent: Agent, task: Task, output: Any):
+    def on_agent_finish(self, agent: BaseAgent, task: Task, output: Any):
         trace_key = f"{agent.id}_{task.id}"
         if trace_key in self.traces:
             self.traces[trace_key]["final_output"] = output
@@ -253,7 +252,7 @@ class EvaluationTraceCallback(BaseEventListener):
         if hasattr(self, "current_llm_call"):
             self.current_llm_call = {}
 
-    def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]:
+    def get_trace(self, agent_id: str, task_id: str) -> dict[str, Any] | None:
         trace_key = f"{agent_id}_{task_id}"
         return self.traces.get(trace_key)

View File

@@ -1,8 +1,7 @@
+from crewai.experimental.evaluation.experiment.result import (
+    ExperimentResult,
+    ExperimentResults,
+)
 from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
-from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
 
-__all__ = [
-    "ExperimentRunner",
-    "ExperimentResults",
-    "ExperimentResult"
-]
+__all__ = ["ExperimentResult", "ExperimentResults", "ExperimentRunner"]

View File

@@ -2,45 +2,60 @@ import json
 import os
 from datetime import datetime, timezone
 from typing import Any
 
 from pydantic import BaseModel
 
 
 class ExperimentResult(BaseModel):
     identifier: str
     inputs: dict[str, Any]
-    score: int | dict[str, int | float]
-    expected_score: int | dict[str, int | float]
+    score: float | dict[str, float]
+    expected_score: float | dict[str, float]
     passed: bool
     agent_evaluations: dict[str, Any] | None = None
 
 
 class ExperimentResults:
-    def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
+    def __init__(
+        self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None
+    ):
         self.results = results
         self.metadata = metadata or {}
         self.timestamp = datetime.now(timezone.utc)
-        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
+        from crewai.experimental.evaluation.experiment.result_display import (
+            ExperimentResultsDisplay,
+        )
 
         self.display = ExperimentResultsDisplay()
 
     def to_json(self, filepath: str | None = None) -> dict[str, Any]:
         data = {
             "timestamp": self.timestamp.isoformat(),
             "metadata": self.metadata,
-            "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
+            "results": [
+                r.model_dump(exclude={"agent_evaluations"}) for r in self.results
+            ],
         }
 
         if filepath:
-            with open(filepath, 'w') as f:
+            with open(filepath, "w") as f:
                 json.dump(data, f, indent=2)
             self.display.console.print(f"[green]Results saved to {filepath}[/green]")
 
         return data
 
-    def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
+    def compare_with_baseline(
+        self,
+        baseline_filepath: str,
+        save_current: bool = True,
+        print_summary: bool = False,
+    ) -> dict[str, Any]:
         baseline_runs = []
 
         if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
             try:
-                with open(baseline_filepath, 'r') as f:
+                with open(baseline_filepath, "r") as f:
                     baseline_data = json.load(f)
 
                 if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
@@ -48,14 +63,18 @@ class ExperimentResults:
                 elif isinstance(baseline_data, list):
                     baseline_runs = baseline_data
             except (json.JSONDecodeError, FileNotFoundError) as e:
-                self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")
+                self.display.console.print(
+                    f"[yellow]Warning: Could not load baseline file: {e!s}[/yellow]"
+                )
 
         if not baseline_runs:
             if save_current:
                 current_data = self.to_json()
-                with open(baseline_filepath, 'w') as f:
+                with open(baseline_filepath, "w") as f:
                     json.dump([current_data], f, indent=2)
-                self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
+                self.display.console.print(
+                    f"[green]Saved current results as new baseline to {baseline_filepath}[/green]"
+                )
             return {"is_baseline": True, "changes": {}}
 
         baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
@@ -69,9 +88,11 @@ class ExperimentResults:
         if save_current:
             current_data = self.to_json()
             baseline_runs.append(current_data)
-            with open(baseline_filepath, 'w') as f:
+            with open(baseline_filepath, "w") as f:
                 json.dump(baseline_runs, f, indent=2)
-            self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")
+            self.display.console.print(
+                f"[green]Added current results to baseline file {baseline_filepath}[/green]"
+            )
 
         return comparison
@@ -118,5 +139,5 @@ class ExperimentResults:
             "new_tests": new_tests,
             "missing_tests": missing_tests,
             "total_compared": len(improved) + len(regressed) + len(unchanged),
-            "baseline_timestamp": baseline_run.get("timestamp", "unknown")
+            "baseline_timestamp": baseline_run.get("timestamp", "unknown"),
         }

View File

@@ -1,9 +1,12 @@
-from typing import Dict, Any
+from typing import Any
 
 from rich.console import Console
-from rich.table import Table
 from rich.panel import Panel
+from rich.table import Table
 
 from crewai.experimental.evaluation.experiment.result import ExperimentResults
 
 
 class ExperimentResultsDisplay:
     def __init__(self):
         self.console = Console()
@@ -19,13 +22,19 @@ class ExperimentResultsDisplay:
         table.add_row("Total Test Cases", str(total))
         table.add_row("Passed", str(passed))
         table.add_row("Failed", str(total - passed))
-        table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")
+        table.add_row(
+            "Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
+        )
 
         self.console.print(table)
 
-    def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
-        self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
-                          expand=False))
+    def comparison_summary(self, comparison: dict[str, Any], baseline_timestamp: str):
+        self.console.print(
+            Panel(
+                f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
+                expand=False,
+            )
+        )
 
         table = Table(title="Results Comparison")
         table.add_column("Metric", style="cyan")
@@ -34,7 +43,9 @@ class ExperimentResultsDisplay:
         improved = comparison.get("improved", [])
         if improved:
-            details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
+            details = ", ".join(
+                [f"{test_identifier}" for test_identifier in improved[:3]]
+            )
             if len(improved) > 3:
                 details += f" and {len(improved) - 3} more"
             table.add_row("✅ Improved", str(len(improved)), details)
@@ -43,7 +54,9 @@ class ExperimentResultsDisplay:
         regressed = comparison.get("regressed", [])
         if regressed:
-            details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
+            details = ", ".join(
+                [f"{test_identifier}" for test_identifier in regressed[:3]]
+            )
             if len(regressed) > 3:
                 details += f" and {len(regressed) - 3} more"
             table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
@@ -58,13 +71,13 @@ class ExperimentResultsDisplay:
             details = ", ".join(new_tests[:3])
             if len(new_tests) > 3:
                 details += f" and {len(new_tests) - 3} more"
-            table.add_row(" New Tests", str(len(new_tests)), details)
+            table.add_row("+ New Tests", str(len(new_tests)), details)
 
         missing_tests = comparison.get("missing_tests", [])
         if missing_tests:
             details = ", ".join(missing_tests[:3])
             if len(missing_tests) > 3:
                 details += f" and {len(missing_tests) - 3} more"
-            table.add_row(" Missing Tests", str(len(missing_tests)), details)
+            table.add_row("- Missing Tests", str(len(missing_tests)), details)
 
         self.console.print(table)

View File

@@ -2,11 +2,20 @@ from collections import defaultdict
 from hashlib import md5
 from typing import Any
 
-from crewai import Crew, Agent
+from crewai import Agent, Crew
+from crewai.agents.agent_builder.base_agent import BaseAgent
 from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
-from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
-from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
-from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
+from crewai.experimental.evaluation.evaluation_display import (
+    AgentAggregatedEvaluationResult,
+)
+from crewai.experimental.evaluation.experiment.result import (
+    ExperimentResult,
+    ExperimentResults,
+)
+from crewai.experimental.evaluation.experiment.result_display import (
+    ExperimentResultsDisplay,
+)
 
 
 class ExperimentRunner:
     def __init__(self, dataset: list[dict[str, Any]]):
@@ -14,11 +23,17 @@ class ExperimentRunner:
         self.evaluator: AgentEvaluator | None = None
         self.display = ExperimentResultsDisplay()
 
-    def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
+    def run(
+        self,
+        crew: Crew | None = None,
+        agents: list[Agent] | list[BaseAgent] | None = None,
+        print_summary: bool = False,
+    ) -> ExperimentResults:
         if crew and not agents:
             agents = crew.agents
 
-        assert agents is not None
+        if agents is None:
+            raise ValueError("Agents must be provided either directly or via a crew")
         self.evaluator = create_default_evaluator(agents=agents)
 
         results = []
@@ -35,21 +50,37 @@ class ExperimentRunner:
         return experiment_results
 
-    def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
+    def _run_test_case(
+        self,
+        test_case: dict[str, Any],
+        agents: list[Agent] | list[BaseAgent],
+        crew: Crew | None = None,
+    ) -> ExperimentResult:
         inputs = test_case["inputs"]
         expected_score = test_case["expected_score"]
-        identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
+        identifier = (
+            test_case.get("identifier")
+            or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
+        )
 
         try:
-            self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
+            self.display.console.print(
+                f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]"
+            )
             self.display.console.print("\n")
             if crew:
                 crew.kickoff(inputs=inputs)
             else:
                 for agent in agents:
-                    agent.kickoff(**inputs)
+                    if isinstance(agent, Agent):
+                        agent.kickoff(**inputs)
+                    else:
+                        raise TypeError(
+                            f"Agent {agent} is not an instance of Agent and cannot be kicked off directly"
+                        )
 
-            assert self.evaluator is not None
+            if self.evaluator is None:
+                raise ValueError("Evaluator must be initialized")
             agent_evaluations = self.evaluator.get_agent_evaluation()
             actual_score = self._extract_scores(agent_evaluations)
@@ -61,35 +92,38 @@ class ExperimentRunner:
                 score=actual_score,
                 expected_score=expected_score,
                 passed=passed,
-                agent_evaluations=agent_evaluations
+                agent_evaluations=agent_evaluations,
             )
         except Exception as e:
-            self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
+            self.display.console.print(f"[red]Error running test case: {e!s}[/red]")
             return ExperimentResult(
                 identifier=identifier,
                 inputs=inputs,
-                score=0,
+                score=0.0,
                 expected_score=expected_score,
-                passed=False
+                passed=False,
             )
 
-    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]:
+    def _extract_scores(
+        self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]
+    ) -> float | dict[str, float]:
         all_scores: dict[str, list[float]] = defaultdict(list)
         for evaluation in agent_evaluations.values():
             for metric_name, score in evaluation.metrics.items():
                 if score.score is not None:
                     all_scores[metric_name.value].append(score.score)
 
-        avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}
+        avg_scores = {m: sum(s) / len(s) for m, s in all_scores.items()}
 
         if len(avg_scores) == 1:
-            return list(avg_scores.values())[0]
+            return next(iter(avg_scores.values()))
 
         return avg_scores
 
-    def _assert_scores(self, expected: float | dict[str, float],
-                       actual: float | dict[str, float]) -> bool:
+    def _assert_scores(
+        self, expected: float | dict[str, float], actual: float | dict[str, float]
+    ) -> bool:
         """
         Compare expected and actual scores, and return whether the test case passed.
@@ -122,4 +156,4 @@ class ExperimentRunner:
             # All matching keys must have actual >= expected
             return all(actual[key] >= expected[key] for key in matching_keys)
 
         return False

View File

@@ -13,11 +13,11 @@ def extract_json_from_llm_response(text: str) -> dict[str, Any]:
     json_patterns = [
         # Standard markdown code blocks with json
-        r'```json\s*([\s\S]*?)\s*```',
+        r"```json\s*([\s\S]*?)\s*```",
         # Code blocks without language specifier
-        r'```\s*([\s\S]*?)\s*```',
+        r"```\s*([\s\S]*?)\s*```",
         # Inline code with JSON
-        r'`([{\\[].*[}\]])`',
+        r"`([{\\[].*[}\]])`",
     ]
 
     for pattern in json_patterns:
@@ -25,6 +25,6 @@ def extract_json_from_llm_response(text: str) -> dict[str, Any]:
         for match in matches:
             try:
                 return json.loads(match.strip())
-            except json.JSONDecodeError:
+            except json.JSONDecodeError:  # noqa: PERF203
                 continue
 
     raise ValueError("No valid JSON found in the response")

View File

@@ -1,26 +1,21 @@
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.experimental.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
+    ReasoningEfficiencyEvaluator,
 )
-from crewai.experimental.evaluation.metrics.tools_metrics import (
-    ToolSelectionEvaluator,
-    ParameterExtractionEvaluator,
-    ToolInvocationEvaluator
-)
-from crewai.experimental.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
 from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
+    SemanticQualityEvaluator,
+)
+from crewai.experimental.evaluation.metrics.tools_metrics import (
+    ParameterExtractionEvaluator,
+    ToolInvocationEvaluator,
+    ToolSelectionEvaluator,
 )
 
 __all__ = [
-    "ReasoningEfficiencyEvaluator",
-    "ToolSelectionEvaluator",
-    "ParameterExtractionEvaluator",
-    "ToolInvocationEvaluator",
     "GoalAlignmentEvaluator",
-    "SemanticQualityEvaluator"
-]
+    "ParameterExtractionEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "SemanticQualityEvaluator",
+    "ToolInvocationEvaluator",
+    "ToolSelectionEvaluator",
+]

View File

@@ -1,10 +1,15 @@
-from typing import Any, Dict
+from typing import Any
 
 from crewai.agent import Agent
+from crewai.agents.agent_builder.base_agent import BaseAgent
+from crewai.experimental.evaluation.base_evaluator import (
+    BaseEvaluator,
+    EvaluationScore,
+    MetricCategory,
+)
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.task import Task
-from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 
 
 class GoalAlignmentEvaluator(BaseEvaluator):
     @property
@@ -13,8 +18,8 @@ class GoalAlignmentEvaluator(BaseEvaluator):
     def evaluate(
         self,
-        agent: Agent,
-        execution_trace: Dict[str, Any],
+        agent: Agent | BaseAgent,
+        execution_trace: dict[str, Any],
         final_output: Any,
         task: Task | None = None,
     ) -> EvaluationScore:
@@ -23,7 +28,9 @@ class GoalAlignmentEvaluator(BaseEvaluator):
             task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
 
         prompt = [
-            {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
+            {
+                "role": "system",
+                "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
 
 Score the agent's goal alignment on a scale from 0-10 where:
 - 0: Complete misalignment, agent did not understand or attempt the task goal
@@ -37,8 +44,11 @@ Consider:
 4. Did the agent provide all requested information or deliverables?
 
 Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
-"""},
-            {"role": "user", "content": f"""
+""",
+            },
+            {
+                "role": "user",
+                "content": f"""
 Agent role: {agent.role}
 Agent goal: {agent.goal}
 {task_context}
@@ -47,23 +57,26 @@ Agent's final output:
 {final_output}
 
 Evaluate how well the agent's output aligns with the assigned task goal.
-"""}
+""",
+            },
         ]
 
-        assert self.llm is not None
+        if self.llm is None:
+            raise ValueError("LLM must be initialized")
         response = self.llm.call(prompt)
 
         try:
             evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
-            assert evaluation_data is not None
+            if evaluation_data is None:
+                raise ValueError("Failed to extract evaluation data from LLM response")
             return EvaluationScore(
                 score=evaluation_data.get("score", 0),
                 feedback=evaluation_data.get("feedback", response),
-                raw_response=response
+                raw_response=response,
            )
        except Exception:
            return EvaluationScore(
                score=None,
                feedback=f"Failed to parse evaluation. Raw response: {response}",
-                raw_response=response
+                raw_response=response,
            )

View File

@@ -8,18 +8,24 @@ This module provides evaluator implementations for:
import logging import logging
import re import re
from enum import Enum
from typing import Any, Dict, List, Tuple
import numpy as np
from collections.abc import Sequence from collections.abc import Sequence
from enum import Enum
from typing import Any
import numpy as np
from crewai.agent import Agent from crewai.agent import Agent
from crewai.task import Task from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput from crewai.tasks.task_output import TaskOutput
class ReasoningPatternType(Enum): class ReasoningPatternType(Enum):
EFFICIENT = "efficient" # Good reasoning flow EFFICIENT = "efficient" # Good reasoning flow
LOOP = "loop" # Agent is stuck in a loop LOOP = "loop" # Agent is stuck in a loop
@@ -35,8 +41,8 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
def evaluate( def evaluate(
self, self,
agent: Agent, agent: Agent | BaseAgent,
execution_trace: Dict[str, Any], execution_trace: dict[str, Any],
final_output: TaskOutput | str, final_output: TaskOutput | str,
task: Task | None = None, task: Task | None = None,
) -> EvaluationScore: ) -> EvaluationScore:
@@ -49,7 +55,7 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
if not llm_calls or len(llm_calls) < 2: if not llm_calls or len(llm_calls) < 2:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback="Insufficient LLM calls to evaluate reasoning efficiency." feedback="Insufficient LLM calls to evaluate reasoning efficiency.",
) )
total_calls = len(llm_calls) total_calls = len(llm_calls)
@@ -58,12 +64,16 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
time_intervals = [] time_intervals = []
has_reliable_timing = True has_reliable_timing = True
for i in range(1, len(llm_calls)): for i in range(1, len(llm_calls)):
start_time = llm_calls[i-1].get("end_time") start_time = llm_calls[i - 1].get("end_time")
end_time = llm_calls[i].get("start_time") end_time = llm_calls[i].get("start_time")
if start_time and end_time and start_time != end_time: if start_time and end_time and start_time != end_time:
try: try:
interval = end_time - start_time interval = end_time - start_time
time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0) time_intervals.append(
interval.total_seconds()
if hasattr(interval, "total_seconds")
else 0
)
except Exception: except Exception:
has_reliable_timing = False has_reliable_timing = False
else: else:
@@ -83,14 +93,22 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
if has_reliable_timing and time_intervals: if has_reliable_timing and time_intervals:
efficiency_metrics["avg_time_between_calls"] = np.mean(time_intervals) efficiency_metrics["avg_time_between_calls"] = np.mean(time_intervals)
loop_info = f"Detected {len(loop_details)} potential reasoning loops." if loop_detected else "No significant reasoning loops detected." loop_info = (
f"Detected {len(loop_details)} potential reasoning loops."
if loop_detected
else "No significant reasoning loops detected."
)
call_samples = self._get_call_samples(llm_calls) call_samples = self._get_call_samples(llm_calls)
final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output final_output = (
final_output.raw if isinstance(final_output, TaskOutput) else final_output
)
prompt = [ prompt = [
{"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process. {
"role": "system",
"content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.
Evaluate the agent's reasoning efficiency across these five key subcategories: Evaluate the agent's reasoning efficiency across these five key subcategories:
@@ -120,8 +138,11 @@ Return your evaluation as JSON with the following structure:
"feedback": string (general feedback about overall reasoning efficiency), "feedback": string (general feedback about overall reasoning efficiency),
"optimization_suggestions": string (concrete suggestions for improving reasoning efficiency), "optimization_suggestions": string (concrete suggestions for improving reasoning efficiency),
"detected_patterns": string (describe any inefficient reasoning patterns you observe) "detected_patterns": string (describe any inefficient reasoning patterns you observe)
}"""}, }""",
{"role": "user", "content": f""" },
{
"role": "user",
"content": f"""
Agent role: {agent.role} Agent role: {agent.role}
{task_context} {task_context}
@@ -140,10 +161,12 @@ Agent's final output:
Evaluate the reasoning efficiency of this agent based on these interaction patterns. Evaluate the reasoning efficiency of this agent based on these interaction patterns.
Identify any inefficient reasoning patterns and provide specific suggestions for optimization. Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
"""} """,
},
] ]
assert self.llm is not None if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt) response = self.llm.call(prompt)
try: try:
@@ -156,34 +179,46 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
conciseness = scores.get("conciseness", 5.0) conciseness = scores.get("conciseness", 5.0)
loop_avoidance = scores.get("loop_avoidance", 5.0) loop_avoidance = scores.get("loop_avoidance", 5.0)
overall_score = evaluation_data.get("overall_score", evaluation_data.get("score", 5.0)) overall_score = evaluation_data.get(
"overall_score", evaluation_data.get("score", 5.0)
)
feedback = evaluation_data.get("feedback", "No detailed feedback provided.") feedback = evaluation_data.get("feedback", "No detailed feedback provided.")
optimization_suggestions = evaluation_data.get("optimization_suggestions", "No specific suggestions provided.") optimization_suggestions = evaluation_data.get(
"optimization_suggestions", "No specific suggestions provided."
)
detailed_feedback = "Reasoning Efficiency Evaluation:\n" detailed_feedback = "Reasoning Efficiency Evaluation:\n"
detailed_feedback += f"• Focus: {focus}/10 - Staying on topic without tangents\n" detailed_feedback += (
detailed_feedback += f"• Progression: {progression}/10 - Building on previous thinking\n" f"• Focus: {focus}/10 - Staying on topic without tangents\n"
)
detailed_feedback += (
f"• Progression: {progression}/10 - Building on previous thinking\n"
)
detailed_feedback += f"• Decision Quality: {decision_quality}/10 - Making appropriate decisions\n" detailed_feedback += f"• Decision Quality: {decision_quality}/10 - Making appropriate decisions\n"
detailed_feedback += f"• Conciseness: {conciseness}/10 - Communicating efficiently\n" detailed_feedback += (
f"• Conciseness: {conciseness}/10 - Communicating efficiently\n"
)
detailed_feedback += f"• Loop Avoidance: {loop_avoidance}/10 - Avoiding repetitive patterns\n\n" detailed_feedback += f"• Loop Avoidance: {loop_avoidance}/10 - Avoiding repetitive patterns\n\n"
detailed_feedback += f"Feedback:\n{feedback}\n\n" detailed_feedback += f"Feedback:\n{feedback}\n\n"
detailed_feedback += f"Optimization Suggestions:\n{optimization_suggestions}" detailed_feedback += (
f"Optimization Suggestions:\n{optimization_suggestions}"
)
return EvaluationScore( return EvaluationScore(
score=float(overall_score), score=float(overall_score),
feedback=detailed_feedback, feedback=detailed_feedback,
raw_response=response raw_response=response,
) )
except Exception as e: except Exception as e:
logging.warning(f"Failed to parse reasoning efficiency evaluation: {e}") logging.warning(f"Failed to parse reasoning efficiency evaluation: {e}")
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback=f"Failed to parse reasoning efficiency evaluation. Raw response: {response[:200]}...", feedback=f"Failed to parse reasoning efficiency evaluation. Raw response: {response[:200]}...",
raw_response=response raw_response=response,
) )
def _detect_loops(self, llm_calls: List[Dict]) -> Tuple[bool, List[Dict]]: def _detect_loops(self, llm_calls: list[dict]) -> tuple[bool, list[dict]]:
loop_details = [] loop_details = []
messages = [] messages = []
@@ -193,9 +228,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
messages.append(content) messages.append(content)
elif isinstance(content, list) and len(content) > 0: elif isinstance(content, list) and len(content) > 0:
# Handle message list format # Handle message list format
for msg in content: messages.extend(
if isinstance(msg, dict) and "content" in msg: msg["content"]
messages.append(msg["content"]) for msg in content
if isinstance(msg, dict) and "content" in msg
)
# Simple n-gram based similarity detection # Simple n-gram based similarity detection
# For a more robust implementation, consider using embedding-based similarity # For a more robust implementation, consider using embedding-based similarity
@@ -205,18 +242,20 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
# A more sophisticated approach would use semantic similarity # A more sophisticated approach would use semantic similarity
similarity = self._calculate_text_similarity(messages[i], messages[j]) similarity = self._calculate_text_similarity(messages[i], messages[j])
if similarity > 0.7: # Arbitrary threshold if similarity > 0.7: # Arbitrary threshold
loop_details.append({ loop_details.append(
"first_occurrence": i, {
"second_occurrence": j, "first_occurrence": i,
"similarity": similarity, "second_occurrence": j,
"snippet": messages[i][:100] + "..." "similarity": similarity,
}) "snippet": messages[i][:100] + "...",
}
)
return len(loop_details) > 0, loop_details return len(loop_details) > 0, loop_details
def _calculate_text_similarity(self, text1: str, text2: str) -> float: def _calculate_text_similarity(self, text1: str, text2: str) -> float:
text1 = re.sub(r'\s+', ' ', text1.lower()).strip() text1 = re.sub(r"\s+", " ", text1.lower()).strip()
text2 = re.sub(r'\s+', ' ', text2.lower()).strip() text2 = re.sub(r"\s+", " ", text2.lower()).strip()
# Simple Jaccard similarity on word sets # Simple Jaccard similarity on word sets
words1 = set(text1.split()) words1 = set(text1.split())
@@ -227,7 +266,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
return intersection / union if union > 0 else 0.0 return intersection / union if union > 0 else 0.0
def _analyze_reasoning_patterns(self, llm_calls: List[Dict]) -> Dict[str, Any]: def _analyze_reasoning_patterns(self, llm_calls: list[dict]) -> dict[str, Any]:
call_lengths = [] call_lengths = []
response_times = [] response_times = []
@@ -248,8 +287,8 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if start_time and end_time: if start_time and end_time:
try: try:
response_times.append(end_time - start_time) response_times.append(end_time - start_time)
except Exception: except Exception as e:
pass logging.debug(f"Failed to calculate response time: {e}")
avg_length = np.mean(call_lengths) if call_lengths else 0 avg_length = np.mean(call_lengths) if call_lengths else 0
std_length = np.std(call_lengths) if call_lengths else 0 std_length = np.std(call_lengths) if call_lengths else 0
@@ -267,7 +306,9 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
details = "Agent is consistently verbose across interactions." details = "Agent is consistently verbose across interactions."
elif len(llm_calls) > 10 and length_trend > 0.5: elif len(llm_calls) > 10 and length_trend > 0.5:
primary_pattern = ReasoningPatternType.INDECISIVE primary_pattern = ReasoningPatternType.INDECISIVE
details = "Agent shows signs of indecisiveness with increasing message lengths." details = (
"Agent shows signs of indecisiveness with increasing message lengths."
)
elif std_length / avg_length > 0.8: elif std_length / avg_length > 0.8:
primary_pattern = ReasoningPatternType.SCATTERED primary_pattern = ReasoningPatternType.SCATTERED
details = "Agent shows inconsistent reasoning flow with highly variable responses." details = "Agent shows inconsistent reasoning flow with highly variable responses."
@@ -279,8 +320,8 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
"avg_length": avg_length, "avg_length": avg_length,
"std_length": std_length, "std_length": std_length,
"length_trend": length_trend, "length_trend": length_trend,
"loop_score": loop_score "loop_score": loop_score,
} },
} }
def _calculate_trend(self, values: Sequence[float | int]) -> float: def _calculate_trend(self, values: Sequence[float | int]) -> float:
@@ -303,7 +344,9 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
except Exception: except Exception:
return 0.0 return 0.0
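The body of _calculate_trend is elided by this hunk; one plausible shape for it, assuming a least-squares slope normalized by the mean (an illustration only, not necessarily what the module implements):

from collections.abc import Sequence
import numpy as np

def calculate_trend(values: Sequence[float | int]) -> float:
    # Hypothetical sketch: slope of a least-squares line over the values,
    # scaled by the mean so the result is comparable across magnitudes.
    if len(values) < 2:
        return 0.0
    x = np.arange(len(values))
    slope = np.polyfit(x, np.asarray(values, dtype=float), 1)[0]
    mean = float(np.mean(values))
    return float(slope / mean) if mean else 0.0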
def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float: def _calculate_loop_likelihood(
self, call_lengths: Sequence[float], response_times: Sequence[float]
) -> float:
if not call_lengths or len(call_lengths) < 3: if not call_lengths or len(call_lengths) < 3:
return 0.0 return 0.0
@@ -312,7 +355,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if len(call_lengths) >= 4: if len(call_lengths) >= 4:
repeated_lengths = 0 repeated_lengths = 0
for i in range(len(call_lengths) - 2): for i in range(len(call_lengths) - 2):
ratio = call_lengths[i] / call_lengths[i + 2] if call_lengths[i + 2] > 0 else 0 ratio = (
call_lengths[i] / call_lengths[i + 2]
if call_lengths[i + 2] > 0
else 0
)
if 0.85 <= ratio <= 1.15: if 0.85 <= ratio <= 1.15:
repeated_lengths += 1 repeated_lengths += 1
@@ -324,21 +371,27 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
std_time = np.std(response_times) std_time = np.std(response_times)
mean_time = np.mean(response_times) mean_time = np.mean(response_times)
if mean_time > 0: if mean_time > 0:
time_consistency = 1.0 - (std_time / mean_time) time_consistency = 1.0 - (float(std_time) / float(mean_time))
indicators.append(max(0, time_consistency - 0.3) * 1.5) indicators.append(max(0.0, float(time_consistency - 0.3)) * 1.5)
except Exception: except Exception as e:
pass logging.debug(f"Time consistency calculation failed: {e}")
return np.mean(indicators) if indicators else 0.0 return float(np.mean(indicators)) if indicators else 0.0
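For reference, the length-repetition indicator visible above compares each call length with the one two positions later and counts ratios near 1.0; a small worked example with invented lengths:

call_lengths = [120.0, 300.0, 118.0, 305.0, 121.0]  # invented sample data

repeated_lengths = 0
for i in range(len(call_lengths) - 2):
    ratio = call_lengths[i] / call_lengths[i + 2] if call_lengths[i + 2] > 0 else 0
    if 0.85 <= ratio <= 1.15:
        repeated_lengths += 1

# repeated_lengths == 3: the lengths alternate, hinting at a possible loop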
def _get_call_samples(self, llm_calls: List[Dict]) -> str: def _get_call_samples(self, llm_calls: list[dict]) -> str:
samples = [] samples = []
if len(llm_calls) <= 6: if len(llm_calls) <= 6:
sample_indices = list(range(len(llm_calls))) sample_indices = list(range(len(llm_calls)))
else: else:
sample_indices = [0, 1, len(llm_calls) // 2 - 1, len(llm_calls) // 2, sample_indices = [
len(llm_calls) - 2, len(llm_calls) - 1] 0,
1,
len(llm_calls) // 2 - 1,
len(llm_calls) // 2,
len(llm_calls) - 2,
len(llm_calls) - 1,
]
for idx in sample_indices: for idx in sample_indices:
call = llm_calls[idx] call = llm_calls[idx]
@@ -347,10 +400,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if isinstance(content, str): if isinstance(content, str):
sample = content sample = content
elif isinstance(content, list) and len(content) > 0: elif isinstance(content, list) and len(content) > 0:
sample_parts = [] sample_parts = [
for msg in content: msg["content"]
if isinstance(msg, dict) and "content" in msg: for msg in content
sample_parts.append(msg["content"]) if isinstance(msg, dict) and "content" in msg
]
sample = "\n".join(sample_parts) sample = "\n".join(sample_parts)
else: else:
sample = str(content) sample = str(content)
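The reshaped comprehension above pulls message text out of chat-style payloads and skips anything that is not a dict with a "content" key; with an invented mixed payload it behaves like this:

content = [
    {"role": "system", "content": "You are a researcher."},
    {"role": "user", "content": "Summarize the findings."},
    "not a dict",  # skipped: no "content" key to read
]
sample_parts = [
    msg["content"] for msg in content if isinstance(msg, dict) and "content" in msg
]
sample = "\n".join(sample_parts)
# sample == "You are a researcher.\nSummarize the findings."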

View File

@@ -1,10 +1,15 @@
from typing import Any, Dict from typing import Any
from crewai.agent import Agent from crewai.agent import Agent
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task from crewai.task import Task
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
class SemanticQualityEvaluator(BaseEvaluator): class SemanticQualityEvaluator(BaseEvaluator):
@property @property
@@ -13,8 +18,8 @@ class SemanticQualityEvaluator(BaseEvaluator):
def evaluate( def evaluate(
self, self,
agent: Agent, agent: Agent | BaseAgent,
execution_trace: Dict[str, Any], execution_trace: dict[str, Any],
final_output: Any, final_output: Any,
task: Task | None = None, task: Task | None = None,
) -> EvaluationScore: ) -> EvaluationScore:
@@ -22,7 +27,9 @@ class SemanticQualityEvaluator(BaseEvaluator):
if task is not None: if task is not None:
task_context = f"Task description: {task.description}" task_context = f"Task description: {task.description}"
prompt = [ prompt = [
{"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output. {
"role": "system",
"content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
Score the semantic quality on a scale from 0-10 where: Score the semantic quality on a scale from 0-10 where:
- 0: Completely incoherent, confusing, or logically flawed output - 0: Completely incoherent, confusing, or logically flawed output
@@ -37,8 +44,11 @@ Consider:
5. Is the output free from contradictions and logical fallacies? 5. Is the output free from contradictions and logical fallacies?
Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string). Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
"""}, """,
{"role": "user", "content": f""" },
{
"role": "user",
"content": f"""
Agent role: {agent.role} Agent role: {agent.role}
{task_context} {task_context}
@@ -46,23 +56,28 @@ Agent's final output:
{final_output} {final_output}
Evaluate the semantic quality and reasoning of this output. Evaluate the semantic quality and reasoning of this output.
"""} """,
},
] ]
assert self.llm is not None if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt) response = self.llm.call(prompt)
try: try:
evaluation_data: dict[str, Any] = extract_json_from_llm_response(response) evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
assert evaluation_data is not None if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
return EvaluationScore( return EvaluationScore(
score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None, score=float(evaluation_data["score"])
if evaluation_data.get("score") is not None
else None,
feedback=evaluation_data.get("feedback", response), feedback=evaluation_data.get("feedback", response),
raw_response=response raw_response=response,
) )
except Exception: except Exception:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback=f"Failed to parse evaluation. Raw response: {response}", feedback=f"Failed to parse evaluation. Raw response: {response}",
raw_response=response raw_response=response,
) )

View File

@@ -1,22 +1,26 @@
import json import json
from typing import Dict, Any from typing import Any
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.agent import Agent from crewai.agent import Agent
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task from crewai.task import Task
class ToolSelectionEvaluator(BaseEvaluator): class ToolSelectionEvaluator(BaseEvaluator):
@property @property
def metric_category(self) -> MetricCategory: def metric_category(self) -> MetricCategory:
return MetricCategory.TOOL_SELECTION return MetricCategory.TOOL_SELECTION
def evaluate( def evaluate(
self, self,
agent: Agent, agent: Agent | BaseAgent,
execution_trace: Dict[str, Any], execution_trace: dict[str, Any],
final_output: str, final_output: str,
task: Task | None = None, task: Task | None = None,
) -> EvaluationScore: ) -> EvaluationScore:
@@ -26,19 +30,18 @@ class ToolSelectionEvaluator(BaseEvaluator):
tool_uses = execution_trace.get("tool_uses", []) tool_uses = execution_trace.get("tool_uses", [])
tool_count = len(tool_uses) tool_count = len(tool_uses)
unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses]) unique_tool_types = set(
[tool.get("tool", "Unknown tool") for tool in tool_uses]
)
if tool_count == 0: if tool_count == 0:
if not agent.tools: if not agent.tools:
return EvaluationScore( return EvaluationScore(
score=None, score=None, feedback="Agent had no tools available to use."
feedback="Agent had no tools available to use."
)
else:
return EvaluationScore(
score=None,
feedback="Agent had tools available but didn't use any."
) )
return EvaluationScore(
score=None, feedback="Agent had tools available but didn't use any."
)
available_tools_info = "" available_tools_info = ""
if agent.tools: if agent.tools:
@@ -52,7 +55,9 @@ class ToolSelectionEvaluator(BaseEvaluator):
tool_types_summary += f"- {tool_type}\n" tool_types_summary += f"- {tool_type}\n"
prompt = [ prompt = [
{"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task. {
"role": "system",
"content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
You must evaluate based on these 2 criteria: You must evaluate based on these 2 criteria:
1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals? 1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals?
@@ -73,8 +78,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10) - overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on tool selection decisions from available tools) - feedback: string (focused ONLY on tool selection decisions from available tools)
- improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools) - improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools)
"""}, """,
{"role": "user", "content": f""" },
{
"role": "user",
"content": f"""
Agent role: {agent.role} Agent role: {agent.role}
{task_context} {task_context}
@@ -89,14 +97,17 @@ IMPORTANT:
- ONLY evaluate selection from tools listed as available - ONLY evaluate selection from tools listed as available
- DO NOT suggest new tools that aren't in the available tools list - DO NOT suggest new tools that aren't in the available tools list
- DO NOT evaluate tool usage or results - DO NOT evaluate tool usage or results
"""} """,
},
] ]
assert self.llm is not None if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt) response = self.llm.call(prompt)
try: try:
evaluation_data = extract_json_from_llm_response(response) evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {}) scores = evaluation_data.get("scores", {})
relevance = scores.get("relevance", 5.0) relevance = scores.get("relevance", 5.0)
@@ -105,22 +116,24 @@ IMPORTANT:
feedback = "Tool Selection Evaluation:\n" feedback = "Tool Selection Evaluation:\n"
feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n" feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n"
feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n" feedback += (
f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
)
if "improvement_suggestions" in evaluation_data: if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else: else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.") feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore( return EvaluationScore(
score=overall_score, score=overall_score, feedback=feedback, raw_response=response
feedback=feedback,
raw_response=response
) )
except Exception as e: except Exception as e:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback=f"Error evaluating tool selection: {e}", feedback=f"Error evaluating tool selection: {e}",
raw_response=response raw_response=response,
) )
@@ -131,8 +144,8 @@ class ParameterExtractionEvaluator(BaseEvaluator):
def evaluate( def evaluate(
self, self,
agent: Agent, agent: Agent | BaseAgent,
execution_trace: Dict[str, Any], execution_trace: dict[str, Any],
final_output: str, final_output: str,
task: Task | None = None, task: Task | None = None,
) -> EvaluationScore: ) -> EvaluationScore:
@@ -145,19 +158,23 @@ class ParameterExtractionEvaluator(BaseEvaluator):
if tool_count == 0: if tool_count == 0:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback="No tool usage detected. Cannot evaluate parameter extraction." feedback="No tool usage detected. Cannot evaluate parameter extraction.",
) )
validation_errors = [] validation_errors = [
for tool_use in tool_uses: {
if not tool_use.get("success", True) and tool_use.get("error_type") == "validation_error": "tool": tool_use.get("tool", "Unknown tool"),
validation_errors.append({ "error": tool_use.get("result"),
"tool": tool_use.get("tool", "Unknown tool"), "args": tool_use.get("args", {}),
"error": tool_use.get("result"), }
"args": tool_use.get("args", {}) for tool_use in tool_uses
}) if not tool_use.get("success", True)
and tool_use.get("error_type") == "validation_error"
]
validation_error_rate = len(validation_errors) / tool_count if tool_count > 0 else 0 validation_error_rate = (
len(validation_errors) / tool_count if tool_count > 0 else 0
)
param_samples = [] param_samples = []
for i, tool_use in enumerate(tool_uses[:5]): for i, tool_use in enumerate(tool_uses[:5]):
@@ -168,7 +185,7 @@ class ParameterExtractionEvaluator(BaseEvaluator):
is_validation_error = error_type == "validation_error" is_validation_error = error_type == "validation_error"
sample = f"Tool use #{i+1} - {tool_name}:\n" sample = f"Tool use #{i + 1} - {tool_name}:\n"
sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
sample += f"- Success: {'No' if not success else 'Yes'}" sample += f"- Success: {'No' if not success else 'Yes'}"
@@ -187,13 +204,17 @@ class ParameterExtractionEvaluator(BaseEvaluator):
tool_name = err.get("tool", "Unknown tool") tool_name = err.get("tool", "Unknown tool")
error_msg = err.get("error", "Unknown error") error_msg = err.get("error", "Unknown error")
args = err.get("args", {}) args = err.get("args", {})
validation_errors_info += f"\nValidation Error #{i+1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}" validation_errors_info += f"\nValidation Error #{i + 1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}"
if len(validation_errors) > 3: if len(validation_errors) > 3:
validation_errors_info += f"\n...and {len(validation_errors) - 3} more validation errors." validation_errors_info += (
f"\n...and {len(validation_errors) - 3} more validation errors."
)
param_samples_text = "\n\n".join(param_samples) param_samples_text = "\n\n".join(param_samples)
prompt = [ prompt = [
{"role": "system", "content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls. {
"role": "system",
"content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls.
Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked. Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked.
@@ -216,8 +237,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10) - overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on parameter value extraction quality) - feedback: string (focused ONLY on parameter value extraction quality)
- improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction) - improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction)
"""}, """,
{"role": "user", "content": f""" },
{
"role": "user",
"content": f"""
Agent role: {agent.role} Agent role: {agent.role}
{task_context} {task_context}
@@ -226,15 +250,18 @@ Parameter extraction examples:
{validation_errors_info} {validation_errors_info}
Evaluate the quality of the agent's parameter extraction for this task. Evaluate the quality of the agent's parameter extraction for this task.
"""} """,
},
] ]
assert self.llm is not None if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt) response = self.llm.call(prompt)
try: try:
evaluation_data = extract_json_from_llm_response(response) evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {}) scores = evaluation_data.get("scores", {})
accuracy = scores.get("accuracy", 5.0) accuracy = scores.get("accuracy", 5.0)
@@ -251,18 +278,18 @@ Evaluate the quality of the agent's parameter extraction for this task.
if "improvement_suggestions" in evaluation_data: if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else: else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.") feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore( return EvaluationScore(
score=overall_score, score=overall_score, feedback=feedback, raw_response=response
feedback=feedback,
raw_response=response
) )
except Exception as e: except Exception as e:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback=f"Error evaluating parameter extraction: {e}", feedback=f"Error evaluating parameter extraction: {e}",
raw_response=response raw_response=response,
) )
@@ -273,8 +300,8 @@ class ToolInvocationEvaluator(BaseEvaluator):
def evaluate( def evaluate(
self, self,
agent: Agent, agent: Agent | BaseAgent,
execution_trace: Dict[str, Any], execution_trace: dict[str, Any],
final_output: str, final_output: str,
task: Task | None = None, task: Task | None = None,
) -> EvaluationScore: ) -> EvaluationScore:
@@ -288,7 +315,7 @@ class ToolInvocationEvaluator(BaseEvaluator):
if tool_count == 0: if tool_count == 0:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback="No tool usage detected. Cannot evaluate tool invocation." feedback="No tool usage detected. Cannot evaluate tool invocation.",
) )
for tool_use in tool_uses: for tool_use in tool_uses:
@@ -296,7 +323,7 @@ class ToolInvocationEvaluator(BaseEvaluator):
error_info = { error_info = {
"tool": tool_use.get("tool", "Unknown tool"), "tool": tool_use.get("tool", "Unknown tool"),
"error": tool_use.get("result"), "error": tool_use.get("result"),
"error_type": tool_use.get("error_type", "unknown_error") "error_type": tool_use.get("error_type", "unknown_error"),
} }
tool_errors.append(error_info) tool_errors.append(error_info)
@@ -315,9 +342,11 @@ class ToolInvocationEvaluator(BaseEvaluator):
tool_args = tool_use.get("args", {}) tool_args = tool_use.get("args", {})
success = tool_use.get("success", True) and not tool_use.get("error", False) success = tool_use.get("success", True) and not tool_use.get("error", False)
error_type = tool_use.get("error_type", "") if not success else "" error_type = tool_use.get("error_type", "") if not success else ""
error_msg = tool_use.get("result", "No error") if not success else "No error" error_msg = (
tool_use.get("result", "No error") if not success else "No error"
)
sample = f"Tool invocation #{i+1}:\n" sample = f"Tool invocation #{i + 1}:\n"
sample += f"- Tool: {tool_name}\n" sample += f"- Tool: {tool_name}\n"
sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
sample += f"- Success: {'No' if not success else 'Yes'}\n" sample += f"- Success: {'No' if not success else 'Yes'}\n"
@@ -330,11 +359,13 @@ class ToolInvocationEvaluator(BaseEvaluator):
if error_types: if error_types:
error_type_summary = "Error type breakdown:\n" error_type_summary = "Error type breakdown:\n"
for error_type, count in error_types.items(): for error_type, count in error_types.items():
error_type_summary += f"- {error_type}: {count} occurrences ({(count/tool_count):.1%})\n" error_type_summary += f"- {error_type}: {count} occurrences ({(count / tool_count):.1%})\n"
invocation_samples_text = "\n\n".join(invocation_samples) invocation_samples_text = "\n\n".join(invocation_samples)
prompt = [ prompt = [
{"role": "system", "content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED. {
"role": "system",
"content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED.
Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used. Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used.
@@ -359,8 +390,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10) - overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on structural aspects of tool invocation) - feedback: string (focused ONLY on structural aspects of tool invocation)
- improvement_suggestions: string (concrete suggestions for better structuring of tool calls) - improvement_suggestions: string (concrete suggestions for better structuring of tool calls)
"""}, """,
{"role": "user", "content": f""" },
{
"role": "user",
"content": f"""
Agent role: {agent.role} Agent role: {agent.role}
{task_context} {task_context}
@@ -371,15 +405,18 @@ Tool error rate: {error_rate:.2%} ({len(tool_errors)} errors out of {tool_count}
{error_type_summary} {error_type_summary}
Evaluate the quality of the agent's tool invocation structure during this task. Evaluate the quality of the agent's tool invocation structure during this task.
"""} """,
},
] ]
assert self.llm is not None if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt) response = self.llm.call(prompt)
try: try:
evaluation_data = extract_json_from_llm_response(response) evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {}) scores = evaluation_data.get("scores", {})
structure = scores.get("structure", 5.0) structure = scores.get("structure", 5.0)
error_handling = scores.get("error_handling", 5.0) error_handling = scores.get("error_handling", 5.0)
@@ -388,23 +425,25 @@ Evaluate the quality of the agent's tool invocation structure during this task.
overall_score = float(evaluation_data.get("overall_score", 5.0)) overall_score = float(evaluation_data.get("overall_score", 5.0))
feedback = "Tool Invocation Evaluation:\n" feedback = "Tool Invocation Evaluation:\n"
feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n" feedback += (
f"• Structure: {structure}/10 - Following proper syntax and format\n"
)
feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n" feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n"
feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n" feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n"
if "improvement_suggestions" in evaluation_data: if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else: else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.") feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore( return EvaluationScore(
score=overall_score, score=overall_score, feedback=feedback, raw_response=response
feedback=feedback,
raw_response=response
) )
except Exception as e: except Exception as e:
return EvaluationScore( return EvaluationScore(
score=None, score=None,
feedback=f"Error evaluating tool invocation: {e}", feedback=f"Error evaluating tool invocation: {e}",
raw_response=response raw_response=response,
) )
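For reference, the error rate and error-type breakdown reported by this evaluator can be reproduced from tool-use records shaped like those in the execution trace; the record keys below follow the diff, while the sample values are invented:

from collections import Counter

tool_uses = [
    {"tool": "search", "success": True},
    {"tool": "search", "success": False, "error_type": "validation_error"},
    {"tool": "fetch", "success": False, "error_type": "timeout"},
]

tool_errors = [t for t in tool_uses if not t.get("success", True)]
error_rate = len(tool_errors) / len(tool_uses)  # 2/3 for this sample
error_types = Counter(t.get("error_type", "unknown_error") for t in tool_errors)
print(f"Tool error rate: {error_rate:.2%}", dict(error_types))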

View File

@@ -1,12 +1,21 @@
import inspect import inspect
import warnings
from typing_extensions import Any from typing_extensions import Any
import warnings
from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
from crewai import Crew, Agent
def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None: from crewai import Agent, Crew
failed_tests = [result for result in experiment_results.results if not result.passed] from crewai.experimental.evaluation.experiment import (
ExperimentResults,
ExperimentRunner,
)
def assert_experiment_successfully(
experiment_results: ExperimentResults, baseline_filepath: str | None = None
) -> None:
failed_tests = [
result for result in experiment_results.results if not result.passed
]
if failed_tests: if failed_tests:
detailed_failures: list[str] = [] detailed_failures: list[str] = []
@@ -14,39 +23,54 @@ def assert_experiment_successfully(experiment_results: ExperimentResults, baseli
for result in failed_tests: for result in failed_tests:
expected = result.expected_score expected = result.expected_score
actual = result.score actual = result.score
detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}") detailed_failures.append(
f"- {result.identifier}: expected {expected}, got {actual}"
)
failure_details = "\n".join(detailed_failures) failure_details = "\n".join(detailed_failures)
raise AssertionError(f"The following test cases failed:\n{failure_details}") raise AssertionError(f"The following test cases failed:\n{failure_details}")
baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback() baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath) comparison = experiment_results.compare_with_baseline(
baseline_filepath=baseline_filepath
)
assert_experiment_no_regression(comparison) assert_experiment_no_regression(comparison)
def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None: def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
regressed = comparison_result.get("regressed", []) regressed = comparison_result.get("regressed", [])
if regressed: if regressed:
raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}") raise AssertionError(
f"Regression detected! The following tests that previously passed now fail: {regressed}"
)
missing_tests = comparison_result.get("missing_tests", []) missing_tests = comparison_result.get("missing_tests", [])
if missing_tests: if missing_tests:
warnings.warn( warnings.warn(
f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}", f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
UserWarning UserWarning,
stacklevel=2,
) )
def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
def run_experiment(
dataset: list[dict[str, Any]],
crew: Crew | None = None,
agents: list[Agent] | None = None,
verbose: bool = False,
) -> ExperimentResults:
runner = ExperimentRunner(dataset=dataset) runner = ExperimentRunner(dataset=dataset)
return runner.run(agents=agents, crew=crew, print_summary=verbose) return runner.run(agents=agents, crew=crew, print_summary=verbose)
def _get_baseline_filepath_fallback() -> str: def _get_baseline_filepath_fallback() -> str:
test_func_name = "experiment_fallback" test_func_name = "experiment_fallback"
try: try:
current_frame = inspect.currentframe() current_frame = inspect.currentframe()
if current_frame is not None: if current_frame is not None:
test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr] test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
except Exception: except Exception:
... ...
return f"{test_func_name}_results.json" return f"{test_func_name}_results.json"