From 1b6b2b36d973f5466d75adc8303e9031c4ac7cf9 Mon Sep 17 00:00:00 2001
From: Lucas Gomide
Date: Mon, 14 Jul 2025 10:06:45 -0300
Subject: [PATCH] Introduce Evaluator Experiment (#3133)

* feat: add exchanged messages in LLMCallCompletedEvent
* feat: add GoalAlignment metric for Agent evaluation
* feat: add SemanticQuality metric for Agent evaluation
* feat: add Tool Metrics for Agent evaluation
* feat: add Reasoning Metrics for Agent evaluation, still in progress
* feat: add AgentEvaluator class
  This class will evaluate Agent's results and report them to the user
* fix: do not evaluate Agent by default
  This is an experimental feature; we still need to refine it further
* test: add Agent eval tests
* fix: render all feedback per iteration
* style: resolve linter issues
* style: fix mypy issues
* fix: allow messages to be empty on LLMCallCompletedEvent
* feat: add Experiment evaluation framework with baseline comparison
* fix: reset evaluator for each experiment iteration
* fix: fix tracking of new test cases
* chore: split Experimental evaluation classes
* refactor: remove unused method
* refactor: isolate Console print in a dedicated class
* fix: make crew required to run an experiment
* fix: use a timezone-aware timestamp to define experiment result
* test: add tests for Evaluator Experiment
* style: fix linter issues
* fix: encode string before hashing
* style: resolve linter issues
* feat: add experimental folder for beta features (#3141)
* test: move tests to experimental folder
---
 src/crewai/crew.py                            |   2 +-
 src/crewai/experimental/__init__.py           |  40 ++
 .../experimental/evaluation/__init__.py       |  51 +++
 .../evaluation/agent_evaluator.py             | 182 ++++
 .../experimental/evaluation/base_evaluator.py | 125 ++++++
 .../evaluation/evaluation_display.py          | 341 +++++++++++++++
 .../evaluation/evaluation_listener.py         | 190 +++++++++
 .../evaluation/experiment/__init__.py         |   8 +
 .../evaluation/experiment/result.py           | 122 ++++++
 .../evaluation/experiment/result_display.py   |  70 +++
 .../evaluation/experiment/runner.py           | 117 +++++
 .../experimental/evaluation/json_parser.py    |  30 ++
 .../evaluation/metrics/__init__.py            |  26 ++
 .../evaluation/metrics/goal_metrics.py        |  66 +++
 .../evaluation/metrics/reasoning_metrics.py   | 355 ++++++++++++++++
 .../metrics/semantic_quality_metrics.py       |  65 +++
 .../evaluation/metrics/tools_metrics.py       | 400 ++++++++++++++++++
 .../{ => experimental}/evaluation/__init__.py |   0
 .../evaluation/metrics/__init__.py            |   0
 .../metrics/base_evaluation_metrics_test.py   |   0
 .../evaluation/metrics/test_goal_metrics.py   |   6 +-
 .../metrics/test_reasoning_metrics.py         |   6 +-
 .../metrics/test_semantic_quality_metrics.py  |   6 +-
 .../evaluation/metrics/test_tools_metrics.py  |   4 +-
 .../evaluation/test_agent_evaluator.py        |   8 +-
 .../evaluation/test_experiment_result.py      | 111 +++++
 .../evaluation/test_experiment_runner.py      | 197 +++++++++
 27 files changed, 2512 insertions(+), 16 deletions(-)
 create mode 100644 src/crewai/experimental/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/agent_evaluator.py
 create mode 100644 src/crewai/experimental/evaluation/base_evaluator.py
 create mode 100644 src/crewai/experimental/evaluation/evaluation_display.py
 create mode 100644 src/crewai/experimental/evaluation/evaluation_listener.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/__init__.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/result.py
 create mode 100644 src/crewai/experimental/evaluation/experiment/result_display.py
 create mode 100644
src/crewai/experimental/evaluation/experiment/runner.py create mode 100644 src/crewai/experimental/evaluation/json_parser.py create mode 100644 src/crewai/experimental/evaluation/metrics/__init__.py create mode 100644 src/crewai/experimental/evaluation/metrics/goal_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/reasoning_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py create mode 100644 src/crewai/experimental/evaluation/metrics/tools_metrics.py rename tests/{ => experimental}/evaluation/__init__.py (100%) rename tests/{ => experimental}/evaluation/metrics/__init__.py (100%) rename tests/{ => experimental}/evaluation/metrics/base_evaluation_metrics_test.py (100%) rename tests/{ => experimental}/evaluation/metrics/test_goal_metrics.py (88%) rename tests/{ => experimental}/evaluation/metrics/test_reasoning_metrics.py (96%) rename tests/{ => experimental}/evaluation/metrics/test_semantic_quality_metrics.py (90%) rename tests/{ => experimental}/evaluation/metrics/test_tools_metrics.py (97%) rename tests/{ => experimental}/evaluation/test_agent_evaluator.py (91%) create mode 100644 tests/experimental/evaluation/test_experiment_result.py create mode 100644 tests/experimental/evaluation/test_experiment_runner.py diff --git a/src/crewai/crew.py b/src/crewai/crew.py index 89acee505..1f02b9b06 100644 --- a/src/crewai/crew.py +++ b/src/crewai/crew.py @@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel): evaluator = CrewEvaluator(test_crew, llm_instance) if include_agent_eval: - from crewai.evaluation import create_default_evaluator + from crewai.experimental.evaluation import create_default_evaluator agent_evaluator = create_default_evaluator(crew=test_crew) for i in range(1, n_iterations + 1): diff --git a/src/crewai/experimental/__init__.py b/src/crewai/experimental/__init__.py new file mode 100644 index 000000000..67eb7847f --- /dev/null +++ b/src/crewai/experimental/__init__.py @@ -0,0 +1,40 @@ +from crewai.experimental.evaluation import ( + BaseEvaluator, + EvaluationScore, + MetricCategory, + AgentEvaluationResult, + SemanticQualityEvaluator, + GoalAlignmentEvaluator, + ReasoningEfficiencyEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator, + EvaluationTraceCallback, + create_evaluation_callbacks, + AgentEvaluator, + create_default_evaluator, + ExperimentRunner, + ExperimentResults, + ExperimentResult, +) + + +__all__ = [ + "BaseEvaluator", + "EvaluationScore", + "MetricCategory", + "AgentEvaluationResult", + "SemanticQualityEvaluator", + "GoalAlignmentEvaluator", + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "EvaluationTraceCallback", + "create_evaluation_callbacks", + "AgentEvaluator", + "create_default_evaluator", + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/__init__.py b/src/crewai/experimental/evaluation/__init__.py new file mode 100644 index 000000000..024c3aaed --- /dev/null +++ b/src/crewai/experimental/evaluation/__init__.py @@ -0,0 +1,51 @@ +from crewai.experimental.evaluation.base_evaluator import ( + BaseEvaluator, + EvaluationScore, + MetricCategory, + AgentEvaluationResult +) + +from crewai.experimental.evaluation.metrics import ( + SemanticQualityEvaluator, + GoalAlignmentEvaluator, + ReasoningEfficiencyEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + 
ToolInvocationEvaluator +) + +from crewai.experimental.evaluation.evaluation_listener import ( + EvaluationTraceCallback, + create_evaluation_callbacks +) + +from crewai.experimental.evaluation.agent_evaluator import ( + AgentEvaluator, + create_default_evaluator +) + +from crewai.experimental.evaluation.experiment import ( + ExperimentRunner, + ExperimentResults, + ExperimentResult +) + +__all__ = [ + "BaseEvaluator", + "EvaluationScore", + "MetricCategory", + "AgentEvaluationResult", + "SemanticQualityEvaluator", + "GoalAlignmentEvaluator", + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "EvaluationTraceCallback", + "create_evaluation_callbacks", + "AgentEvaluator", + "create_default_evaluator", + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] diff --git a/src/crewai/experimental/evaluation/agent_evaluator.py b/src/crewai/experimental/evaluation/agent_evaluator.py new file mode 100644 index 000000000..3cf9583dc --- /dev/null +++ b/src/crewai/experimental/evaluation/agent_evaluator.py @@ -0,0 +1,182 @@ +from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy +from crewai.agent import Agent +from crewai.task import Task +from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter + +from typing import Any, Dict +from collections import defaultdict +from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks +from collections.abc import Sequence +from crewai.crew import Crew +from crewai.utilities.events.crewai_event_bus import crewai_event_bus +from crewai.utilities.events.utils.console_formatter import ConsoleFormatter +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult + +class AgentEvaluator: + def __init__( + self, + evaluators: Sequence[BaseEvaluator] | None = None, + crew: Crew | None = None, + ): + self.crew: Crew | None = crew + self.evaluators: Sequence[BaseEvaluator] | None = evaluators + + self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {} + if crew is not None: + assert crew and crew.agents is not None + for agent in crew.agents: + self.agent_evaluators[str(agent.id)] = self.evaluators + + self.callback = create_evaluation_callbacks() + self.console_formatter = ConsoleFormatter() + self.display_formatter = EvaluationDisplayFormatter() + + self.iteration = 1 + self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {} + + def set_iteration(self, iteration: int) -> None: + self.iteration = iteration + + def reset_iterations_results(self): + self.iterations_results = {} + + def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]: + if not self.crew: + raise ValueError("Cannot evaluate: no crew was provided to the evaluator.") + + if not self.callback: + raise ValueError("Cannot evaluate: no callback was set. 
Use set_callback() method first.") + + from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn + evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list) + + total_evals = 0 + for agent in self.crew.agents: + for task in self.crew.tasks: + if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)): + total_evals += 1 + + with Progress( + SpinnerColumn(), + TextColumn("[bold blue]{task.description}[/bold blue]"), + BarColumn(), + TextColumn("{task.percentage:.0f}% completed"), + console=self.console_formatter.console + ) as progress: + eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals) + + for agent in self.crew.agents: + evaluator = self.agent_evaluators.get(str(agent.id)) + if not evaluator: + continue + + for task in self.crew.tasks: + + if task.agent and str(task.agent.id) != str(agent.id): + continue + + trace = self.callback.get_trace(str(agent.id), str(task.id)) + if not trace: + self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]") + progress.update(eval_task, advance=1) + continue + + with crewai_event_bus.scoped_handlers(): + result = self.evaluate( + agent=agent, + task=task, + execution_trace=trace, + final_output=task.output + ) + evaluation_results[agent.role].append(result) + progress.update(eval_task, advance=1) + + self.iterations_results[self.iteration] = evaluation_results + return evaluation_results + + def get_evaluation_results(self): + if self.iteration in self.iterations_results: + return self.iterations_results[self.iteration] + + return self.evaluate_current_iteration() + + def display_results_with_iterations(self): + self.display_formatter.display_summary_results(self.iterations_results) + + def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]: + agent_results = {} + with crewai_event_bus.scoped_handlers(): + task_results = self.get_evaluation_results() + for agent_role, results in task_results.items(): + if not results: + continue + + agent_id = results[0].agent_id + + aggregated_result = self.display_formatter._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=results, + strategy=strategy + ) + + agent_results[agent_role] = aggregated_result + + + if self.iteration == max(self.iterations_results.keys()): + self.display_results_with_iterations() + + if include_evaluation_feedback: + self.display_evaluation_with_feedback() + + return agent_results + + def display_evaluation_with_feedback(self): + self.display_formatter.display_evaluation_with_feedback(self.iterations_results) + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any + ) -> AgentEvaluationResult: + result = AgentEvaluationResult( + agent_id=str(agent.id), + task_id=str(task.id) + ) + assert self.evaluators is not None + for evaluator in self.evaluators: + try: + score = evaluator.evaluate( + agent=agent, + task=task, + execution_trace=execution_trace, + final_output=final_output + ) + result.metrics[evaluator.metric_category] = score + except Exception as e: + self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}") + + return result + +def create_default_evaluator(crew, llm=None): + from crewai.experimental.evaluation import ( + 
GoalAlignmentEvaluator, + SemanticQualityEvaluator, + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator, + ReasoningEfficiencyEvaluator + ) + + evaluators = [ + GoalAlignmentEvaluator(llm=llm), + SemanticQualityEvaluator(llm=llm), + ToolSelectionEvaluator(llm=llm), + ParameterExtractionEvaluator(llm=llm), + ToolInvocationEvaluator(llm=llm), + ReasoningEfficiencyEvaluator(llm=llm), + ] + + return AgentEvaluator(evaluators=evaluators, crew=crew) diff --git a/src/crewai/experimental/evaluation/base_evaluator.py b/src/crewai/experimental/evaluation/base_evaluator.py new file mode 100644 index 000000000..b11c61973 --- /dev/null +++ b/src/crewai/experimental/evaluation/base_evaluator.py @@ -0,0 +1,125 @@ +import abc +import enum +from enum import Enum +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from crewai.agent import Agent +from crewai.task import Task +from crewai.llm import BaseLLM +from crewai.utilities.llm_utils import create_llm + +class MetricCategory(enum.Enum): + GOAL_ALIGNMENT = "goal_alignment" + SEMANTIC_QUALITY = "semantic_quality" + REASONING_EFFICIENCY = "reasoning_efficiency" + TOOL_SELECTION = "tool_selection" + PARAMETER_EXTRACTION = "parameter_extraction" + TOOL_INVOCATION = "tool_invocation" + + def title(self): + return self.value.replace('_', ' ').title() + + +class EvaluationScore(BaseModel): + score: float | None = Field( + default=5.0, + description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable", + ge=0.0, + le=10.0 + ) + feedback: str = Field( + default="", + description="Detailed feedback explaining the evaluation score" + ) + raw_response: str | None = Field( + default=None, + description="Raw response from the evaluator (e.g., LLM)" + ) + + def __str__(self) -> str: + if self.score is None: + return f"Score: N/A - {self.feedback}" + return f"Score: {self.score:.1f}/10 - {self.feedback}" + + +class BaseEvaluator(abc.ABC): + def __init__(self, llm: BaseLLM | None = None): + self.llm: BaseLLM | None = create_llm(llm) + + @property + @abc.abstractmethod + def metric_category(self) -> MetricCategory: + pass + + @abc.abstractmethod + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + pass + + +class AgentEvaluationResult(BaseModel): + agent_id: str = Field(description="ID of the evaluated agent") + task_id: str = Field(description="ID of the task that was executed") + metrics: Dict[MetricCategory, EvaluationScore] = Field( + default_factory=dict, + description="Evaluation scores for each metric category" + ) + + +class AggregationStrategy(Enum): + SIMPLE_AVERAGE = "simple_average" # Equal weight to all tasks + WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity" # Weight by task complexity + BEST_PERFORMANCE = "best_performance" # Use best scores across tasks + WORST_PERFORMANCE = "worst_performance" # Use worst scores across tasks + + +class AgentAggregatedEvaluationResult(BaseModel): + agent_id: str = Field( + default="", + description="ID of the agent" + ) + agent_role: str = Field( + default="", + description="Role of the agent" + ) + task_count: int = Field( + default=0, + description="Number of tasks included in this aggregation" + ) + aggregation_strategy: AggregationStrategy = Field( + default=AggregationStrategy.SIMPLE_AVERAGE, + description="Strategy used for aggregation" + ) + metrics: Dict[MetricCategory, EvaluationScore] = Field( + default_factory=dict, + 
description="Aggregated metrics across all tasks" + ) + task_results: List[str] = Field( + default_factory=list, + description="IDs of tasks included in this aggregation" + ) + overall_score: Optional[float] = Field( + default=None, + description="Overall score for this agent" + ) + + def __str__(self) -> str: + result = f"Agent Evaluation: {self.agent_role}\n" + result += f"Strategy: {self.aggregation_strategy.value}\n" + result += f"Tasks evaluated: {self.task_count}\n" + + for category, score in self.metrics.items(): + result += f"\n\n- {category.value.upper()}: {score.score}/10\n" + + if score.feedback: + detailed_feedback = "\n ".join(score.feedback.split('\n')) + result += f" {detailed_feedback}\n" + + return result \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/evaluation_display.py b/src/crewai/experimental/evaluation/evaluation_display.py new file mode 100644 index 000000000..c25ea6db4 --- /dev/null +++ b/src/crewai/experimental/evaluation/evaluation_display.py @@ -0,0 +1,341 @@ +from collections import defaultdict +from typing import Dict, Any, List +from rich.table import Table +from rich.box import HEAVY_EDGE, ROUNDED +from collections.abc import Sequence +from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory +from crewai.experimental.evaluation import EvaluationScore +from crewai.utilities.events.utils.console_formatter import ConsoleFormatter +from crewai.utilities.llm_utils import create_llm + +class EvaluationDisplayFormatter: + def __init__(self): + self.console_formatter = ConsoleFormatter() + + def display_evaluation_with_feedback(self, iterations_results: Dict[int, Dict[str, List[Any]]]): + if not iterations_results: + self.console_formatter.print("[yellow]No evaluation results to display[/yellow]") + return + + # Get all agent roles across all iterations + all_agent_roles: set[str] = set() + for iter_results in iterations_results.values(): + all_agent_roles.update(iter_results.keys()) + + for agent_role in sorted(all_agent_roles): + self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]") + + # Process each iteration + for iter_num, results in sorted(iterations_results.items()): + if agent_role not in results or not results[agent_role]: + continue + + agent_results = results[agent_role] + agent_id = agent_results[0].agent_id + + # Aggregate results for this agent in this iteration + aggregated_result = self._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=agent_results, + ) + + # Display iteration header + self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]") + + # Create table for this iteration + table = Table(box=ROUNDED) + table.add_column("Metric", style="cyan") + table.add_column("Score (1-10)", justify="center") + table.add_column("Feedback", style="green") + + # Add metrics to table + if aggregated_result.metrics: + for metric, evaluation_score in aggregated_result.metrics.items(): + score = evaluation_score.score + + if isinstance(score, (int, float)): + if score >= 8.0: + score_text = f"[green]{score:.1f}[/green]" + elif score >= 6.0: + score_text = f"[cyan]{score:.1f}[/cyan]" + elif score >= 4.0: + score_text = f"[yellow]{score:.1f}[/yellow]" + else: + score_text = f"[red]{score:.1f}[/red]" + else: + score_text = "[dim]N/A[/dim]" + + table.add_section() + table.add_row( + metric.title(), + score_text, + evaluation_score.feedback or "" + ) + + if 
aggregated_result.overall_score is not None: + overall_score = aggregated_result.overall_score + if overall_score >= 8.0: + overall_color = "green" + elif overall_score >= 6.0: + overall_color = "cyan" + elif overall_score >= 4.0: + overall_color = "yellow" + else: + overall_color = "red" + + table.add_section() + table.add_row( + "Overall Score", + f"[{overall_color}]{overall_score:.1f}[/]", + "Overall agent evaluation score" + ) + + # Print the table for this iteration + self.console_formatter.print(table) + + def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]): + if not iterations_results: + self.console_formatter.print("[yellow]No evaluation results to display[/yellow]") + return + + self.console_formatter.print("\n") + + table = Table(title="Agent Performance Scores \n (1-10 Higher is better)", box=HEAVY_EDGE) + + table.add_column("Agent/Metric", style="cyan") + + for iter_num in sorted(iterations_results.keys()): + run_label = f"Run {iter_num}" + table.add_column(run_label, justify="center") + + table.add_column("Avg. Total", justify="center") + + all_agent_roles: set[str] = set() + for results in iterations_results.values(): + all_agent_roles.update(results.keys()) + + for agent_role in sorted(all_agent_roles): + agent_scores_by_iteration = {} + agent_metrics_by_iteration = {} + + for iter_num, results in sorted(iterations_results.items()): + if agent_role not in results or not results[agent_role]: + continue + + agent_results = results[agent_role] + agent_id = agent_results[0].agent_id + + aggregated_result = self._aggregate_agent_results( + agent_id=agent_id, + agent_role=agent_role, + results=agent_results, + strategy=AggregationStrategy.SIMPLE_AVERAGE + ) + + valid_scores = [score.score for score in aggregated_result.metrics.values() + if score.score is not None] + if valid_scores: + avg_score = sum(valid_scores) / len(valid_scores) + agent_scores_by_iteration[iter_num] = avg_score + + agent_metrics_by_iteration[iter_num] = aggregated_result.metrics + + if not agent_scores_by_iteration: + continue + + avg_across_iterations = sum(agent_scores_by_iteration.values()) / len(agent_scores_by_iteration) + + row = [f"[bold]{agent_role}[/bold]"] + + for iter_num in sorted(iterations_results.keys()): + if iter_num in agent_scores_by_iteration: + score = agent_scores_by_iteration[iter_num] + if score >= 8.0: + color = "green" + elif score >= 6.0: + color = "cyan" + elif score >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[bold {color}]{score:.1f}[/]") + else: + row.append("-") + + if avg_across_iterations >= 8.0: + color = "green" + elif avg_across_iterations >= 6.0: + color = "cyan" + elif avg_across_iterations >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[bold {color}]{avg_across_iterations:.1f}[/]") + + table.add_row(*row) + + all_metrics: set[Any] = set() + for metrics in agent_metrics_by_iteration.values(): + all_metrics.update(metrics.keys()) + + for metric in sorted(all_metrics, key=lambda x: x.value): + metric_scores = [] + + row = [f" - {metric.title()}"] + + for iter_num in sorted(iterations_results.keys()): + if (iter_num in agent_metrics_by_iteration and + metric in agent_metrics_by_iteration[iter_num]): + metric_score = agent_metrics_by_iteration[iter_num][metric].score + if metric_score is not None: + metric_scores.append(metric_score) + if metric_score >= 8.0: + color = "green" + elif metric_score >= 6.0: + color = "cyan" + elif metric_score >= 4.0: + color = "yellow" + 
else: + color = "red" + row.append(f"[{color}]{metric_score:.1f}[/]") + else: + row.append("[dim]N/A[/dim]") + else: + row.append("-") + + if metric_scores: + avg = sum(metric_scores) / len(metric_scores) + if avg >= 8.0: + color = "green" + elif avg >= 6.0: + color = "cyan" + elif avg >= 4.0: + color = "yellow" + else: + color = "red" + row.append(f"[{color}]{avg:.1f}[/]") + else: + row.append("-") + + table.add_row(*row) + + table.add_row(*[""] * (len(sorted(iterations_results.keys())) + 2)) + + self.console_formatter.print(table) + self.console_formatter.print("\n") + + def _aggregate_agent_results( + self, + agent_id: str, + agent_role: str, + results: Sequence[AgentEvaluationResult], + strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, + ) -> AgentAggregatedEvaluationResult: + metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list) + + for result in results: + for metric_name, evaluation_score in result.metrics.items(): + metrics_by_category[metric_name].append(evaluation_score) + + aggregated_metrics: dict[MetricCategory, EvaluationScore] = {} + for category, scores in metrics_by_category.items(): + valid_scores = [s.score for s in scores if s.score is not None] + avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else None + + feedbacks = [s.feedback for s in scores if s.feedback] + + feedback_summary = None + if feedbacks: + if len(feedbacks) > 1: + # Use the summarization method for multiple feedbacks + feedback_summary = self._summarize_feedbacks( + agent_role=agent_role, + metric=category.title(), + feedbacks=feedbacks, + scores=[s.score for s in scores], + strategy=strategy + ) + else: + feedback_summary = feedbacks[0] + + aggregated_metrics[category] = EvaluationScore( + score=avg_score, + feedback=feedback_summary + ) + + overall_score = None + if aggregated_metrics: + valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None] + if valid_scores: + overall_score = sum(valid_scores) / len(valid_scores) + + return AgentAggregatedEvaluationResult( + agent_id=agent_id, + agent_role=agent_role, + metrics=aggregated_metrics, + overall_score=overall_score, + task_count=len(results), + aggregation_strategy=strategy + ) + + def _summarize_feedbacks( + self, + agent_role: str, + metric: str, + feedbacks: List[str], + scores: List[float | None], + strategy: AggregationStrategy + ) -> str: + if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks): + return "\n\n".join([f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]) + + try: + llm = create_llm() + + formatted_feedbacks = [] + for i, (feedback, score) in enumerate(zip(feedbacks, scores)): + if len(feedback) > 500: + feedback = feedback[:500] + "..." + score_text = f"{score:.1f}" if score is not None else "N/A" + formatted_feedbacks.append(f"Feedback #{i+1} (Score: {score_text}):\n{feedback}") + + all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks) + + strategy_guidance = "" + if strategy == AggregationStrategy.BEST_PERFORMANCE: + strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated." + elif strategy == AggregationStrategy.WORST_PERFORMANCE: + strategy_guidance = "Focus on areas that need improvement and common issues across tasks." + else: # Default/average strategies + strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks." 
+ + prompt = [ + {"role": "system", "content": f"""You are an expert evaluator creating a comprehensive summary of agent performance feedback. + Your job is to synthesize multiple feedback points about the same metric across different tasks. + + Create a concise, insightful summary that captures the key patterns and themes from all feedback. + {strategy_guidance} + + Your summary should be: + 1. Specific and concrete (not vague or general) + 2. Focused on actionable insights + 3. Highlighting patterns across tasks + 4. 150-250 words in length + + The summary should be directly usable as final feedback for the agent's performance on this metric."""}, + {"role": "user", "content": f"""I need a synthesized summary of the following feedback for: + + Agent Role: {agent_role} + Metric: {metric.title()} + + {all_feedbacks} + """} + ] + assert llm is not None + response = llm.call(prompt) + + return response + + except Exception: + return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks]) diff --git a/src/crewai/experimental/evaluation/evaluation_listener.py b/src/crewai/experimental/evaluation/evaluation_listener.py new file mode 100644 index 000000000..8fba03e14 --- /dev/null +++ b/src/crewai/experimental/evaluation/evaluation_listener.py @@ -0,0 +1,190 @@ +from datetime import datetime +from typing import Any, Dict, Optional + +from collections.abc import Sequence + +from crewai.agent import Agent +from crewai.task import Task +from crewai.utilities.events.base_event_listener import BaseEventListener +from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus +from crewai.utilities.events.agent_events import ( + AgentExecutionStartedEvent, + AgentExecutionCompletedEvent +) +from crewai.utilities.events.tool_usage_events import ( + ToolUsageFinishedEvent, + ToolUsageErrorEvent, + ToolExecutionErrorEvent, + ToolSelectionErrorEvent, + ToolValidateInputErrorEvent +) +from crewai.utilities.events.llm_events import ( + LLMCallStartedEvent, + LLMCallCompletedEvent +) + +class EvaluationTraceCallback(BaseEventListener): + """Event listener for collecting execution traces for evaluation. + + This listener attaches to the event bus to collect detailed information + about the execution process, including agent steps, tool uses, knowledge + retrievals, and final output - all for use in agent evaluation. 
+ """ + + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance._initialized = False + return cls._instance + + def __init__(self): + if not hasattr(self, "_initialized") or not self._initialized: + super().__init__() + self.traces = {} + self.current_agent_id = None + self.current_task_id = None + self._initialized = True + + def setup_listeners(self, event_bus: CrewAIEventsBus): + @event_bus.on(AgentExecutionStartedEvent) + def on_agent_started(source, event: AgentExecutionStartedEvent): + self.on_agent_start(event.agent, event.task) + + @event_bus.on(AgentExecutionCompletedEvent) + def on_agent_completed(source, event: AgentExecutionCompletedEvent): + self.on_agent_finish(event.agent, event.task, event.output) + + @event_bus.on(ToolUsageFinishedEvent) + def on_tool_completed(source, event: ToolUsageFinishedEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True) + + @event_bus.on(ToolUsageErrorEvent) + def on_tool_usage_error(source, event: ToolUsageErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="usage_error") + + @event_bus.on(ToolExecutionErrorEvent) + def on_tool_execution_error(source, event: ToolExecutionErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="execution_error") + + @event_bus.on(ToolSelectionErrorEvent) + def on_tool_selection_error(source, event: ToolSelectionErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="selection_error") + + @event_bus.on(ToolValidateInputErrorEvent) + def on_tool_validate_input_error(source, event: ToolValidateInputErrorEvent): + self.on_tool_use(event.tool_name, event.tool_args, event.error, + success=False, error_type="validation_error") + + @event_bus.on(LLMCallStartedEvent) + def on_llm_call_started(source, event: LLMCallStartedEvent): + self.on_llm_call_start(event.messages, event.tools) + + @event_bus.on(LLMCallCompletedEvent) + def on_llm_call_completed(source, event: LLMCallCompletedEvent): + self.on_llm_call_end(event.messages, event.response) + + def on_agent_start(self, agent: Agent, task: Task): + self.current_agent_id = agent.id + self.current_task_id = task.id + + trace_key = f"{agent.id}_{task.id}" + self.traces[trace_key] = { + "agent_id": agent.id, + "task_id": task.id, + "tool_uses": [], + "llm_calls": [], + "start_time": datetime.now(), + "final_output": None + } + + def on_agent_finish(self, agent: Agent, task: Task, output: Any): + trace_key = f"{agent.id}_{task.id}" + if trace_key in self.traces: + self.traces[trace_key]["final_output"] = output + self.traces[trace_key]["end_time"] = datetime.now() + + self.current_agent_id = None + self.current_task_id = None + + def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any, + success: bool = True, error_type: str | None = None): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key in self.traces: + tool_use = { + "tool": tool_name, + "args": tool_args, + "result": result, + "success": success, + "timestamp": datetime.now() + } + + # Add error information if applicable + if not success and error_type: + tool_use["error"] = True + tool_use["error_type"] = error_type + + self.traces[trace_key]["tool_uses"].append(tool_use) + + def on_llm_call_start(self, messages: str | Sequence[dict[str, 
Any]] | None, tools: Sequence[dict[str, Any]] | None = None): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key not in self.traces: + return + + self.current_llm_call = { + "messages": messages, + "tools": tools, + "start_time": datetime.now(), + "response": None, + "end_time": None + } + + def on_llm_call_end(self, messages: str | list[dict[str, Any]] | None, response: Any): + if not self.current_agent_id or not self.current_task_id: + return + + trace_key = f"{self.current_agent_id}_{self.current_task_id}" + if trace_key not in self.traces: + return + + total_tokens = 0 + if hasattr(response, "usage") and hasattr(response.usage, "total_tokens"): + total_tokens = response.usage.total_tokens + + current_time = datetime.now() + start_time = None + if hasattr(self, "current_llm_call") and self.current_llm_call: + start_time = self.current_llm_call.get("start_time") + + if not start_time: + start_time = current_time + llm_call = { + "messages": messages, + "response": response, + "start_time": start_time, + "end_time": current_time, + "total_tokens": total_tokens + } + + self.traces[trace_key]["llm_calls"].append(llm_call) + + if hasattr(self, "current_llm_call"): + self.current_llm_call = {} + + def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]: + trace_key = f"{agent_id}_{task_id}" + return self.traces.get(trace_key) + + +def create_evaluation_callbacks() -> EvaluationTraceCallback: + return EvaluationTraceCallback() \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/experiment/__init__.py b/src/crewai/experimental/evaluation/experiment/__init__.py new file mode 100644 index 000000000..8e4fd8983 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/__init__.py @@ -0,0 +1,8 @@ +from crewai.experimental.evaluation.experiment.runner import ExperimentRunner +from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult + +__all__ = [ + "ExperimentRunner", + "ExperimentResults", + "ExperimentResult" +] diff --git a/src/crewai/experimental/evaluation/experiment/result.py b/src/crewai/experimental/evaluation/experiment/result.py new file mode 100644 index 000000000..44b24e95a --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/result.py @@ -0,0 +1,122 @@ +import json +import os +from datetime import datetime, timezone +from typing import Any +from pydantic import BaseModel + +class ExperimentResult(BaseModel): + identifier: str + inputs: dict[str, Any] + score: int | dict[str, int | float] + expected_score: int | dict[str, int | float] + passed: bool + agent_evaluations: dict[str, Any] | None = None + +class ExperimentResults: + def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None): + self.results = results + self.metadata = metadata or {} + self.timestamp = datetime.now(timezone.utc) + + from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay + self.display = ExperimentResultsDisplay() + + def to_json(self, filepath: str | None = None) -> dict[str, Any]: + data = { + "timestamp": self.timestamp.isoformat(), + "metadata": self.metadata, + "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results] + } + + if filepath: + with open(filepath, 'w') as f: + json.dump(data, f, indent=2) + self.display.console.print(f"[green]Results saved to {filepath}[/green]") + + return data + + def 
compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]: + baseline_runs = [] + + if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0: + try: + with open(baseline_filepath, 'r') as f: + baseline_data = json.load(f) + + if isinstance(baseline_data, dict) and "timestamp" in baseline_data: + baseline_runs = [baseline_data] + elif isinstance(baseline_data, list): + baseline_runs = baseline_data + except (json.JSONDecodeError, FileNotFoundError) as e: + self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]") + + if not baseline_runs: + if save_current: + current_data = self.to_json() + with open(baseline_filepath, 'w') as f: + json.dump([current_data], f, indent=2) + self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]") + return {"is_baseline": True, "changes": {}} + + baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True) + latest_run = baseline_runs[0] + + comparison = self._compare_with_run(latest_run) + + if print_summary: + self.display.comparison_summary(comparison, latest_run["timestamp"]) + + if save_current: + current_data = self.to_json() + baseline_runs.append(current_data) + with open(baseline_filepath, 'w') as f: + json.dump(baseline_runs, f, indent=2) + self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]") + + return comparison + + def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]: + baseline_results = baseline_run.get("results", []) + + baseline_lookup = {} + for result in baseline_results: + test_identifier = result.get("identifier") + if test_identifier: + baseline_lookup[test_identifier] = result + + improved = [] + regressed = [] + unchanged = [] + new_tests = [] + + for result in self.results: + test_identifier = result.identifier + if not test_identifier or test_identifier not in baseline_lookup: + new_tests.append(test_identifier) + continue + + baseline_result = baseline_lookup[test_identifier] + baseline_passed = baseline_result.get("passed", False) + if result.passed and not baseline_passed: + improved.append(test_identifier) + elif not result.passed and baseline_passed: + regressed.append(test_identifier) + else: + unchanged.append(test_identifier) + + missing_tests = [] + current_test_identifiers = {result.identifier for result in self.results} + for result in baseline_results: + test_identifier = result.get("identifier") + if test_identifier and test_identifier not in current_test_identifiers: + missing_tests.append(test_identifier) + + return { + "improved": improved, + "regressed": regressed, + "unchanged": unchanged, + "new_tests": new_tests, + "missing_tests": missing_tests, + "total_compared": len(improved) + len(regressed) + len(unchanged), + "baseline_timestamp": baseline_run.get("timestamp", "unknown") + } diff --git a/src/crewai/experimental/evaluation/experiment/result_display.py b/src/crewai/experimental/evaluation/experiment/result_display.py new file mode 100644 index 000000000..79bab2fc8 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/result_display.py @@ -0,0 +1,70 @@ +from typing import Dict, Any +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from crewai.experimental.evaluation.experiment.result import ExperimentResults + +class ExperimentResultsDisplay: + def __init__(self): + self.console = Console() + + 
def summary(self, experiment_results: ExperimentResults): + total = len(experiment_results.results) + passed = sum(1 for r in experiment_results.results if r.passed) + + table = Table(title="Experiment Summary") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Total Test Cases", str(total)) + table.add_row("Passed", str(passed)) + table.add_row("Failed", str(total - passed)) + table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A") + + self.console.print(table) + + def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str): + self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]", + expand=False)) + + table = Table(title="Results Comparison") + table.add_column("Metric", style="cyan") + table.add_column("Count", style="white") + table.add_column("Details", style="dim") + + improved = comparison.get("improved", []) + if improved: + details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]]) + if len(improved) > 3: + details += f" and {len(improved) - 3} more" + table.add_row("✅ Improved", str(len(improved)), details) + else: + table.add_row("✅ Improved", "0", "") + + regressed = comparison.get("regressed", []) + if regressed: + details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]]) + if len(regressed) > 3: + details += f" and {len(regressed) - 3} more" + table.add_row("❌ Regressed", str(len(regressed)), details, style="red") + else: + table.add_row("❌ Regressed", "0", "") + + unchanged = comparison.get("unchanged", []) + table.add_row("⏺ Unchanged", str(len(unchanged)), "") + + new_tests = comparison.get("new_tests", []) + if new_tests: + details = ", ".join(new_tests[:3]) + if len(new_tests) > 3: + details += f" and {len(new_tests) - 3} more" + table.add_row("➕ New Tests", str(len(new_tests)), details) + + missing_tests = comparison.get("missing_tests", []) + if missing_tests: + details = ", ".join(missing_tests[:3]) + if len(missing_tests) > 3: + details += f" and {len(missing_tests) - 3} more" + table.add_row("➖ Missing Tests", str(len(missing_tests)), details) + + self.console.print(table) diff --git a/src/crewai/experimental/evaluation/experiment/runner.py b/src/crewai/experimental/evaluation/experiment/runner.py new file mode 100644 index 000000000..b1b653e68 --- /dev/null +++ b/src/crewai/experimental/evaluation/experiment/runner.py @@ -0,0 +1,117 @@ +from collections import defaultdict +from hashlib import md5 +from typing import Any + +from crewai import Crew +from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator +from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay +from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult + +class ExperimentRunner: + def __init__(self, dataset: list[dict[str, Any]]): + self.dataset = dataset or [] + self.evaluator: AgentEvaluator | None = None + self.display = ExperimentResultsDisplay() + + def run(self, crew: Crew, print_summary: bool = False) -> ExperimentResults: + self.evaluator = create_default_evaluator(crew=crew) + + results = [] + + for test_case in self.dataset: + self.evaluator.reset_iterations_results() + result = self._run_test_case(test_case, crew) + results.append(result) + + experiment_results = ExperimentResults(results) + + if 
print_summary: + self.display.summary(experiment_results) + + return experiment_results + + def _run_test_case(self, test_case: dict[str, Any], crew: Crew) -> ExperimentResult: + inputs = test_case["inputs"] + expected_score = test_case["expected_score"] + identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest() + + try: + self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]") + self.display.console.print("\n") + crew.kickoff(inputs=inputs) + + assert self.evaluator is not None + agent_evaluations = self.evaluator.get_agent_evaluation() + + actual_score = self._extract_scores(agent_evaluations) + + passed = self._assert_scores(expected_score, actual_score) + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=actual_score, + expected_score=expected_score, + passed=passed, + agent_evaluations=agent_evaluations + ) + + except Exception as e: + self.display.console.print(f"[red]Error running test case: {str(e)}[/red]") + return ExperimentResult( + identifier=identifier, + inputs=inputs, + score=0, + expected_score=expected_score, + passed=False + ) + + def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]: + all_scores: dict[str, list[float]] = defaultdict(list) + for evaluation in agent_evaluations.values(): + for metric_name, score in evaluation.metrics.items(): + if score.score is not None: + all_scores[metric_name.value].append(score.score) + + avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()} + + if len(avg_scores) == 1: + return list(avg_scores.values())[0] + + return avg_scores + + def _assert_scores(self, expected: float | dict[str, float], + actual: float | dict[str, float]) -> bool: + """ + Compare expected and actual scores, and return whether the test case passed. + + The rules for comparison are as follows: + - If both expected and actual scores are single numbers, the actual score must be >= expected. + - If expected is a single number and actual is a dict, compare against the average of actual values. + - If expected is a dict and actual is a single number, actual must be >= all expected values. + - If both are dicts, actual must have matching keys with values >= expected values. 
+ """ + + if isinstance(expected, (int, float)) and isinstance(actual, (int, float)): + return actual >= expected + + if isinstance(expected, dict) and isinstance(actual, (int, float)): + return all(actual >= exp_score for exp_score in expected.values()) + + if isinstance(expected, (int, float)) and isinstance(actual, dict): + if not actual: + return False + avg_score = sum(actual.values()) / len(actual) + return avg_score >= expected + + if isinstance(expected, dict) and isinstance(actual, dict): + if not expected: + return True + matching_keys = set(expected.keys()) & set(actual.keys()) + if not matching_keys: + return False + + # All matching keys must have actual >= expected + return all(actual[key] >= expected[key] for key in matching_keys) + + return False \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/json_parser.py b/src/crewai/experimental/evaluation/json_parser.py new file mode 100644 index 000000000..ce7303cde --- /dev/null +++ b/src/crewai/experimental/evaluation/json_parser.py @@ -0,0 +1,30 @@ +"""Robust JSON parsing utilities for evaluation responses.""" + +import json +import re +from typing import Any + + +def extract_json_from_llm_response(text: str) -> dict[str, Any]: + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + json_patterns = [ + # Standard markdown code blocks with json + r'```json\s*([\s\S]*?)\s*```', + # Code blocks without language specifier + r'```\s*([\s\S]*?)\s*```', + # Inline code with JSON + r'`([{\\[].*[}\]])`', + ] + + for pattern in json_patterns: + matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL) + for match in matches: + try: + return json.loads(match.strip()) + except json.JSONDecodeError: + continue + raise ValueError("No valid JSON found in the response") diff --git a/src/crewai/experimental/evaluation/metrics/__init__.py b/src/crewai/experimental/evaluation/metrics/__init__.py new file mode 100644 index 000000000..1b306587b --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/__init__.py @@ -0,0 +1,26 @@ +from crewai.experimental.evaluation.metrics.reasoning_metrics import ( + ReasoningEfficiencyEvaluator +) + +from crewai.experimental.evaluation.metrics.tools_metrics import ( + ToolSelectionEvaluator, + ParameterExtractionEvaluator, + ToolInvocationEvaluator +) + +from crewai.experimental.evaluation.metrics.goal_metrics import ( + GoalAlignmentEvaluator +) + +from crewai.experimental.evaluation.metrics.semantic_quality_metrics import ( + SemanticQualityEvaluator +) + +__all__ = [ + "ReasoningEfficiencyEvaluator", + "ToolSelectionEvaluator", + "ParameterExtractionEvaluator", + "ToolInvocationEvaluator", + "GoalAlignmentEvaluator", + "SemanticQualityEvaluator" +] \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/metrics/goal_metrics.py b/src/crewai/experimental/evaluation/metrics/goal_metrics.py new file mode 100644 index 000000000..85f0b91aa --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/goal_metrics.py @@ -0,0 +1,66 @@ +from typing import Any, Dict + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response + +class GoalAlignmentEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.GOAL_ALIGNMENT + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: 
Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal. + +Score the agent's goal alignment on a scale from 0-10 where: +- 0: Complete misalignment, agent did not understand or attempt the task goal +- 5: Partial alignment, agent attempted the task but missed key requirements +- 10: Perfect alignment, agent fully satisfied all task requirements + +Consider: +1. Did the agent correctly interpret the task goal? +2. Did the final output directly address the requirements? +3. Did the agent focus on relevant aspects of the task? +4. Did the agent provide all requested information or deliverables? + +Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string). +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Agent goal: {agent.goal} +Task description: {task.description} +Expected output: {task.expected_output} + +Agent's final output: +{final_output} + +Evaluate how well the agent's output aligns with the assigned task goal. +"""} + ] + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data: dict[str, Any] = extract_json_from_llm_response(response) + assert evaluation_data is not None + + return EvaluationScore( + score=evaluation_data.get("score", 0), + feedback=evaluation_data.get("feedback", response), + raw_response=response + ) + except Exception: + return EvaluationScore( + score=None, + feedback=f"Failed to parse evaluation. Raw response: {response}", + raw_response=response + ) diff --git a/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py new file mode 100644 index 000000000..605e5b06c --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/reasoning_metrics.py @@ -0,0 +1,355 @@ +"""Agent reasoning efficiency evaluators. + +This module provides evaluator implementations for: +- Reasoning efficiency +- Loop detection +- Thinking-to-action ratio +""" + +import logging +import re +from enum import Enum +from typing import Any, Dict, List, Tuple +import numpy as np +from collections.abc import Sequence + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response +from crewai.tasks.task_output import TaskOutput + +class ReasoningPatternType(Enum): + EFFICIENT = "efficient" # Good reasoning flow + LOOP = "loop" # Agent is stuck in a loop + VERBOSE = "verbose" # Agent is unnecessarily verbose + INDECISIVE = "indecisive" # Agent struggles to make decisions + SCATTERED = "scattered" # Agent jumps between topics without focus + + +class ReasoningEfficiencyEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.REASONING_EFFICIENCY + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: TaskOutput, + ) -> EvaluationScore: + llm_calls = execution_trace.get("llm_calls", []) + + if not llm_calls or len(llm_calls) < 2: + return EvaluationScore( + score=None, + feedback="Insufficient LLM calls to evaluate reasoning efficiency." 
+ ) + + total_calls = len(llm_calls) + total_tokens = sum(call.get("total_tokens", 0) for call in llm_calls) + avg_tokens_per_call = total_tokens / total_calls if total_calls > 0 else 0 + time_intervals = [] + has_reliable_timing = True + for i in range(1, len(llm_calls)): + start_time = llm_calls[i-1].get("end_time") + end_time = llm_calls[i].get("start_time") + if start_time and end_time and start_time != end_time: + try: + interval = end_time - start_time + time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0) + except Exception: + has_reliable_timing = False + else: + has_reliable_timing = False + + loop_detected, loop_details = self._detect_loops(llm_calls) + pattern_analysis = self._analyze_reasoning_patterns(llm_calls) + + efficiency_metrics = { + "total_llm_calls": total_calls, + "total_tokens": total_tokens, + "avg_tokens_per_call": avg_tokens_per_call, + "reasoning_pattern": pattern_analysis["primary_pattern"].value, + "loops_detected": loop_detected, + } + + if has_reliable_timing and time_intervals: + efficiency_metrics["avg_time_between_calls"] = np.mean(time_intervals) + + loop_info = f"Detected {len(loop_details)} potential reasoning loops." if loop_detected else "No significant reasoning loops detected." + + call_samples = self._get_call_samples(llm_calls) + + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process. + +Evaluate the agent's reasoning efficiency across these five key subcategories: + +1. Focus (0-10): How well the agent stays on topic and avoids unnecessary tangents +2. Progression (0-10): How effectively the agent builds on previous thoughts rather than repeating or circling +3. Decision Quality (0-10): How decisively and appropriately the agent makes decisions +4. Conciseness (0-10): How efficiently the agent communicates without unnecessary verbosity +5. Loop Avoidance (0-10): How well the agent avoids getting stuck in repetitive thinking patterns + +For each subcategory, provide a score from 0-10 where: +- 0: Completely inefficient +- 5: Moderately efficient +- 10: Highly efficient + +The overall score should be a weighted average of these subcategories. + +Return your evaluation as JSON with the following structure: +{ + "overall_score": float, + "scores": { + "focus": float, + "progression": float, + "decision_quality": float, + "conciseness": float, + "loop_avoidance": float + }, + "feedback": string (general feedback about overall reasoning efficiency), + "optimization_suggestions": string (concrete suggestions for improving reasoning efficiency), + "detected_patterns": string (describe any inefficient reasoning patterns you observe) +}"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Reasoning efficiency metrics: +- Total LLM calls: {efficiency_metrics["total_llm_calls"]} +- Average tokens per call: {efficiency_metrics["avg_tokens_per_call"]:.1f} +- Primary reasoning pattern: {efficiency_metrics["reasoning_pattern"]} +- {loop_info} +{"- Average time between calls: {:.2f} seconds".format(efficiency_metrics.get("avg_time_between_calls", 0)) if "avg_time_between_calls" in efficiency_metrics else ""} + +Sample of agent reasoning flow (chronological sequence): +{call_samples} + +Agent's final output: +{final_output.raw[:500]}... (truncated) + +Evaluate the reasoning efficiency of this agent based on these interaction patterns. 
+Identify any inefficient reasoning patterns and provide specific suggestions for optimization. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + + scores = evaluation_data.get("scores", {}) + focus = scores.get("focus", 5.0) + progression = scores.get("progression", 5.0) + decision_quality = scores.get("decision_quality", 5.0) + conciseness = scores.get("conciseness", 5.0) + loop_avoidance = scores.get("loop_avoidance", 5.0) + + overall_score = evaluation_data.get("overall_score", evaluation_data.get("score", 5.0)) + feedback = evaluation_data.get("feedback", "No detailed feedback provided.") + optimization_suggestions = evaluation_data.get("optimization_suggestions", "No specific suggestions provided.") + + detailed_feedback = "Reasoning Efficiency Evaluation:\n" + detailed_feedback += f"• Focus: {focus}/10 - Staying on topic without tangents\n" + detailed_feedback += f"• Progression: {progression}/10 - Building on previous thinking\n" + detailed_feedback += f"• Decision Quality: {decision_quality}/10 - Making appropriate decisions\n" + detailed_feedback += f"• Conciseness: {conciseness}/10 - Communicating efficiently\n" + detailed_feedback += f"• Loop Avoidance: {loop_avoidance}/10 - Avoiding repetitive patterns\n\n" + + detailed_feedback += f"Feedback:\n{feedback}\n\n" + detailed_feedback += f"Optimization Suggestions:\n{optimization_suggestions}" + + return EvaluationScore( + score=float(overall_score), + feedback=detailed_feedback, + raw_response=response + ) + except Exception as e: + logging.warning(f"Failed to parse reasoning efficiency evaluation: {e}") + return EvaluationScore( + score=None, + feedback=f"Failed to parse reasoning efficiency evaluation. Raw response: {response[:200]}...", + raw_response=response + ) + + def _detect_loops(self, llm_calls: List[Dict]) -> Tuple[bool, List[Dict]]: + loop_details = [] + + messages = [] + for call in llm_calls: + content = call.get("response", "") + if isinstance(content, str): + messages.append(content) + elif isinstance(content, list) and len(content) > 0: + # Handle message list format + for msg in content: + if isinstance(msg, dict) and "content" in msg: + messages.append(msg["content"]) + + # Simple n-gram based similarity detection + # For a more robust implementation, consider using embedding-based similarity + for i in range(len(messages) - 2): + for j in range(i + 1, len(messages) - 1): + # Check for repeated patterns (simplistic approach) + # A more sophisticated approach would use semantic similarity + similarity = self._calculate_text_similarity(messages[i], messages[j]) + if similarity > 0.7: # Arbitrary threshold + loop_details.append({ + "first_occurrence": i, + "second_occurrence": j, + "similarity": similarity, + "snippet": messages[i][:100] + "..." 
+ }) + + return len(loop_details) > 0, loop_details + + def _calculate_text_similarity(self, text1: str, text2: str) -> float: + text1 = re.sub(r'\s+', ' ', text1.lower()).strip() + text2 = re.sub(r'\s+', ' ', text2.lower()).strip() + + # Simple Jaccard similarity on word sets + words1 = set(text1.split()) + words2 = set(text2.split()) + + intersection = len(words1.intersection(words2)) + union = len(words1.union(words2)) + + return intersection / union if union > 0 else 0.0 + + def _analyze_reasoning_patterns(self, llm_calls: List[Dict]) -> Dict[str, Any]: + call_lengths = [] + response_times = [] + + for call in llm_calls: + content = call.get("response", "") + if isinstance(content, str): + call_lengths.append(len(content)) + elif isinstance(content, list) and len(content) > 0: + # Handle message list format + total_length = 0 + for msg in content: + if isinstance(msg, dict) and "content" in msg: + total_length += len(msg["content"]) + call_lengths.append(total_length) + + start_time = call.get("start_time") + end_time = call.get("end_time") + if start_time and end_time: + try: + response_times.append(end_time - start_time) + except Exception: + pass + + avg_length = np.mean(call_lengths) if call_lengths else 0 + std_length = np.std(call_lengths) if call_lengths else 0 + length_trend = self._calculate_trend(call_lengths) + + primary_pattern = ReasoningPatternType.EFFICIENT + details = "Agent demonstrates efficient reasoning patterns." + + loop_score = self._calculate_loop_likelihood(call_lengths, response_times) + if loop_score > 0.7: + primary_pattern = ReasoningPatternType.LOOP + details = "Agent appears to be stuck in repetitive thinking patterns." + elif avg_length > 1000 and std_length / avg_length < 0.3: + primary_pattern = ReasoningPatternType.VERBOSE + details = "Agent is consistently verbose across interactions." + elif len(llm_calls) > 10 and length_trend > 0.5: + primary_pattern = ReasoningPatternType.INDECISIVE + details = "Agent shows signs of indecisiveness with increasing message lengths." + elif std_length / avg_length > 0.8: + primary_pattern = ReasoningPatternType.SCATTERED + details = "Agent shows inconsistent reasoning flow with highly variable responses." 
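+        # Illustrative note on how the thresholds above interact (an editorial
+        # example, assuming no usable response-time data; these numbers are not
+        # produced by the test suite): for call lengths [1500, 1100, 1210, 1190]
+        # only one of the two i/i+2 length ratios falls inside 0.85-1.15, so the
+        # loop score is 0.5 and the LOOP branch is skipped; avg_length is 1250
+        # and std_length/avg_length is roughly 0.12, so the agent is classified
+        # as VERBOSE (consistently long responses).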
+ + return { + "primary_pattern": primary_pattern, + "details": details, + "metrics": { + "avg_length": avg_length, + "std_length": std_length, + "length_trend": length_trend, + "loop_score": loop_score + } + } + + def _calculate_trend(self, values: Sequence[float | int]) -> float: + if not values or len(values) < 2: + return 0.0 + + try: + x = np.arange(len(values)) + y = np.array(values) + + # Simple linear regression + slope = np.polyfit(x, y, 1)[0] + + # Normalize slope to -1 to 1 range + max_possible_slope = max(values) - min(values) + if max_possible_slope > 0: + normalized_slope = slope / max_possible_slope + return max(min(normalized_slope, 1.0), -1.0) + return 0.0 + except Exception: + return 0.0 + + def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float: + if not call_lengths or len(call_lengths) < 3: + return 0.0 + + indicators = [] + + if len(call_lengths) >= 4: + repeated_lengths = 0 + for i in range(len(call_lengths) - 2): + ratio = call_lengths[i] / call_lengths[i + 2] if call_lengths[i + 2] > 0 else 0 + if 0.85 <= ratio <= 1.15: + repeated_lengths += 1 + + length_repetition_score = repeated_lengths / (len(call_lengths) - 2) + indicators.append(length_repetition_score) + + if response_times and len(response_times) >= 3: + try: + std_time = np.std(response_times) + mean_time = np.mean(response_times) + if mean_time > 0: + time_consistency = 1.0 - (std_time / mean_time) + indicators.append(max(0, time_consistency - 0.3) * 1.5) + except Exception: + pass + + return np.mean(indicators) if indicators else 0.0 + + def _get_call_samples(self, llm_calls: List[Dict]) -> str: + samples = [] + + if len(llm_calls) <= 6: + sample_indices = list(range(len(llm_calls))) + else: + sample_indices = [0, 1, len(llm_calls) // 2 - 1, len(llm_calls) // 2, + len(llm_calls) - 2, len(llm_calls) - 1] + + for idx in sample_indices: + call = llm_calls[idx] + content = call.get("response", "") + + if isinstance(content, str): + sample = content + elif isinstance(content, list) and len(content) > 0: + sample_parts = [] + for msg in content: + if isinstance(msg, dict) and "content" in msg: + sample_parts.append(msg["content"]) + sample = "\n".join(sample_parts) + else: + sample = str(content) + + truncated = sample[:200] + "..." if len(sample) > 200 else sample + samples.append(f"Call {idx + 1}:\n{truncated}\n") + + return "\n".join(samples) diff --git a/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py new file mode 100644 index 000000000..011618481 --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/semantic_quality_metrics.py @@ -0,0 +1,65 @@ +from typing import Any, Dict + +from crewai.agent import Agent +from crewai.task import Task + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response + +class SemanticQualityEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.SEMANTIC_QUALITY + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: Any, + ) -> EvaluationScore: + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output. 
+ +Score the semantic quality on a scale from 0-10 where: +- 0: Completely incoherent, confusing, or logically flawed output +- 5: Moderately clear and logical output with some issues +- 10: Exceptionally clear, coherent, and logically sound output + +Consider: +1. Is the output well-structured and organized? +2. Is the reasoning logical and well-supported? +3. Is the language clear, precise, and appropriate for the task? +4. Are claims supported by evidence when appropriate? +5. Is the output free from contradictions and logical fallacies? + +Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string). +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Agent's final output: +{final_output} + +Evaluate the semantic quality and reasoning of this output. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data: dict[str, Any] = extract_json_from_llm_response(response) + assert evaluation_data is not None + return EvaluationScore( + score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None, + feedback=evaluation_data.get("feedback", response), + raw_response=response + ) + except Exception: + return EvaluationScore( + score=None, + feedback=f"Failed to parse evaluation. Raw response: {response}", + raw_response=response + ) \ No newline at end of file diff --git a/src/crewai/experimental/evaluation/metrics/tools_metrics.py b/src/crewai/experimental/evaluation/metrics/tools_metrics.py new file mode 100644 index 000000000..16b2ec7cd --- /dev/null +++ b/src/crewai/experimental/evaluation/metrics/tools_metrics.py @@ -0,0 +1,400 @@ +import json +from typing import Dict, Any + +from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory +from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response +from crewai.agent import Agent +from crewai.task import Task + + +class ToolSelectionEvaluator(BaseEvaluator): + + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.TOOL_SELECTION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_count = len(tool_uses) + unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses]) + + if tool_count == 0: + if not agent.tools: + return EvaluationScore( + score=None, + feedback="Agent had no tools available to use." + ) + else: + return EvaluationScore( + score=None, + feedback="Agent had tools available but didn't use any." + ) + + available_tools_info = "" + if agent.tools: + for tool in agent.tools: + available_tools_info += f"- {tool.name}: {tool.description}\n" + else: + available_tools_info = "No tools available" + + tool_types_summary = "Tools selected by the agent:\n" + for tool_type in sorted(unique_tool_types): + tool_types_summary += f"- {tool_type}\n" + + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task. + +You must evaluate based on these 2 criteria: +1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals? +2. Coverage (0-10): Did the agent select ALL appropriate tools from the AVAILABLE tools? 
+ +IMPORTANT: +- ONLY consider tools that are listed as available to the agent +- DO NOT suggest tools that aren't in the 'Available tools' list +- DO NOT evaluate the quality or accuracy of tool outputs/results +- DO NOT evaluate how many times each tool was used +- DO NOT evaluate how the agent used the parameters +- DO NOT evaluate whether the agent interpreted the task correctly + +Focus ONLY on whether the correct CATEGORIES of tools were selected from what was available. + +Return your evaluation as JSON with these fields: +- scores: {"relevance": number, "coverage": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on tool selection decisions from available tools) +- improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Available tools for this agent: +{available_tools_info} + +{tool_types_summary} + +Based ONLY on the task description and comparing the AVAILABLE tools with those that were selected (listed above), evaluate if the agent selected the appropriate tool types for this task. + +IMPORTANT: +- ONLY evaluate selection from tools listed as available +- DO NOT suggest new tools that aren't in the available tools list +- DO NOT evaluate tool usage or results +"""} + ] + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + + scores = evaluation_data.get("scores", {}) + relevance = scores.get("relevance", 5.0) + coverage = scores.get("coverage", 5.0) + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Tool Selection Evaluation:\n" + feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n" + feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n" + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating tool selection: {e}", + raw_response=response + ) + + +class ParameterExtractionEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.PARAMETER_EXTRACTION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_count = len(tool_uses) + + if tool_count == 0: + return EvaluationScore( + score=None, + feedback="No tool usage detected. Cannot evaluate parameter extraction." 
+ ) + + validation_errors = [] + for tool_use in tool_uses: + if not tool_use.get("success", True) and tool_use.get("error_type") == "validation_error": + validation_errors.append({ + "tool": tool_use.get("tool", "Unknown tool"), + "error": tool_use.get("result"), + "args": tool_use.get("args", {}) + }) + + validation_error_rate = len(validation_errors) / tool_count if tool_count > 0 else 0 + + param_samples = [] + for i, tool_use in enumerate(tool_uses[:5]): + tool_name = tool_use.get("tool", "Unknown tool") + tool_args = tool_use.get("args", {}) + success = tool_use.get("success", True) and not tool_use.get("error", False) + error_type = tool_use.get("error_type", "") if not success else "" + + is_validation_error = error_type == "validation_error" + + sample = f"Tool use #{i+1} - {tool_name}:\n" + sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" + sample += f"- Success: {'No' if not success else 'Yes'}" + + if is_validation_error: + sample += " (PARAMETER VALIDATION ERROR)\n" + sample += f"- Error: {tool_use.get('result', 'Unknown error')}" + elif not success: + sample += f" (Other error: {error_type})\n" + + param_samples.append(sample) + + validation_errors_info = "" + if validation_errors: + validation_errors_info = f"\nParameter validation errors detected: {len(validation_errors)} ({validation_error_rate:.1%} of tool uses)\n" + for i, err in enumerate(validation_errors[:3]): + tool_name = err.get("tool", "Unknown tool") + error_msg = err.get("error", "Unknown error") + args = err.get("args", {}) + validation_errors_info += f"\nValidation Error #{i+1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}" + + if len(validation_errors) > 3: + validation_errors_info += f"\n...and {len(validation_errors) - 3} more validation errors." + param_samples_text = "\n\n".join(param_samples) + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls. + +Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked. + +Evaluate parameter extraction based on these criteria: +1. Accuracy (0-10): Are parameter values correctly identified from the context/task? +2. Formatting (0-10): Are values formatted correctly for each tool's requirements? +3. Completeness (0-10): Are all required parameter values provided, with no missing information? + +IMPORTANT: DO NOT evaluate: +- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job) +- How the tools were structurally invoked (that's the ToolInvocationEvaluator's job) +- The quality of results from tools + +Focus ONLY on the PARAMETER VALUES - whether they were correctly extracted from the context, properly formatted, and complete. + +Validation errors are important signals that parameter values weren't properly extracted or formatted. 
+ +Return your evaluation as JSON with these fields: +- scores: {"accuracy": number, "formatting": number, "completeness": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on parameter value extraction quality) +- improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Parameter extraction examples: +{param_samples_text} +{validation_errors_info} + +Evaluate the quality of the agent's parameter extraction for this task. +"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + + scores = evaluation_data.get("scores", {}) + accuracy = scores.get("accuracy", 5.0) + formatting = scores.get("formatting", 5.0) + completeness = scores.get("completeness", 5.0) + + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Parameter Extraction Evaluation:\n" + feedback += f"• Accuracy: {accuracy}/10 - Correctly identifying required parameters\n" + feedback += f"• Formatting: {formatting}/10 - Properly formatting parameters for tools\n" + feedback += f"• Completeness: {completeness}/10 - Including all necessary information\n\n" + + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating parameter extraction: {e}", + raw_response=response + ) + + +class ToolInvocationEvaluator(BaseEvaluator): + @property + def metric_category(self) -> MetricCategory: + return MetricCategory.TOOL_INVOCATION + + def evaluate( + self, + agent: Agent, + task: Task, + execution_trace: Dict[str, Any], + final_output: str, + ) -> EvaluationScore: + tool_uses = execution_trace.get("tool_uses", []) + tool_errors = [] + tool_count = len(tool_uses) + + if tool_count == 0: + return EvaluationScore( + score=None, + feedback="No tool usage detected. Cannot evaluate tool invocation." 
+ ) + + for tool_use in tool_uses: + if not tool_use.get("success", True) or tool_use.get("error", False): + error_info = { + "tool": tool_use.get("tool", "Unknown tool"), + "error": tool_use.get("result"), + "error_type": tool_use.get("error_type", "unknown_error") + } + tool_errors.append(error_info) + + error_rate = len(tool_errors) / tool_count if tool_count > 0 else 0 + + error_types = {} + for error in tool_errors: + error_type = error.get("error_type", "unknown_error") + if error_type not in error_types: + error_types[error_type] = 0 + error_types[error_type] += 1 + + invocation_samples = [] + for i, tool_use in enumerate(tool_uses[:5]): + tool_name = tool_use.get("tool", "Unknown tool") + tool_args = tool_use.get("args", {}) + success = tool_use.get("success", True) and not tool_use.get("error", False) + error_type = tool_use.get("error_type", "") if not success else "" + error_msg = tool_use.get("result", "No error") if not success else "No error" + + sample = f"Tool invocation #{i+1}:\n" + sample += f"- Tool: {tool_name}\n" + sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n" + sample += f"- Success: {'No' if not success else 'Yes'}\n" + if not success: + sample += f"- Error type: {error_type}\n" + sample += f"- Error: {error_msg}" + invocation_samples.append(sample) + + error_type_summary = "" + if error_types: + error_type_summary = "Error type breakdown:\n" + for error_type, count in error_types.items(): + error_type_summary += f"- {error_type}: {count} occurrences ({(count/tool_count):.1%})\n" + + invocation_samples_text = "\n\n".join(invocation_samples) + prompt = [ + {"role": "system", "content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED. + +Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used. + +Evaluate the agent's tool invocation based on these criteria: +1. Structure (0-10): Does the tool call follow the expected syntax and format? +2. Error Handling (0-10): Does the agent handle tool errors appropriately? +3. Invocation Patterns (0-10): Are tool calls properly sequenced, batched, or managed? + +Error types that indicate invocation issues: +- execution_error: The tool was called correctly but failed during execution +- usage_error: General errors in how the tool was used structurally + +IMPORTANT: DO NOT evaluate: +- Whether the right tool was chosen (that's the ToolSelectionEvaluator's job) +- Whether the parameter values are correct (that's the ParameterExtractionEvaluator's job) +- The quality of results from tools + +Focus ONLY on HOW tools were invoked - the structure, format, and handling of the invocation process. + +Return your evaluation as JSON with these fields: +- scores: {"structure": number, "error_handling": number, "invocation_patterns": number} +- overall_score: number (average of all scores, 0-10) +- feedback: string (focused ONLY on structural aspects of tool invocation) +- improvement_suggestions: string (concrete suggestions for better structuring of tool calls) +"""}, + {"role": "user", "content": f""" +Agent role: {agent.role} +Task description: {task.description} + +Tool invocation examples: +{invocation_samples_text} + +Tool error rate: {error_rate:.2%} ({len(tool_errors)} errors out of {tool_count} invocations) +{error_type_summary} + +Evaluate the quality of the agent's tool invocation structure during this task. 
+"""} + ] + + assert self.llm is not None + response = self.llm.call(prompt) + + try: + evaluation_data = extract_json_from_llm_response(response) + assert evaluation_data is not None + scores = evaluation_data.get("scores", {}) + structure = scores.get("structure", 5.0) + error_handling = scores.get("error_handling", 5.0) + invocation_patterns = scores.get("invocation_patterns", 5.0) + + overall_score = float(evaluation_data.get("overall_score", 5.0)) + + feedback = "Tool Invocation Evaluation:\n" + feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n" + feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n" + feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n" + + if "improvement_suggestions" in evaluation_data: + feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}" + else: + feedback += evaluation_data.get("feedback", "No detailed feedback available.") + + return EvaluationScore( + score=overall_score, + feedback=feedback, + raw_response=response + ) + except Exception as e: + return EvaluationScore( + score=None, + feedback=f"Error evaluating tool invocation: {e}", + raw_response=response + ) diff --git a/tests/evaluation/__init__.py b/tests/experimental/evaluation/__init__.py similarity index 100% rename from tests/evaluation/__init__.py rename to tests/experimental/evaluation/__init__.py diff --git a/tests/evaluation/metrics/__init__.py b/tests/experimental/evaluation/metrics/__init__.py similarity index 100% rename from tests/evaluation/metrics/__init__.py rename to tests/experimental/evaluation/metrics/__init__.py diff --git a/tests/evaluation/metrics/base_evaluation_metrics_test.py b/tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py similarity index 100% rename from tests/evaluation/metrics/base_evaluation_metrics_test.py rename to tests/experimental/evaluation/metrics/base_evaluation_metrics_test.py diff --git a/tests/evaluation/metrics/test_goal_metrics.py b/tests/experimental/evaluation/metrics/test_goal_metrics.py similarity index 88% rename from tests/evaluation/metrics/test_goal_metrics.py rename to tests/experimental/evaluation/metrics/test_goal_metrics.py index 69ec42d1f..5c10f4301 100644 --- a/tests/evaluation/metrics/test_goal_metrics.py +++ b/tests/experimental/evaluation/metrics/test_goal_metrics.py @@ -1,8 +1,8 @@ from unittest.mock import patch, MagicMock -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest -from crewai.evaluation.base_evaluator import EvaluationScore -from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator +from crewai.experimental.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator from crewai.utilities.llm_utils import LLM diff --git a/tests/evaluation/metrics/test_reasoning_metrics.py b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py similarity index 96% rename from tests/evaluation/metrics/test_reasoning_metrics.py rename to tests/experimental/evaluation/metrics/test_reasoning_metrics.py index 5d8015e3b..547046965 100644 --- a/tests/evaluation/metrics/test_reasoning_metrics.py +++ b/tests/experimental/evaluation/metrics/test_reasoning_metrics.py @@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock from 
typing import List, Dict, Any from crewai.tasks.task_output import TaskOutput -from crewai.evaluation.metrics.reasoning_metrics import ( +from crewai.experimental.evaluation.metrics.reasoning_metrics import ( ReasoningEfficiencyEvaluator, ) -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest from crewai.utilities.llm_utils import LLM -from crewai.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.base_evaluator import EvaluationScore class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest): @pytest.fixture diff --git a/tests/evaluation/metrics/test_semantic_quality_metrics.py b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py similarity index 90% rename from tests/evaluation/metrics/test_semantic_quality_metrics.py rename to tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py index e5adb198b..4050b9562 100644 --- a/tests/evaluation/metrics/test_semantic_quality_metrics.py +++ b/tests/experimental/evaluation/metrics/test_semantic_quality_metrics.py @@ -1,8 +1,8 @@ from unittest.mock import patch, MagicMock -from crewai.evaluation.base_evaluator import EvaluationScore -from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from crewai.experimental.evaluation.base_evaluator import EvaluationScore +from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest from crewai.utilities.llm_utils import LLM class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest): diff --git a/tests/evaluation/metrics/test_tools_metrics.py b/tests/experimental/evaluation/metrics/test_tools_metrics.py similarity index 97% rename from tests/evaluation/metrics/test_tools_metrics.py rename to tests/experimental/evaluation/metrics/test_tools_metrics.py index ad6e50fdd..dab3eeee1 100644 --- a/tests/evaluation/metrics/test_tools_metrics.py +++ b/tests/experimental/evaluation/metrics/test_tools_metrics.py @@ -1,12 +1,12 @@ from unittest.mock import patch, MagicMock -from crewai.evaluation.metrics.tools_metrics import ( +from crewai.experimental.evaluation.metrics.tools_metrics import ( ToolSelectionEvaluator, ParameterExtractionEvaluator, ToolInvocationEvaluator ) from crewai.utilities.llm_utils import LLM -from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest +from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest class TestToolSelectionEvaluator(BaseEvaluationMetricsTest): def test_no_tools_available(self, mock_task, mock_agent): diff --git a/tests/evaluation/test_agent_evaluator.py b/tests/experimental/evaluation/test_agent_evaluator.py similarity index 91% rename from tests/evaluation/test_agent_evaluator.py rename to tests/experimental/evaluation/test_agent_evaluator.py index f7935e85f..88f493908 100644 --- a/tests/evaluation/test_agent_evaluator.py +++ b/tests/experimental/evaluation/test_agent_evaluator.py @@ -3,9 +3,9 @@ import pytest from crewai.agent import Agent from crewai.task import Task from crewai.crew import Crew -from crewai.evaluation.agent_evaluator import AgentEvaluator -from crewai.evaluation.base_evaluator import 
AgentEvaluationResult -from crewai.evaluation import ( +from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator +from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult +from crewai.experimental.evaluation import ( GoalAlignmentEvaluator, SemanticQualityEvaluator, ToolSelectionEvaluator, @@ -14,7 +14,7 @@ from crewai.evaluation import ( ReasoningEfficiencyEvaluator ) -from crewai.evaluation import create_default_evaluator +from crewai.experimental.evaluation import create_default_evaluator class TestAgentEvaluator: @pytest.fixture def mock_crew(self): diff --git a/tests/experimental/evaluation/test_experiment_result.py b/tests/experimental/evaluation/test_experiment_result.py new file mode 100644 index 000000000..5ba390b48 --- /dev/null +++ b/tests/experimental/evaluation/test_experiment_result.py @@ -0,0 +1,111 @@ +import pytest +from unittest.mock import MagicMock, patch + +from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults + + +class TestExperimentResult: + @pytest.fixture + def mock_results(self): + return [ + ExperimentResult( + identifier="test-1", + inputs={"query": "What is the capital of France?"}, + score=10, + expected_score=7, + passed=True + ), + ExperimentResult( + identifier="test-2", + inputs={"query": "Who wrote Hamlet?"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-3", + inputs={"query": "Any query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=False, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-4", + inputs={"query": "Another query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ), + ExperimentResult( + identifier="test-6", + inputs={"query": "Yet another query"}, + score={"relevance": 9, "factuality": 8}, + expected_score={"relevance": 7, "factuality": 7}, + passed=True, + agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}} + ) + ] + + @patch('os.path.exists', return_value=True) + @patch('os.path.getsize', return_value=1) + @patch('json.load') + @patch('builtins.open', new_callable=MagicMock) + def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results): + baseline_data = { + "timestamp": "2023-01-01T00:00:00+00:00", + "results": [ + { + "identifier": "test-1", + "inputs": {"query": "What is the capital of France?"}, + "score": 7, + "expected_score": 7, + "passed": False + }, + { + "identifier": "test-2", + "inputs": {"query": "Who wrote Hamlet?"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-3", + "inputs": {"query": "Any query"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-4", + "inputs": {"query": "Another query"}, + "score": {"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + }, + { + "identifier": "test-5", + "inputs": {"query": "Another query"}, + "score": 
{"relevance": 8, "factuality": 7}, + "expected_score": {"relevance": 7, "factuality": 7}, + "passed": True + } + ] + } + + mock_json_load.return_value = baseline_data + + results = ExperimentResults(results=mock_results) + results.display = MagicMock() + + comparison = results.compare_with_baseline(baseline_filepath="baseline.json") + + assert "baseline_timestamp" in comparison + assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00" + assert comparison["improved"] == ["test-1"] + assert comparison["regressed"] == ["test-3"] + assert comparison["unchanged"] == ["test-2", "test-4"] + assert comparison["new_tests"] == ["test-6"] + assert comparison["missing_tests"] == ["test-5"] \ No newline at end of file diff --git a/tests/experimental/evaluation/test_experiment_runner.py b/tests/experimental/evaluation/test_experiment_runner.py new file mode 100644 index 000000000..58382fa65 --- /dev/null +++ b/tests/experimental/evaluation/test_experiment_runner.py @@ -0,0 +1,197 @@ +import pytest +from unittest.mock import MagicMock, patch + +from crewai.crew import Crew +from crewai.experimental.evaluation.experiment.runner import ExperimentRunner +from crewai.experimental.evaluation.experiment.result import ExperimentResults +from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult +from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore + + +class TestExperimentRunner: + @pytest.fixture + def mock_crew(self): + return MagicMock(llm=Crew) + + @pytest.fixture + def mock_evaluator_results(self): + agent_evaluation = AgentAggregatedEvaluationResult( + agent_id="Test Agent", + agent_role="Test Agent Role", + metrics={ + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=9, + feedback="Test feedback for goal alignment", + raw_response="Test raw response for goal alignment" + ), + MetricCategory.REASONING_EFFICIENCY: EvaluationScore( + score=None, + feedback="Reasoning efficiency not applicable", + raw_response="Reasoning efficiency not applicable" + ), + MetricCategory.PARAMETER_EXTRACTION: EvaluationScore( + score=7, + feedback="Test parameter extraction explanation", + raw_response="Test raw output" + ), + MetricCategory.TOOL_SELECTION: EvaluationScore( + score=8, + feedback="Test tool selection explanation", + raw_response="Test raw output" + ) + } + ) + + return {"Test Agent": agent_evaluation} + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-1", + "inputs": {"query": "Test query 1"}, + "expected_score": 8 + }, + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7} + }, + { + "inputs": {"query": "Test query 3"}, + "expected_score": {"tool_selection": 9} + } + ] + + mock_evaluator = MagicMock() + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + + assert isinstance(results, ExperimentResults) + result_1, result_2, result_3 = results.results + assert len(results.results) == 3 + + assert result_1.identifier == "test-case-1" + assert result_1.inputs == {"query": "Test query 1"} + assert result_1.expected_score == 8 + assert result_1.passed is True + + assert 
result_2.identifier == "test-case-2" + assert result_2.inputs == {"query": "Test query 2"} + assert isinstance(result_2.expected_score, dict) + assert "goal_alignment" in result_2.expected_score + assert result_2.passed is True + + assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5" + assert result_3.inputs == {"query": "Test query 3"} + assert isinstance(result_3.expected_score, dict) + assert "tool_selection" in result_3.expected_score + assert result_3.passed is False + + assert mock_crew.kickoff.call_count == 3 + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"}) + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"}) + mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"}) + + assert mock_evaluator.reset_iterations_results.call_count == 3 + assert mock_evaluator.get_agent_evaluation.call_count == 3 + + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7, "unknown_metric": 8} + } + ] + + mock_evaluator = MagicMock() + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "goal_alignment" in result.expected_score.keys() + assert "unknown_metric" in result.expected_score.keys() + assert result.passed is True + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"goal_alignment": 7} + } + ] + + mock_evaluator = MagicMock() + mock_create_evaluator["Test Agent"].metrics = { + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=9, + feedback="Test feedback for goal alignment", + raw_response="Test raw response for goal alignment" + ) + } + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "goal_alignment" in result.expected_score.keys() + assert result.passed is True + + @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator') + def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results): + dataset = [ + { + "identifier": "test-case-2", + "inputs": {"query": "Test query 2"}, + "expected_score": {"unknown_metric": 7} + } + ] + + mock_evaluator = MagicMock() + mock_create_evaluator["Test Agent"].metrics = { + MetricCategory.GOAL_ALIGNMENT: EvaluationScore( + score=5, + feedback="Test feedback for goal 
alignment", + raw_response="Test raw response for goal alignment" + ) + } + mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results + mock_evaluator.reset_iterations_results = MagicMock() + mock_create_evaluator.return_value = mock_evaluator + + runner = ExperimentRunner(dataset=dataset) + + results = runner.run(crew=mock_crew) + result, = results.results + + assert result.identifier == "test-case-2" + assert result.inputs == {"query": "Test query 2"} + assert isinstance(result.expected_score, dict) + assert "unknown_metric" in result.expected_score.keys() + assert result.passed is False \ No newline at end of file