chore: resolve all ruff and mypy issues in experimental module

resolve linting, typing, and import issues; update Okta test
Author: Greyson LaLonde
Date: 2025-09-22 12:56:28 -04:00
Committed by: GitHub
Parent: aa8dc9d77f
Commit: 0e370593f1
17 changed files with 595 additions and 402 deletions
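
Two patterns recur across the hunks below: bare asserts are replaced with explicit raise ValueError(...) checks, and typing.Dict/List/Optional annotations become builtin generics with X | None unions (alongside import and __all__ sorting). A minimal sketch of both patterns, with illustrative names only and not code from this repository:

from typing import Any


class _Evaluator:
    """Illustrative stand-in; not part of the crewai codebase."""

    def score(self, case: dict[str, Any]) -> float:
        return float(len(case))


class _Runner:
    def __init__(self) -> None:
        # Before: `self.evaluator: Optional[_Evaluator] = None`
        self.evaluator: _Evaluator | None = None

    def run(self, cases: list[dict[str, Any]]) -> list[float]:
        # Before: `assert self.evaluator is not None`. Asserts vanish under
        # `python -O` and are commonly flagged by linters; an explicit raise
        # keeps the same mypy narrowing without either drawback.
        if self.evaluator is None:
            raise ValueError("Evaluator must be initialized")
        return [self.evaluator.score(case) for case in cases]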

View File

@@ -1,40 +1,39 @@
from crewai.experimental.evaluation import (
AgentEvaluationResult,
AgentEvaluator,
BaseEvaluator,
EvaluationScore,
MetricCategory,
AgentEvaluationResult,
SemanticQualityEvaluator,
GoalAlignmentEvaluator,
ReasoningEfficiencyEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
EvaluationTraceCallback,
create_evaluation_callbacks,
AgentEvaluator,
create_default_evaluator,
ExperimentRunner,
ExperimentResults,
ExperimentResult,
ExperimentResults,
ExperimentRunner,
GoalAlignmentEvaluator,
MetricCategory,
ParameterExtractionEvaluator,
ReasoningEfficiencyEvaluator,
SemanticQualityEvaluator,
ToolInvocationEvaluator,
ToolSelectionEvaluator,
create_default_evaluator,
create_evaluation_callbacks,
)
__all__ = [
"AgentEvaluationResult",
"AgentEvaluator",
"BaseEvaluator",
"EvaluationScore",
"MetricCategory",
"AgentEvaluationResult",
"SemanticQualityEvaluator",
"GoalAlignmentEvaluator",
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"EvaluationTraceCallback",
"create_evaluation_callbacks",
"AgentEvaluator",
"create_default_evaluator",
"ExperimentRunner",
"ExperimentResult",
"ExperimentResults",
"ExperimentResult"
]
"ExperimentRunner",
"GoalAlignmentEvaluator",
"MetricCategory",
"ParameterExtractionEvaluator",
"ReasoningEfficiencyEvaluator",
"SemanticQualityEvaluator",
"ToolInvocationEvaluator",
"ToolSelectionEvaluator",
"create_default_evaluator",
"create_evaluation_callbacks",
]
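
A minimal usage sketch of the re-exported API, assuming the signatures shown later in this commit (create_default_evaluator(agents=...), AgentEvaluator(agents, evaluators=None)); the Agent constructor fields are an assumption about the public crewai API, not something shown in this diff:

from crewai import Agent
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator

# role/goal/backstory are assumed public Agent fields; adjust to your setup.
analyst = Agent(
    role="Research Analyst",
    goal="Summarize a topic accurately",
    backstory="Used only for this example.",
)

# Wires up the six metric evaluators re-exported above.
evaluator = create_default_evaluator(agents=[analyst])

# Or construct directly; `evaluators` is an optional Sequence[BaseEvaluator].
manual = AgentEvaluator(agents=[analyst], evaluators=None)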

View File

@@ -1,51 +1,47 @@
from crewai.experimental.evaluation.agent_evaluator import (
AgentEvaluator,
create_default_evaluator,
)
from crewai.experimental.evaluation.base_evaluator import (
AgentEvaluationResult,
BaseEvaluator,
EvaluationScore,
MetricCategory,
AgentEvaluationResult
)
from crewai.experimental.evaluation.metrics import (
SemanticQualityEvaluator,
GoalAlignmentEvaluator,
ReasoningEfficiencyEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator
)
from crewai.experimental.evaluation.evaluation_listener import (
EvaluationTraceCallback,
create_evaluation_callbacks
create_evaluation_callbacks,
)
from crewai.experimental.evaluation.agent_evaluator import (
AgentEvaluator,
create_default_evaluator
)
from crewai.experimental.evaluation.experiment import (
ExperimentRunner,
ExperimentResult,
ExperimentResults,
ExperimentResult
ExperimentRunner,
)
from crewai.experimental.evaluation.metrics import (
GoalAlignmentEvaluator,
ParameterExtractionEvaluator,
ReasoningEfficiencyEvaluator,
SemanticQualityEvaluator,
ToolInvocationEvaluator,
ToolSelectionEvaluator,
)
__all__ = [
"AgentEvaluationResult",
"AgentEvaluator",
"BaseEvaluator",
"EvaluationScore",
"MetricCategory",
"AgentEvaluationResult",
"SemanticQualityEvaluator",
"GoalAlignmentEvaluator",
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"EvaluationTraceCallback",
"create_evaluation_callbacks",
"AgentEvaluator",
"create_default_evaluator",
"ExperimentRunner",
"ExperimentResult",
"ExperimentResults",
"ExperimentResult"
"ExperimentRunner",
"GoalAlignmentEvaluator",
"MetricCategory",
"ParameterExtractionEvaluator",
"ReasoningEfficiencyEvaluator",
"SemanticQualityEvaluator",
"ToolInvocationEvaluator",
"ToolSelectionEvaluator",
"create_default_evaluator",
"create_evaluation_callbacks",
]

View File

@@ -1,34 +1,36 @@
import threading
from typing import Any, Optional
from collections.abc import Sequence
from typing import Any
from crewai.experimental.evaluation.base_evaluator import (
AgentEvaluationResult,
AggregationStrategy,
)
from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.events.event_bus import crewai_event_bus
from crewai.events.types.agent_events import (
AgentEvaluationStartedEvent,
AgentEvaluationCompletedEvent,
AgentEvaluationFailedEvent,
AgentEvaluationStartedEvent,
LiteAgentExecutionCompletedEvent,
)
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.events.event_bus import crewai_event_bus
from crewai.events.utils.console_formatter import ConsoleFormatter
from crewai.events.types.task_events import TaskCompletedEvent
from crewai.events.types.agent_events import LiteAgentExecutionCompletedEvent
from crewai.events.utils.console_formatter import ConsoleFormatter
from crewai.experimental.evaluation.base_evaluator import (
AgentAggregatedEvaluationResult,
AgentEvaluationResult,
AggregationStrategy,
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
from crewai.experimental.evaluation.evaluation_listener import (
create_evaluation_callbacks,
)
from crewai.task import Task
class ExecutionState:
current_agent_id: Optional[str] = None
current_task_id: Optional[str] = None
current_agent_id: str | None = None
current_task_id: str | None = None
def __init__(self):
self.traces = {}
@@ -40,10 +42,10 @@ class ExecutionState:
class AgentEvaluator:
def __init__(
self,
agents: list[Agent],
agents: list[Agent] | list[BaseAgent],
evaluators: Sequence[BaseEvaluator] | None = None,
):
self.agents: list[Agent] = agents
self.agents: list[Agent] | list[BaseAgent] = agents
self.evaluators: Sequence[BaseEvaluator] | None = evaluators
self.callback = create_evaluation_callbacks()
@@ -75,7 +77,8 @@ class AgentEvaluator:
)
def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
assert event.task is not None
if event.task is None:
raise ValueError("TaskCompletedEvent must have a task")
agent = event.task.agent
if (
agent
@@ -92,9 +95,8 @@ class AgentEvaluator:
state.current_agent_id = str(agent.id)
state.current_task_id = str(event.task.id)
assert (
state.current_agent_id is not None and state.current_task_id is not None
)
if state.current_agent_id is None or state.current_task_id is None:
raise ValueError("Agent ID and Task ID must not be None")
trace = self.callback.get_trace(
state.current_agent_id, state.current_task_id
)
@@ -146,9 +148,8 @@ class AgentEvaluator:
if not target_agent:
return
assert (
state.current_agent_id is not None and state.current_task_id is not None
)
if state.current_agent_id is None or state.current_task_id is None:
raise ValueError("Agent ID and Task ID must not be None")
trace = self.callback.get_trace(
state.current_agent_id, state.current_task_id
)
@@ -244,7 +245,7 @@ class AgentEvaluator:
def evaluate(
self,
agent: Agent,
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: Any,
state: ExecutionState,
@@ -255,7 +256,8 @@ class AgentEvaluator:
task_id=state.current_task_id or (str(task.id) if task else "unknown_task"),
)
assert self.evaluators is not None
if self.evaluators is None:
raise ValueError("Evaluators must be initialized")
task_id = str(task.id) if task else None
for evaluator in self.evaluators:
try:
@@ -276,7 +278,7 @@ class AgentEvaluator:
metric_category=evaluator.metric_category,
score=score,
)
except Exception as e:
except Exception as e: # noqa: PERF203
self.emit_evaluation_failed_event(
agent_role=agent.role,
agent_id=str(agent.id),
@@ -284,7 +286,7 @@ class AgentEvaluator:
error=str(e),
)
self.console_formatter.print(
f"Error in {evaluator.metric_category.value} evaluator: {str(e)}"
f"Error in {evaluator.metric_category.value} evaluator: {e!s}"
)
return result
@@ -337,14 +339,14 @@ class AgentEvaluator:
)
def create_default_evaluator(agents: list[Agent], llm: None = None):
def create_default_evaluator(agents: list[Agent] | list[BaseAgent], llm: None = None):
from crewai.experimental.evaluation import (
GoalAlignmentEvaluator,
SemanticQualityEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
ReasoningEfficiencyEvaluator,
SemanticQualityEvaluator,
ToolInvocationEvaluator,
ToolSelectionEvaluator,
)
evaluators = [

View File

@@ -1,15 +1,17 @@
import abc
import enum
from enum import Enum
from typing import Any, Dict, List, Optional
from typing import Any
from pydantic import BaseModel, Field
from crewai.agent import Agent
from crewai.task import Task
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.llm import BaseLLM
from crewai.task import Task
from crewai.utilities.llm_utils import create_llm
class MetricCategory(enum.Enum):
GOAL_ALIGNMENT = "goal_alignment"
SEMANTIC_QUALITY = "semantic_quality"
@@ -19,7 +21,7 @@ class MetricCategory(enum.Enum):
TOOL_INVOCATION = "tool_invocation"
def title(self):
return self.value.replace('_', ' ').title()
return self.value.replace("_", " ").title()
class EvaluationScore(BaseModel):
@@ -27,15 +29,13 @@ class EvaluationScore(BaseModel):
default=5.0,
description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
ge=0.0,
le=10.0
le=10.0,
)
feedback: str = Field(
default="",
description="Detailed feedback explaining the evaluation score"
default="", description="Detailed feedback explaining the evaluation score"
)
raw_response: str | None = Field(
default=None,
description="Raw response from the evaluator (e.g., LLM)"
default=None, description="Raw response from the evaluator (e.g., LLM)"
)
def __str__(self) -> str:
@@ -56,8 +56,8 @@ class BaseEvaluator(abc.ABC):
@abc.abstractmethod
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: Any,
task: Task | None = None,
) -> EvaluationScore:
@@ -67,9 +67,8 @@ class BaseEvaluator(abc.ABC):
class AgentEvaluationResult(BaseModel):
agent_id: str = Field(description="ID of the evaluated agent")
task_id: str = Field(description="ID of the task that was executed")
metrics: Dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict,
description="Evaluation scores for each metric category"
metrics: dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict, description="Evaluation scores for each metric category"
)
@@ -81,33 +80,23 @@ class AggregationStrategy(Enum):
class AgentAggregatedEvaluationResult(BaseModel):
agent_id: str = Field(
default="",
description="ID of the agent"
)
agent_role: str = Field(
default="",
description="Role of the agent"
)
agent_id: str = Field(default="", description="ID of the agent")
agent_role: str = Field(default="", description="Role of the agent")
task_count: int = Field(
default=0,
description="Number of tasks included in this aggregation"
default=0, description="Number of tasks included in this aggregation"
)
aggregation_strategy: AggregationStrategy = Field(
default=AggregationStrategy.SIMPLE_AVERAGE,
description="Strategy used for aggregation"
description="Strategy used for aggregation",
)
metrics: Dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict,
description="Aggregated metrics across all tasks"
metrics: dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict, description="Aggregated metrics across all tasks"
)
task_results: List[str] = Field(
default_factory=list,
description="IDs of tasks included in this aggregation"
task_results: list[str] = Field(
default_factory=list, description="IDs of tasks included in this aggregation"
)
overall_score: Optional[float] = Field(
default=None,
description="Overall score for this agent"
overall_score: float | None = Field(
default=None, description="Overall score for this agent"
)
def __str__(self) -> str:
@@ -119,7 +108,7 @@ class AgentAggregatedEvaluationResult(BaseModel):
result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
if score.feedback:
detailed_feedback = "\n ".join(score.feedback.split('\n'))
detailed_feedback = "\n ".join(score.feedback.split("\n"))
result += f" {detailed_feedback}\n"
return result
return result
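
A short sketch of the models above, using only fields visible in this hunk (EvaluationScore.score/feedback/raw_response, AgentEvaluationResult.agent_id/task_id/metrics, and MetricCategory.title()):

from crewai.experimental.evaluation import (
    AgentEvaluationResult,
    EvaluationScore,
    MetricCategory,
)

score = EvaluationScore(
    score=7.5,  # constrained to the 0-10 range (ge=0.0, le=10.0)
    feedback="Covers the task goal but omits one requested deliverable.",
    raw_response='{"score": 7.5, "feedback": "..."}',
)

result = AgentEvaluationResult(
    agent_id="agent-1",
    task_id="task-1",
    metrics={MetricCategory.GOAL_ALIGNMENT: score},
)

print(MetricCategory.GOAL_ALIGNMENT.title())  # "Goal Alignment"
print(str(score))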

View File

@@ -1,16 +1,18 @@
from collections import defaultdict
from typing import Dict, Any, List
from rich.table import Table
from rich.box import HEAVY_EDGE, ROUNDED
from collections.abc import Sequence
from typing import Any
from rich.box import HEAVY_EDGE, ROUNDED
from rich.table import Table
from crewai.events.utils.console_formatter import ConsoleFormatter
from crewai.experimental.evaluation.base_evaluator import (
AgentAggregatedEvaluationResult,
AggregationStrategy,
AgentEvaluationResult,
AggregationStrategy,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation import EvaluationScore
from crewai.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.llm_utils import create_llm
@@ -19,7 +21,7 @@ class EvaluationDisplayFormatter:
self.console_formatter = ConsoleFormatter()
def display_evaluation_with_feedback(
self, iterations_results: Dict[int, Dict[str, List[Any]]]
self, iterations_results: dict[int, dict[str, list[Any]]]
):
if not iterations_results:
self.console_formatter.print(
@@ -99,7 +101,7 @@ class EvaluationDisplayFormatter:
def display_summary_results(
self,
iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]],
iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]],
):
if not iterations_results:
self.console_formatter.print(
@@ -280,7 +282,7 @@ class EvaluationDisplayFormatter:
feedback_summary = feedbacks[0]
aggregated_metrics[category] = EvaluationScore(
score=avg_score, feedback=feedback_summary
score=avg_score, feedback=feedback_summary or ""
)
overall_score = None
@@ -304,25 +306,25 @@ class EvaluationDisplayFormatter:
self,
agent_role: str,
metric: str,
feedbacks: List[str],
scores: List[float | None],
feedbacks: list[str],
scores: list[float | None],
strategy: AggregationStrategy,
) -> str:
if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
return "\n\n".join(
[f"Feedback {i+1}: {fb}" for i, fb in enumerate(feedbacks)]
[f"Feedback {i + 1}: {fb}" for i, fb in enumerate(feedbacks)]
)
try:
llm = create_llm()
formatted_feedbacks = []
for i, (feedback, score) in enumerate(zip(feedbacks, scores)):
for i, (feedback, score) in enumerate(zip(feedbacks, scores, strict=False)):
if len(feedback) > 500:
feedback = feedback[:500] + "..."
score_text = f"{score:.1f}" if score is not None else "N/A"
formatted_feedbacks.append(
f"Feedback #{i+1} (Score: {score_text}):\n{feedback}"
f"Feedback #{i + 1} (Score: {score_text}):\n{feedback}"
)
all_feedbacks = "\n\n" + "\n\n---\n\n".join(formatted_feedbacks)
@@ -365,10 +367,9 @@ class EvaluationDisplayFormatter:
""",
},
]
assert llm is not None
response = llm.call(prompt)
return response
if llm is None:
raise ValueError("LLM must be initialized")
return llm.call(prompt)
except Exception:
return "Synthesized from multiple tasks: " + "\n\n".join(

View File

@@ -1,26 +1,25 @@
from datetime import datetime
from typing import Any, Dict, Optional
from collections.abc import Sequence
from datetime import datetime
from typing import Any
from crewai.agent import Agent
from crewai.task import Task
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.events.base_event_listener import BaseEventListener
from crewai.events.event_bus import CrewAIEventsBus
from crewai.events.types.agent_events import (
AgentExecutionStartedEvent,
AgentExecutionCompletedEvent,
LiteAgentExecutionStartedEvent,
AgentExecutionStartedEvent,
LiteAgentExecutionCompletedEvent,
LiteAgentExecutionStartedEvent,
)
from crewai.events.types.llm_events import LLMCallCompletedEvent, LLMCallStartedEvent
from crewai.events.types.tool_usage_events import (
ToolUsageFinishedEvent,
ToolUsageErrorEvent,
ToolExecutionErrorEvent,
ToolSelectionErrorEvent,
ToolUsageErrorEvent,
ToolUsageFinishedEvent,
ToolValidateInputErrorEvent,
)
from crewai.events.types.llm_events import LLMCallStartedEvent, LLMCallCompletedEvent
from crewai.task import Task
class EvaluationTraceCallback(BaseEventListener):
@@ -136,7 +135,7 @@ class EvaluationTraceCallback(BaseEventListener):
def _init_trace(self, trace_key: str, **kwargs: Any):
self.traces[trace_key] = kwargs
def on_agent_start(self, agent: Agent, task: Task):
def on_agent_start(self, agent: BaseAgent, task: Task):
self.current_agent_id = agent.id
self.current_task_id = task.id
@@ -151,7 +150,7 @@ class EvaluationTraceCallback(BaseEventListener):
final_output=None,
)
def on_agent_finish(self, agent: Agent, task: Task, output: Any):
def on_agent_finish(self, agent: BaseAgent, task: Task, output: Any):
trace_key = f"{agent.id}_{task.id}"
if trace_key in self.traces:
self.traces[trace_key]["final_output"] = output
@@ -253,7 +252,7 @@ class EvaluationTraceCallback(BaseEventListener):
if hasattr(self, "current_llm_call"):
self.current_llm_call = {}
def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]:
def get_trace(self, agent_id: str, task_id: str) -> dict[str, Any] | None:
trace_key = f"{agent_id}_{task_id}"
return self.traces.get(trace_key)

View File

@@ -1,8 +1,7 @@
from crewai.experimental.evaluation.experiment.result import (
ExperimentResult,
ExperimentResults,
)
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
__all__ = [
"ExperimentRunner",
"ExperimentResults",
"ExperimentResult"
]
__all__ = ["ExperimentResult", "ExperimentResults", "ExperimentRunner"]

View File

@@ -2,45 +2,60 @@ import json
import os
from datetime import datetime, timezone
from typing import Any
from pydantic import BaseModel
class ExperimentResult(BaseModel):
identifier: str
inputs: dict[str, Any]
score: int | dict[str, int | float]
expected_score: int | dict[str, int | float]
score: float | dict[str, float]
expected_score: float | dict[str, float]
passed: bool
agent_evaluations: dict[str, Any] | None = None
class ExperimentResults:
def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
def __init__(
self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None
):
self.results = results
self.metadata = metadata or {}
self.timestamp = datetime.now(timezone.utc)
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
from crewai.experimental.evaluation.experiment.result_display import (
ExperimentResultsDisplay,
)
self.display = ExperimentResultsDisplay()
def to_json(self, filepath: str | None = None) -> dict[str, Any]:
data = {
"timestamp": self.timestamp.isoformat(),
"metadata": self.metadata,
"results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
"results": [
r.model_dump(exclude={"agent_evaluations"}) for r in self.results
],
}
if filepath:
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(data, f, indent=2)
self.display.console.print(f"[green]Results saved to {filepath}[/green]")
return data
def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
def compare_with_baseline(
self,
baseline_filepath: str,
save_current: bool = True,
print_summary: bool = False,
) -> dict[str, Any]:
baseline_runs = []
if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
try:
with open(baseline_filepath, 'r') as f:
with open(baseline_filepath, "r") as f:
baseline_data = json.load(f)
if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
@@ -48,14 +63,18 @@ class ExperimentResults:
elif isinstance(baseline_data, list):
baseline_runs = baseline_data
except (json.JSONDecodeError, FileNotFoundError) as e:
self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")
self.display.console.print(
f"[yellow]Warning: Could not load baseline file: {e!s}[/yellow]"
)
if not baseline_runs:
if save_current:
current_data = self.to_json()
with open(baseline_filepath, 'w') as f:
with open(baseline_filepath, "w") as f:
json.dump([current_data], f, indent=2)
self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
self.display.console.print(
f"[green]Saved current results as new baseline to {baseline_filepath}[/green]"
)
return {"is_baseline": True, "changes": {}}
baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
@@ -69,9 +88,11 @@ class ExperimentResults:
if save_current:
current_data = self.to_json()
baseline_runs.append(current_data)
with open(baseline_filepath, 'w') as f:
with open(baseline_filepath, "w") as f:
json.dump(baseline_runs, f, indent=2)
self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")
self.display.console.print(
f"[green]Added current results to baseline file {baseline_filepath}[/green]"
)
return comparison
@@ -118,5 +139,5 @@ class ExperimentResults:
"new_tests": new_tests,
"missing_tests": missing_tests,
"total_compared": len(improved) + len(regressed) + len(unchanged),
"baseline_timestamp": baseline_run.get("timestamp", "unknown")
"baseline_timestamp": baseline_run.get("timestamp", "unknown"),
}
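
A hedged sketch of the result types above after the score: float | dict[str, float] change; note that to_json(filepath) and compare_with_baseline(...) write files as a side effect:

from crewai.experimental.evaluation.experiment import ExperimentResult, ExperimentResults

results = ExperimentResults(
    results=[
        ExperimentResult(
            identifier="case-1",
            inputs={"topic": "pricing"},
            score={"goal_alignment": 8.0},           # float | dict[str, float]
            expected_score={"goal_alignment": 7.0},
            passed=True,
        )
    ],
    metadata={"suite": "smoke"},
)

data = results.to_json("run.json")                   # returns the dict and writes run.json
comparison = results.compare_with_baseline("baseline.json", save_current=True)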

View File

@@ -1,9 +1,12 @@
from typing import Dict, Any
from typing import Any
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.table import Table
from crewai.experimental.evaluation.experiment.result import ExperimentResults
class ExperimentResultsDisplay:
def __init__(self):
self.console = Console()
@@ -19,13 +22,19 @@ class ExperimentResultsDisplay:
table.add_row("Total Test Cases", str(total))
table.add_row("Passed", str(passed))
table.add_row("Failed", str(total - passed))
table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")
table.add_row(
"Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
)
self.console.print(table)
def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
expand=False))
def comparison_summary(self, comparison: dict[str, Any], baseline_timestamp: str):
self.console.print(
Panel(
f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
expand=False,
)
)
table = Table(title="Results Comparison")
table.add_column("Metric", style="cyan")
@@ -34,7 +43,9 @@ class ExperimentResultsDisplay:
improved = comparison.get("improved", [])
if improved:
details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
details = ", ".join(
[f"{test_identifier}" for test_identifier in improved[:3]]
)
if len(improved) > 3:
details += f" and {len(improved) - 3} more"
table.add_row("✅ Improved", str(len(improved)), details)
@@ -43,7 +54,9 @@ class ExperimentResultsDisplay:
regressed = comparison.get("regressed", [])
if regressed:
details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
details = ", ".join(
[f"{test_identifier}" for test_identifier in regressed[:3]]
)
if len(regressed) > 3:
details += f" and {len(regressed) - 3} more"
table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
@@ -58,13 +71,13 @@ class ExperimentResultsDisplay:
details = ", ".join(new_tests[:3])
if len(new_tests) > 3:
details += f" and {len(new_tests) - 3} more"
table.add_row(" New Tests", str(len(new_tests)), details)
table.add_row("+ New Tests", str(len(new_tests)), details)
missing_tests = comparison.get("missing_tests", [])
if missing_tests:
details = ", ".join(missing_tests[:3])
if len(missing_tests) > 3:
details += f" and {len(missing_tests) - 3} more"
table.add_row(" Missing Tests", str(len(missing_tests)), details)
table.add_row("- Missing Tests", str(len(missing_tests)), details)
self.console.print(table)

View File

@@ -2,11 +2,20 @@ from collections import defaultdict
from hashlib import md5
from typing import Any
from crewai import Crew, Agent
from crewai import Agent, Crew
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
from crewai.experimental.evaluation.evaluation_display import (
AgentAggregatedEvaluationResult,
)
from crewai.experimental.evaluation.experiment.result import (
ExperimentResult,
ExperimentResults,
)
from crewai.experimental.evaluation.experiment.result_display import (
ExperimentResultsDisplay,
)
class ExperimentRunner:
def __init__(self, dataset: list[dict[str, Any]]):
@@ -14,11 +23,17 @@ class ExperimentRunner:
self.evaluator: AgentEvaluator | None = None
self.display = ExperimentResultsDisplay()
def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
def run(
self,
crew: Crew | None = None,
agents: list[Agent] | list[BaseAgent] | None = None,
print_summary: bool = False,
) -> ExperimentResults:
if crew and not agents:
agents = crew.agents
assert agents is not None
if agents is None:
raise ValueError("Agents must be provided either directly or via a crew")
self.evaluator = create_default_evaluator(agents=agents)
results = []
@@ -35,21 +50,37 @@ class ExperimentRunner:
return experiment_results
def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
def _run_test_case(
self,
test_case: dict[str, Any],
agents: list[Agent] | list[BaseAgent],
crew: Crew | None = None,
) -> ExperimentResult:
inputs = test_case["inputs"]
expected_score = test_case["expected_score"]
identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
identifier = (
test_case.get("identifier")
or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
)
try:
self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
self.display.console.print(
f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]"
)
self.display.console.print("\n")
if crew:
crew.kickoff(inputs=inputs)
else:
for agent in agents:
agent.kickoff(**inputs)
if isinstance(agent, Agent):
agent.kickoff(**inputs)
else:
raise TypeError(
f"Agent {agent} is not an instance of Agent and cannot be kicked off directly"
)
assert self.evaluator is not None
if self.evaluator is None:
raise ValueError("Evaluator must be initialized")
agent_evaluations = self.evaluator.get_agent_evaluation()
actual_score = self._extract_scores(agent_evaluations)
@@ -61,35 +92,38 @@ class ExperimentRunner:
score=actual_score,
expected_score=expected_score,
passed=passed,
agent_evaluations=agent_evaluations
agent_evaluations=agent_evaluations,
)
except Exception as e:
self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
self.display.console.print(f"[red]Error running test case: {e!s}[/red]")
return ExperimentResult(
identifier=identifier,
inputs=inputs,
score=0,
score=0.0,
expected_score=expected_score,
passed=False
passed=False,
)
def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]:
def _extract_scores(
self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]
) -> float | dict[str, float]:
all_scores: dict[str, list[float]] = defaultdict(list)
for evaluation in agent_evaluations.values():
for metric_name, score in evaluation.metrics.items():
if score.score is not None:
all_scores[metric_name.value].append(score.score)
avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}
avg_scores = {m: sum(s) / len(s) for m, s in all_scores.items()}
if len(avg_scores) == 1:
return list(avg_scores.values())[0]
return next(iter(avg_scores.values()))
return avg_scores
def _assert_scores(self, expected: float | dict[str, float],
actual: float | dict[str, float]) -> bool:
def _assert_scores(
self, expected: float | dict[str, float], actual: float | dict[str, float]
) -> bool:
"""
Compare expected and actual scores, and return whether the test case passed.
@@ -122,4 +156,4 @@ class ExperimentRunner:
# All matching keys must have actual >= expected
return all(actual[key] >= expected[key] for key in matching_keys)
return False
return False
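
A sketch of the dataset shape ExperimentRunner consumes, inferred from _run_test_case above (inputs and expected_score required, identifier optional); the commented-out run(...) call and my_crew are placeholders:

from crewai.experimental.evaluation import ExperimentRunner

dataset = [
    {
        "identifier": "pricing-summary",              # optional; md5 of the case otherwise
        "inputs": {"topic": "pricing changes"},
        "expected_score": {"goal_alignment": 7.0, "semantic_quality": 7.0},
    }
]

runner = ExperimentRunner(dataset=dataset)
# run() needs a crew or a list of agents; passing neither now raises ValueError
# instead of failing an assert.
# results = runner.run(crew=my_crew, print_summary=True)   # my_crew: placeholder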

View File

@@ -13,11 +13,11 @@ def extract_json_from_llm_response(text: str) -> dict[str, Any]:
json_patterns = [
# Standard markdown code blocks with json
r'```json\s*([\s\S]*?)\s*```',
r"```json\s*([\s\S]*?)\s*```",
# Code blocks without language specifier
r'```\s*([\s\S]*?)\s*```',
r"```\s*([\s\S]*?)\s*```",
# Inline code with JSON
r'`([{\\[].*[}\]])`',
r"`([{\\[].*[}\]])`",
]
for pattern in json_patterns:
@@ -25,6 +25,6 @@ def extract_json_from_llm_response(text: str) -> dict[str, Any]:
for match in matches:
try:
return json.loads(match.strip())
except json.JSONDecodeError:
except json.JSONDecodeError: # noqa: PERF203
continue
raise ValueError("No valid JSON found in the response")
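
Usage sketch for extract_json_from_llm_response: it pulls the first parseable JSON object out of fenced or inline code blocks and, per the final line above, raises ValueError when nothing parses:

from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

response = 'Evaluation:\n```json\n{"score": 8, "feedback": "On goal."}\n```'
data = extract_json_from_llm_response(response)
print(data["score"], data["feedback"])  # 8 On goal.

try:
    extract_json_from_llm_response("no structured output here")
except ValueError as exc:
    print(exc)  # "No valid JSON found in the response"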

View File

@@ -1,26 +1,21 @@
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator
ReasoningEfficiencyEvaluator,
)
from crewai.experimental.evaluation.metrics.tools_metrics import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator
)
from crewai.experimental.evaluation.metrics.goal_metrics import (
GoalAlignmentEvaluator
)
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
SemanticQualityEvaluator
SemanticQualityEvaluator,
)
from crewai.experimental.evaluation.metrics.tools_metrics import (
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
ToolSelectionEvaluator,
)
__all__ = [
"ReasoningEfficiencyEvaluator",
"ToolSelectionEvaluator",
"ParameterExtractionEvaluator",
"ToolInvocationEvaluator",
"GoalAlignmentEvaluator",
"SemanticQualityEvaluator"
]
"ParameterExtractionEvaluator",
"ReasoningEfficiencyEvaluator",
"SemanticQualityEvaluator",
"ToolInvocationEvaluator",
"ToolSelectionEvaluator",
]

View File

@@ -1,10 +1,15 @@
from typing import Any, Dict
from typing import Any
from crewai.agent import Agent
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
class GoalAlignmentEvaluator(BaseEvaluator):
@property
@@ -13,8 +18,8 @@ class GoalAlignmentEvaluator(BaseEvaluator):
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: Any,
task: Task | None = None,
) -> EvaluationScore:
@@ -23,7 +28,9 @@ class GoalAlignmentEvaluator(BaseEvaluator):
task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
{
"role": "system",
"content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
Score the agent's goal alignment on a scale from 0-10 where:
- 0: Complete misalignment, agent did not understand or attempt the task goal
@@ -37,8 +44,11 @@ Consider:
4. Did the agent provide all requested information or deliverables?
Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
"""},
{"role": "user", "content": f"""
""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
Agent goal: {agent.goal}
{task_context}
@@ -47,23 +57,26 @@ Agent's final output:
{final_output}
Evaluate how well the agent's output aligns with the assigned task goal.
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
assert evaluation_data is not None
if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
return EvaluationScore(
score=evaluation_data.get("score", 0),
feedback=evaluation_data.get("feedback", response),
raw_response=response
raw_response=response,
)
except Exception:
return EvaluationScore(
score=None,
feedback=f"Failed to parse evaluation. Raw response: {response}",
raw_response=response
raw_response=response,
)

View File

@@ -8,18 +8,24 @@ This module provides evaluator implementations for:
import logging
import re
from enum import Enum
from typing import Any, Dict, List, Tuple
import numpy as np
from collections.abc import Sequence
from enum import Enum
from typing import Any
import numpy as np
from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
class ReasoningPatternType(Enum):
EFFICIENT = "efficient" # Good reasoning flow
LOOP = "loop" # Agent is stuck in a loop
@@ -35,8 +41,8 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: TaskOutput | str,
task: Task | None = None,
) -> EvaluationScore:
@@ -49,7 +55,7 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
if not llm_calls or len(llm_calls) < 2:
return EvaluationScore(
score=None,
feedback="Insufficient LLM calls to evaluate reasoning efficiency."
feedback="Insufficient LLM calls to evaluate reasoning efficiency.",
)
total_calls = len(llm_calls)
@@ -58,12 +64,16 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
time_intervals = []
has_reliable_timing = True
for i in range(1, len(llm_calls)):
start_time = llm_calls[i-1].get("end_time")
start_time = llm_calls[i - 1].get("end_time")
end_time = llm_calls[i].get("start_time")
if start_time and end_time and start_time != end_time:
try:
interval = end_time - start_time
time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0)
time_intervals.append(
interval.total_seconds()
if hasattr(interval, "total_seconds")
else 0
)
except Exception:
has_reliable_timing = False
else:
@@ -83,14 +93,22 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
if has_reliable_timing and time_intervals:
efficiency_metrics["avg_time_between_calls"] = np.mean(time_intervals)
loop_info = f"Detected {len(loop_details)} potential reasoning loops." if loop_detected else "No significant reasoning loops detected."
loop_info = (
f"Detected {len(loop_details)} potential reasoning loops."
if loop_detected
else "No significant reasoning loops detected."
)
call_samples = self._get_call_samples(llm_calls)
final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output
final_output = (
final_output.raw if isinstance(final_output, TaskOutput) else final_output
)
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.
{
"role": "system",
"content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.
Evaluate the agent's reasoning efficiency across these five key subcategories:
@@ -120,8 +138,11 @@ Return your evaluation as JSON with the following structure:
"feedback": string (general feedback about overall reasoning efficiency),
"optimization_suggestions": string (concrete suggestions for improving reasoning efficiency),
"detected_patterns": string (describe any inefficient reasoning patterns you observe)
}"""},
{"role": "user", "content": f"""
}""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
{task_context}
@@ -140,10 +161,12 @@ Agent's final output:
Evaluate the reasoning efficiency of this agent based on these interaction patterns.
Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
@@ -156,34 +179,46 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
conciseness = scores.get("conciseness", 5.0)
loop_avoidance = scores.get("loop_avoidance", 5.0)
overall_score = evaluation_data.get("overall_score", evaluation_data.get("score", 5.0))
overall_score = evaluation_data.get(
"overall_score", evaluation_data.get("score", 5.0)
)
feedback = evaluation_data.get("feedback", "No detailed feedback provided.")
optimization_suggestions = evaluation_data.get("optimization_suggestions", "No specific suggestions provided.")
optimization_suggestions = evaluation_data.get(
"optimization_suggestions", "No specific suggestions provided."
)
detailed_feedback = "Reasoning Efficiency Evaluation:\n"
detailed_feedback += f"• Focus: {focus}/10 - Staying on topic without tangents\n"
detailed_feedback += f"• Progression: {progression}/10 - Building on previous thinking\n"
detailed_feedback += (
f"• Focus: {focus}/10 - Staying on topic without tangents\n"
)
detailed_feedback += (
f"• Progression: {progression}/10 - Building on previous thinking\n"
)
detailed_feedback += f"• Decision Quality: {decision_quality}/10 - Making appropriate decisions\n"
detailed_feedback += f"• Conciseness: {conciseness}/10 - Communicating efficiently\n"
detailed_feedback += (
f"• Conciseness: {conciseness}/10 - Communicating efficiently\n"
)
detailed_feedback += f"• Loop Avoidance: {loop_avoidance}/10 - Avoiding repetitive patterns\n\n"
detailed_feedback += f"Feedback:\n{feedback}\n\n"
detailed_feedback += f"Optimization Suggestions:\n{optimization_suggestions}"
detailed_feedback += (
f"Optimization Suggestions:\n{optimization_suggestions}"
)
return EvaluationScore(
score=float(overall_score),
feedback=detailed_feedback,
raw_response=response
raw_response=response,
)
except Exception as e:
logging.warning(f"Failed to parse reasoning efficiency evaluation: {e}")
return EvaluationScore(
score=None,
feedback=f"Failed to parse reasoning efficiency evaluation. Raw response: {response[:200]}...",
raw_response=response
raw_response=response,
)
def _detect_loops(self, llm_calls: List[Dict]) -> Tuple[bool, List[Dict]]:
def _detect_loops(self, llm_calls: list[dict]) -> tuple[bool, list[dict]]:
loop_details = []
messages = []
@@ -193,9 +228,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
messages.append(content)
elif isinstance(content, list) and len(content) > 0:
# Handle message list format
for msg in content:
if isinstance(msg, dict) and "content" in msg:
messages.append(msg["content"])
messages.extend(
msg["content"]
for msg in content
if isinstance(msg, dict) and "content" in msg
)
# Simple n-gram based similarity detection
# For a more robust implementation, consider using embedding-based similarity
@@ -205,18 +242,20 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
# A more sophisticated approach would use semantic similarity
similarity = self._calculate_text_similarity(messages[i], messages[j])
if similarity > 0.7: # Arbitrary threshold
loop_details.append({
"first_occurrence": i,
"second_occurrence": j,
"similarity": similarity,
"snippet": messages[i][:100] + "..."
})
loop_details.append(
{
"first_occurrence": i,
"second_occurrence": j,
"similarity": similarity,
"snippet": messages[i][:100] + "...",
}
)
return len(loop_details) > 0, loop_details
def _calculate_text_similarity(self, text1: str, text2: str) -> float:
text1 = re.sub(r'\s+', ' ', text1.lower()).strip()
text2 = re.sub(r'\s+', ' ', text2.lower()).strip()
text1 = re.sub(r"\s+", " ", text1.lower()).strip()
text2 = re.sub(r"\s+", " ", text2.lower()).strip()
# Simple Jaccard similarity on word sets
words1 = set(text1.split())
@@ -227,7 +266,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
return intersection / union if union > 0 else 0.0
def _analyze_reasoning_patterns(self, llm_calls: List[Dict]) -> Dict[str, Any]:
def _analyze_reasoning_patterns(self, llm_calls: list[dict]) -> dict[str, Any]:
call_lengths = []
response_times = []
@@ -248,8 +287,8 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if start_time and end_time:
try:
response_times.append(end_time - start_time)
except Exception:
pass
except Exception as e:
logging.debug(f"Failed to calculate response time: {e}")
avg_length = np.mean(call_lengths) if call_lengths else 0
std_length = np.std(call_lengths) if call_lengths else 0
@@ -267,7 +306,9 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
details = "Agent is consistently verbose across interactions."
elif len(llm_calls) > 10 and length_trend > 0.5:
primary_pattern = ReasoningPatternType.INDECISIVE
details = "Agent shows signs of indecisiveness with increasing message lengths."
details = (
"Agent shows signs of indecisiveness with increasing message lengths."
)
elif std_length / avg_length > 0.8:
primary_pattern = ReasoningPatternType.SCATTERED
details = "Agent shows inconsistent reasoning flow with highly variable responses."
@@ -279,8 +320,8 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
"avg_length": avg_length,
"std_length": std_length,
"length_trend": length_trend,
"loop_score": loop_score
}
"loop_score": loop_score,
},
}
def _calculate_trend(self, values: Sequence[float | int]) -> float:
@@ -303,7 +344,9 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
except Exception:
return 0.0
def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float:
def _calculate_loop_likelihood(
self, call_lengths: Sequence[float], response_times: Sequence[float]
) -> float:
if not call_lengths or len(call_lengths) < 3:
return 0.0
@@ -312,7 +355,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if len(call_lengths) >= 4:
repeated_lengths = 0
for i in range(len(call_lengths) - 2):
ratio = call_lengths[i] / call_lengths[i + 2] if call_lengths[i + 2] > 0 else 0
ratio = (
call_lengths[i] / call_lengths[i + 2]
if call_lengths[i + 2] > 0
else 0
)
if 0.85 <= ratio <= 1.15:
repeated_lengths += 1
@@ -324,21 +371,27 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
std_time = np.std(response_times)
mean_time = np.mean(response_times)
if mean_time > 0:
time_consistency = 1.0 - (std_time / mean_time)
indicators.append(max(0, time_consistency - 0.3) * 1.5)
except Exception:
pass
time_consistency = 1.0 - (float(std_time) / float(mean_time))
indicators.append(max(0.0, float(time_consistency - 0.3)) * 1.5)
except Exception as e:
logging.debug(f"Time consistency calculation failed: {e}")
return np.mean(indicators) if indicators else 0.0
return float(np.mean(indicators)) if indicators else 0.0
def _get_call_samples(self, llm_calls: List[Dict]) -> str:
def _get_call_samples(self, llm_calls: list[dict]) -> str:
samples = []
if len(llm_calls) <= 6:
sample_indices = list(range(len(llm_calls)))
else:
sample_indices = [0, 1, len(llm_calls) // 2 - 1, len(llm_calls) // 2,
len(llm_calls) - 2, len(llm_calls) - 1]
sample_indices = [
0,
1,
len(llm_calls) // 2 - 1,
len(llm_calls) // 2,
len(llm_calls) - 2,
len(llm_calls) - 1,
]
for idx in sample_indices:
call = llm_calls[idx]
@@ -347,10 +400,11 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
if isinstance(content, str):
sample = content
elif isinstance(content, list) and len(content) > 0:
sample_parts = []
for msg in content:
if isinstance(msg, dict) and "content" in msg:
sample_parts.append(msg["content"])
sample_parts = [
msg["content"]
for msg in content
if isinstance(msg, dict) and "content" in msg
]
sample = "\n".join(sample_parts)
else:
sample = str(content)
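
The loop detection above hinges on a Jaccard word-set similarity with a 0.7 threshold; a standalone restatement of _calculate_text_similarity for reference:

import re


def jaccard_similarity(text1: str, text2: str) -> float:
    # Same steps as _calculate_text_similarity: normalize whitespace and case,
    # then compare word sets (intersection over union).
    text1 = re.sub(r"\s+", " ", text1.lower()).strip()
    text2 = re.sub(r"\s+", " ", text2.lower()).strip()
    words1, words2 = set(text1.split()), set(text2.split())
    union = len(words1 | words2)
    return len(words1 & words2) / union if union > 0 else 0.0


# Two near-identical agent messages score ~0.83, above the 0.7 loop threshold.
print(jaccard_similarity("I will search the docs", "I will search the docs again"))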

View File

@@ -1,10 +1,15 @@
from typing import Any, Dict
from typing import Any
from crewai.agent import Agent
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
class SemanticQualityEvaluator(BaseEvaluator):
@property
@@ -13,8 +18,8 @@ class SemanticQualityEvaluator(BaseEvaluator):
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: Any,
task: Task | None = None,
) -> EvaluationScore:
@@ -22,7 +27,9 @@ class SemanticQualityEvaluator(BaseEvaluator):
if task is not None:
task_context = f"Task description: {task.description}"
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
{
"role": "system",
"content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.
Score the semantic quality on a scale from 0-10 where:
- 0: Completely incoherent, confusing, or logically flawed output
@@ -37,8 +44,11 @@ Consider:
5. Is the output free from contradictions and logical fallacies?
Return your evaluation as JSON with fields 'score' (number) and 'feedback' (string).
"""},
{"role": "user", "content": f"""
""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
{task_context}
@@ -46,23 +56,28 @@ Agent's final output:
{final_output}
Evaluate the semantic quality and reasoning of this output.
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
assert evaluation_data is not None
if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
return EvaluationScore(
score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None,
score=float(evaluation_data["score"])
if evaluation_data.get("score") is not None
else None,
feedback=evaluation_data.get("feedback", response),
raw_response=response
raw_response=response,
)
except Exception:
return EvaluationScore(
score=None,
feedback=f"Failed to parse evaluation. Raw response: {response}",
raw_response=response
)
raw_response=response,
)

View File

@@ -1,22 +1,26 @@
import json
from typing import Dict, Any
from typing import Any
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.agent import Agent
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.experimental.evaluation.base_evaluator import (
BaseEvaluator,
EvaluationScore,
MetricCategory,
)
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.task import Task
class ToolSelectionEvaluator(BaseEvaluator):
@property
def metric_category(self) -> MetricCategory:
return MetricCategory.TOOL_SELECTION
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: str,
task: Task | None = None,
) -> EvaluationScore:
@@ -26,19 +30,18 @@ class ToolSelectionEvaluator(BaseEvaluator):
tool_uses = execution_trace.get("tool_uses", [])
tool_count = len(tool_uses)
unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
unique_tool_types = set(
[tool.get("tool", "Unknown tool") for tool in tool_uses]
)
if tool_count == 0:
if not agent.tools:
return EvaluationScore(
score=None,
feedback="Agent had no tools available to use."
)
else:
return EvaluationScore(
score=None,
feedback="Agent had tools available but didn't use any."
score=None, feedback="Agent had no tools available to use."
)
return EvaluationScore(
score=None, feedback="Agent had tools available but didn't use any."
)
available_tools_info = ""
if agent.tools:
@@ -52,7 +55,9 @@ class ToolSelectionEvaluator(BaseEvaluator):
tool_types_summary += f"- {tool_type}\n"
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
{
"role": "system",
"content": """You are an expert evaluator assessing if an AI agent selected the most appropriate tools for a given task.
You must evaluate based on these 2 criteria:
1. Relevance (0-10): Were the tools chosen directly aligned with the task's goals?
@@ -73,8 +78,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on tool selection decisions from available tools)
- improvement_suggestions: string (ONLY suggest better selection from the AVAILABLE tools list, NOT new tools)
"""},
{"role": "user", "content": f"""
""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
{task_context}
@@ -89,14 +97,17 @@ IMPORTANT:
- ONLY evaluate selection from tools listed as available
- DO NOT suggest new tools that aren't in the available tools list
- DO NOT evaluate tool usage or results
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None
if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {})
relevance = scores.get("relevance", 5.0)
@@ -105,22 +116,24 @@ IMPORTANT:
feedback = "Tool Selection Evaluation:\n"
feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n"
feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
feedback += (
f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
)
if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.")
feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore(
score=overall_score,
feedback=feedback,
raw_response=response
score=overall_score, feedback=feedback, raw_response=response
)
except Exception as e:
return EvaluationScore(
score=None,
feedback=f"Error evaluating tool selection: {e}",
raw_response=response
raw_response=response,
)
@@ -131,8 +144,8 @@ class ParameterExtractionEvaluator(BaseEvaluator):
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: str,
task: Task | None = None,
) -> EvaluationScore:
@@ -145,19 +158,23 @@ class ParameterExtractionEvaluator(BaseEvaluator):
if tool_count == 0:
return EvaluationScore(
score=None,
feedback="No tool usage detected. Cannot evaluate parameter extraction."
feedback="No tool usage detected. Cannot evaluate parameter extraction.",
)
validation_errors = []
for tool_use in tool_uses:
if not tool_use.get("success", True) and tool_use.get("error_type") == "validation_error":
validation_errors.append({
"tool": tool_use.get("tool", "Unknown tool"),
"error": tool_use.get("result"),
"args": tool_use.get("args", {})
})
validation_errors = [
{
"tool": tool_use.get("tool", "Unknown tool"),
"error": tool_use.get("result"),
"args": tool_use.get("args", {}),
}
for tool_use in tool_uses
if not tool_use.get("success", True)
and tool_use.get("error_type") == "validation_error"
]
validation_error_rate = len(validation_errors) / tool_count if tool_count > 0 else 0
validation_error_rate = (
len(validation_errors) / tool_count if tool_count > 0 else 0
)
param_samples = []
for i, tool_use in enumerate(tool_uses[:5]):
@@ -168,7 +185,7 @@ class ParameterExtractionEvaluator(BaseEvaluator):
is_validation_error = error_type == "validation_error"
sample = f"Tool use #{i+1} - {tool_name}:\n"
sample = f"Tool use #{i + 1} - {tool_name}:\n"
sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
sample += f"- Success: {'No' if not success else 'Yes'}"
@@ -187,13 +204,17 @@ class ParameterExtractionEvaluator(BaseEvaluator):
tool_name = err.get("tool", "Unknown tool")
error_msg = err.get("error", "Unknown error")
args = err.get("args", {})
validation_errors_info += f"\nValidation Error #{i+1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}"
validation_errors_info += f"\nValidation Error #{i + 1}:\n- Tool: {tool_name}\n- Args: {json.dumps(args, indent=2)}\n- Error: {error_msg}"
if len(validation_errors) > 3:
validation_errors_info += f"\n...and {len(validation_errors) - 3} more validation errors."
validation_errors_info += (
f"\n...and {len(validation_errors) - 3} more validation errors."
)
param_samples_text = "\n\n".join(param_samples)
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls.
{
"role": "system",
"content": """You are an expert evaluator assessing how well an AI agent extracts and formats PARAMETER VALUES for tool calls.
Your job is to evaluate ONLY whether the agent used the correct parameter VALUES, not whether the right tools were selected or how the tools were invoked.
@@ -216,8 +237,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on parameter value extraction quality)
- improvement_suggestions: string (concrete suggestions for better parameter VALUE extraction)
"""},
{"role": "user", "content": f"""
""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
{task_context}
@@ -226,15 +250,18 @@ Parameter extraction examples:
{validation_errors_info}
Evaluate the quality of the agent's parameter extraction for this task.
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None
if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {})
accuracy = scores.get("accuracy", 5.0)
@@ -251,18 +278,18 @@ Evaluate the quality of the agent's parameter extraction for this task.
if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.")
feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore(
score=overall_score,
feedback=feedback,
raw_response=response
score=overall_score, feedback=feedback, raw_response=response
)
except Exception as e:
return EvaluationScore(
score=None,
feedback=f"Error evaluating parameter extraction: {e}",
raw_response=response
raw_response=response,
)
@@ -273,8 +300,8 @@ class ToolInvocationEvaluator(BaseEvaluator):
def evaluate(
self,
agent: Agent,
execution_trace: Dict[str, Any],
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: str,
task: Task | None = None,
) -> EvaluationScore:
@@ -288,7 +315,7 @@ class ToolInvocationEvaluator(BaseEvaluator):
if tool_count == 0:
return EvaluationScore(
score=None,
feedback="No tool usage detected. Cannot evaluate tool invocation."
feedback="No tool usage detected. Cannot evaluate tool invocation.",
)
for tool_use in tool_uses:
@@ -296,7 +323,7 @@ class ToolInvocationEvaluator(BaseEvaluator):
error_info = {
"tool": tool_use.get("tool", "Unknown tool"),
"error": tool_use.get("result"),
"error_type": tool_use.get("error_type", "unknown_error")
"error_type": tool_use.get("error_type", "unknown_error"),
}
tool_errors.append(error_info)
@@ -315,9 +342,11 @@ class ToolInvocationEvaluator(BaseEvaluator):
tool_args = tool_use.get("args", {})
success = tool_use.get("success", True) and not tool_use.get("error", False)
error_type = tool_use.get("error_type", "") if not success else ""
error_msg = tool_use.get("result", "No error") if not success else "No error"
error_msg = (
tool_use.get("result", "No error") if not success else "No error"
)
sample = f"Tool invocation #{i+1}:\n"
sample = f"Tool invocation #{i + 1}:\n"
sample += f"- Tool: {tool_name}\n"
sample += f"- Parameters: {json.dumps(tool_args, indent=2)}\n"
sample += f"- Success: {'No' if not success else 'Yes'}\n"
@@ -330,11 +359,13 @@ class ToolInvocationEvaluator(BaseEvaluator):
if error_types:
error_type_summary = "Error type breakdown:\n"
for error_type, count in error_types.items():
error_type_summary += f"- {error_type}: {count} occurrences ({(count/tool_count):.1%})\n"
error_type_summary += f"- {error_type}: {count} occurrences ({(count / tool_count):.1%})\n"
invocation_samples_text = "\n\n".join(invocation_samples)
prompt = [
{"role": "system", "content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED.
{
"role": "system",
"content": """You are an expert evaluator assessing how correctly an AI agent's tool invocations are STRUCTURED.
Your job is to evaluate ONLY the structural and syntactical aspects of how the agent called tools, NOT which tools were selected or what parameter values were used.
@@ -359,8 +390,11 @@ Return your evaluation as JSON with these fields:
- overall_score: number (average of all scores, 0-10)
- feedback: string (focused ONLY on structural aspects of tool invocation)
- improvement_suggestions: string (concrete suggestions for better structuring of tool calls)
"""},
{"role": "user", "content": f"""
""",
},
{
"role": "user",
"content": f"""
Agent role: {agent.role}
{task_context}
@@ -371,15 +405,18 @@ Tool error rate: {error_rate:.2%} ({len(tool_errors)} errors out of {tool_count}
{error_type_summary}
Evaluate the quality of the agent's tool invocation structure during this task.
"""}
""",
},
]
assert self.llm is not None
if self.llm is None:
raise ValueError("LLM must be initialized")
response = self.llm.call(prompt)
try:
evaluation_data = extract_json_from_llm_response(response)
assert evaluation_data is not None
if evaluation_data is None:
raise ValueError("Failed to extract evaluation data from LLM response")
scores = evaluation_data.get("scores", {})
structure = scores.get("structure", 5.0)
error_handling = scores.get("error_handling", 5.0)
@@ -388,23 +425,25 @@ Evaluate the quality of the agent's tool invocation structure during this task.
overall_score = float(evaluation_data.get("overall_score", 5.0))
feedback = "Tool Invocation Evaluation:\n"
feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n"
feedback += (
f"• Structure: {structure}/10 - Following proper syntax and format\n"
)
feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n"
feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n"
if "improvement_suggestions" in evaluation_data:
feedback += f"Improvement Suggestions:\n{evaluation_data['improvement_suggestions']}"
else:
feedback += evaluation_data.get("feedback", "No detailed feedback available.")
feedback += evaluation_data.get(
"feedback", "No detailed feedback available."
)
return EvaluationScore(
score=overall_score,
feedback=feedback,
raw_response=response
score=overall_score, feedback=feedback, raw_response=response
)
except Exception as e:
return EvaluationScore(
score=None,
feedback=f"Error evaluating tool invocation: {e}",
raw_response=response
raw_response=response,
)

View File

@@ -1,12 +1,21 @@
import inspect
import warnings
from typing_extensions import Any
import warnings
from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
from crewai import Crew, Agent
def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
failed_tests = [result for result in experiment_results.results if not result.passed]
from crewai import Agent, Crew
from crewai.experimental.evaluation.experiment import (
ExperimentResults,
ExperimentRunner,
)
def assert_experiment_successfully(
experiment_results: ExperimentResults, baseline_filepath: str | None = None
) -> None:
failed_tests = [
result for result in experiment_results.results if not result.passed
]
if failed_tests:
detailed_failures: list[str] = []
@@ -14,39 +23,54 @@ def assert_experiment_successfully(experiment_results: ExperimentResults, baseli
for result in failed_tests:
expected = result.expected_score
actual = result.score
detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}")
detailed_failures.append(
f"- {result.identifier}: expected {expected}, got {actual}"
)
failure_details = "\n".join(detailed_failures)
raise AssertionError(f"The following test cases failed:\n{failure_details}")
baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath)
comparison = experiment_results.compare_with_baseline(
baseline_filepath=baseline_filepath
)
assert_experiment_no_regression(comparison)
def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
regressed = comparison_result.get("regressed", [])
if regressed:
raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")
raise AssertionError(
f"Regression detected! The following tests that previously passed now fail: {regressed}"
)
missing_tests = comparison_result.get("missing_tests", [])
if missing_tests:
warnings.warn(
f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
UserWarning
UserWarning,
stacklevel=2,
)
def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
def run_experiment(
dataset: list[dict[str, Any]],
crew: Crew | None = None,
agents: list[Agent] | None = None,
verbose: bool = False,
) -> ExperimentResults:
runner = ExperimentRunner(dataset=dataset)
return runner.run(agents=agents, crew=crew, print_summary=verbose)
def _get_baseline_filepath_fallback() -> str:
test_func_name = "experiment_fallback"
try:
current_frame = inspect.currentframe()
if current_frame is not None:
test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
except Exception:
...
return f"{test_func_name}_results.json"
return f"{test_func_name}_results.json"
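
A hypothetical pytest-style use of the helpers above; the import path tests.experiment_helpers and the research_crew fixture are assumptions, since this diff does not show the helper module's filename:

from tests.experiment_helpers import (  # hypothetical module path
    assert_experiment_successfully,
    run_experiment,
)


def test_research_crew_quality(research_crew):  # research_crew: a pytest fixture providing a Crew
    dataset = [
        {"identifier": "basic", "inputs": {"topic": "linting"}, "expected_score": 7.0},
    ]
    results = run_experiment(dataset, crew=research_crew, verbose=True)
    # Raises AssertionError on any unmet expected_score, then compares against
    # "test_research_crew_quality_results.json" (the frame-name fallback above)
    # and raises again if previously passing cases regressed; baseline cases
    # missing from the current run only emit a UserWarning.
    assert_experiment_successfully(results)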