diff --git a/src/crewai/evaluation/agent_evaluator.py b/src/crewai/evaluation/agent_evaluator.py
index 78520417c..430196d14 100644
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -3,9 +3,10 @@
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
-from typing import List, Optional, Dict, Any
+from typing import Any, Dict
 from collections import defaultdict
 from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
@@ -13,28 +14,29 @@ from crewai.utilities.events.utils.console_formatter import ConsoleFormatter

 class AgentEvaluator:
     def __init__(
         self,
-        evaluators: Optional[List[BaseEvaluator]] = None,
-        crew: Optional[Any] = None,
+        evaluators: Sequence[BaseEvaluator] | None = None,
+        crew: Crew | None = None,
     ):
-        self.crew: Crew = crew
-        self.evaluators = evaluators
+        self.crew: Crew | None = crew
+        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

-        self.agent_evaluators = {}
+        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
         if crew is not None:
+            assert crew and crew.agents is not None
             for agent in crew.agents:
-                self.agent_evaluators[agent.id] = self.evaluators.copy()
+                self.agent_evaluators[str(agent.id)] = self.evaluators

         self.callback = create_evaluation_callbacks()
         self.console_formatter = ConsoleFormatter()
         self.display_formatter = EvaluationDisplayFormatter()
         self.iteration = 1
-        self.iterations_results = {}
+        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}

     def set_iteration(self, iteration: int) -> None:
         self.iteration = iteration

-    def evaluate_current_iteration(self):
+    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
         if not self.crew:
             raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
@@ -42,12 +44,12 @@ class AgentEvaluator:
             raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

         from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        evaluation_results = defaultdict(list)
+        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)

         total_evals = 0
         for agent in self.crew.agents:
             for task in self.crew.tasks:
-                if task.agent.id == agent.id and self.agent_evaluators.get(agent.id):
+                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
                     total_evals += 1

         with Progress(
@@ -60,15 +62,16 @@ class AgentEvaluator:
             eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)

             for agent in self.crew.agents:
-                evaluator = self.agent_evaluators.get(agent.id)
+                evaluator = self.agent_evaluators.get(str(agent.id))
                 if not evaluator:
                     continue

                 for task in self.crew.tasks:
-                    if task.agent.id != agent.id:
+
+                    if task.agent and str(task.agent.id) != str(agent.id):
                         continue

-                    trace = self.callback.get_trace(agent.id, task.id)
+                    trace = self.callback.get_trace(str(agent.id), str(task.id))
                     if not trace:
                         self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
                         progress.update(eval_task, advance=1)
@@ -138,7 +141,7 @@ class AgentEvaluator:
                         agent_id=str(agent.id),
                         task_id=str(task.id)
                     )
-
+                    assert self.evaluators is not None
                     for evaluator in self.evaluators:
                         try:
                             score = evaluator.evaluate(
diff --git a/src/crewai/evaluation/base_evaluator.py b/src/crewai/evaluation/base_evaluator.py
index 848e08468..b11c61973 100644
--- a/src/crewai/evaluation/base_evaluator.py
+++ b/src/crewai/evaluation/base_evaluator.py
@@ -23,7 +23,7 @@ class MetricCategory(enum.Enum):


 class EvaluationScore(BaseModel):
-    score: Optional[float] = Field(
+    score: float | None = Field(
         default=5.0,
         description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
         ge=0.0,
@@ -33,7 +33,7 @@ class EvaluationScore(BaseModel):
         default="",
         description="Detailed feedback explaining the evaluation score"
     )
-    raw_response: Optional[str] = Field(
+    raw_response: str | None = Field(
         default=None,
         description="Raw response from the evaluator (e.g., LLM)"
     )
@@ -45,8 +45,8 @@ class EvaluationScore(BaseModel):


 class BaseEvaluator(abc.ABC):
-    def __init__(self, llm: Optional[BaseLLM] = None):
-        self.llm = create_llm(llm)
+    def __init__(self, llm: BaseLLM | None = None):
+        self.llm: BaseLLM | None = create_llm(llm)

     @property
     @abc.abstractmethod
diff --git a/src/crewai/evaluation/evaluation_display.py b/src/crewai/evaluation/evaluation_display.py
index cf3f82f6a..0e30c53f0 100644
--- a/src/crewai/evaluation/evaluation_display.py
+++ b/src/crewai/evaluation/evaluation_display.py
@@ -1,7 +1,9 @@
+from collections import defaultdict
 from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy
+from collections.abc import Sequence
+from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
 from crewai.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm
@@ -16,7 +18,7 @@ class EvaluationDisplayFormatter:
             return

         # Get all agent roles across all iterations
-        all_agent_roles = set()
+        all_agent_roles: set[str] = set()
         for iter_results in iterations_results.values():
             all_agent_roles.update(iter_results.keys())

@@ -50,9 +52,9 @@ class EvaluationDisplayFormatter:
                 # Add metrics to table
                 if aggregated_result.metrics:
                     for metric, evaluation_score in aggregated_result.metrics.items():
-                        score = evaluation_score.score if evaluation_score.score is not None else "N/A"
+                        score = evaluation_score.score

-                        if isinstance(score, (int, float)) and score is not None:
+                        if isinstance(score, (int, float)):
                             if score >= 8.0:
                                 score_text = f"[green]{score:.1f}[/green]"
                             elif score >= 6.0:
@@ -109,7 +111,7 @@ class EvaluationDisplayFormatter:
         table.add_column("Avg. Total", justify="center")

-        all_agent_roles = set()
+        all_agent_roles: set[str] = set()
         for results in iterations_results.values():
             all_agent_roles.update(results.keys())

@@ -173,7 +175,7 @@ class EvaluationDisplayFormatter:
             table.add_row(*row)

-        all_metrics = set()
+        all_metrics: set[Any] = set()
         for metrics in agent_metrics_by_iteration.values():
             all_metrics.update(metrics.keys())

@@ -185,18 +187,18 @@ class EvaluationDisplayFormatter:
             for iter_num in sorted(iterations_results.keys()):
                 if (iter_num in agent_metrics_by_iteration and
                         metric in agent_metrics_by_iteration[iter_num]):
-                    score = agent_metrics_by_iteration[iter_num][metric].score
-                    if score is not None:
-                        metric_scores.append(score)
-                        if score >= 8.0:
+                    metric_score = agent_metrics_by_iteration[iter_num][metric].score
+                    if metric_score is not None:
+                        metric_scores.append(metric_score)
+                        if metric_score >= 8.0:
                             color = "green"
-                        elif score >= 6.0:
+                        elif metric_score >= 6.0:
                             color = "cyan"
-                        elif score >= 4.0:
+                        elif metric_score >= 4.0:
                             color = "yellow"
                         else:
                             color = "red"
-                        row.append(f"[{color}]{score:.1f}[/]")
+                        row.append(f"[{color}]{metric_score:.1f}[/]")
                     else:
                         row.append("[dim]N/A[/dim]")
                 else:
@@ -227,34 +229,29 @@ class EvaluationDisplayFormatter:
         self,
         agent_id: str,
         agent_role: str,
-        results: List[Any],
+        results: Sequence[AgentEvaluationResult],
         strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
     ) -> AgentAggregatedEvaluationResult:
-        metrics_by_category = {}
+        metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list)

         for result in results:
             for metric_name, evaluation_score in result.metrics.items():
-                if metric_name not in metrics_by_category:
-                    metrics_by_category[metric_name] = []
                 metrics_by_category[metric_name].append(evaluation_score)

-        aggregated_metrics = {}
+        aggregated_metrics: dict[MetricCategory, EvaluationScore] = {}
         for category, scores in metrics_by_category.items():
-            valid_scores = [s for s in scores if s.score is not None]
+            valid_scores = [s.score for s in scores if s.score is not None]
+            avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

-            avg_score = sum(s.score for s in valid_scores) / len(valid_scores) if valid_scores else None
-
-            # Extract all feedback text from scores
             feedbacks = [s.feedback for s in scores if s.feedback]

-            # Process feedback based on number of entries
             feedback_summary = None
             if feedbacks:
                 if len(feedbacks) > 1:
                     # Use the summarization method for multiple feedbacks
                     feedback_summary = self._summarize_feedbacks(
                         agent_role=agent_role,
-                        metric=category,
+                        metric=category.title(),
                         feedbacks=feedbacks,
                         scores=[s.score for s in scores],
                         strategy=strategy
@@ -269,9 +266,9 @@ class EvaluationDisplayFormatter:

         overall_score = None
         if aggregated_metrics:
-            scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
-            if scores:
-                overall_score = sum(scores) / len(scores)
+            valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
+            if valid_scores:
+                overall_score = sum(valid_scores) / len(valid_scores)

         return AgentAggregatedEvaluationResult(
             agent_id=agent_id,
@@ -287,7 +284,7 @@ class EvaluationDisplayFormatter:
         agent_role: str,
         metric: str,
         feedbacks: List[str],
-        scores: List[float],
+        scores: List[float | None],
         strategy: AggregationStrategy
     ) -> str:
         if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
@@ -335,7 +332,7 @@ class EvaluationDisplayFormatter:
 {all_feedbacks}
 """}
         ]
-
+        assert llm is not None
         response = llm.call(prompt)

         return response
diff --git a/src/crewai/evaluation/evaluation_listener.py b/src/crewai/evaluation/evaluation_listener.py
index 91a79c051..7da1c65a8 100644
--- a/src/crewai/evaluation/evaluation_listener.py
+++ b/src/crewai/evaluation/evaluation_listener.py
@@ -1,6 +1,8 @@
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Union

+from collections.abc import Sequence
+
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.utilities.events.base_event_listener import BaseEventListener
@@ -40,7 +42,7 @@ class EvaluationTraceCallback(BaseEventListener):
     def __init__(self):
         if not hasattr(self, "_initialized") or not self._initialized:
             super().__init__()
-            self.traces: Dict[str, Dict[str, Any]] = {}
+            self.traces = {}
             self.current_agent_id = None
             self.current_task_id = None
             self._initialized = True
@@ -109,8 +111,8 @@ class EvaluationTraceCallback(BaseEventListener):
         self.current_agent_id = None
         self.current_task_id = None

-    def on_tool_use(self, tool_name: str, tool_args: Dict[str, Any], result: Any,
-                    success: bool = True, error_type: Optional[str] = None):
+    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
+                    success: bool = True, error_type: str | None = None):
         if not self.current_agent_id or not self.current_task_id:
             return

@@ -131,7 +133,7 @@ class EvaluationTraceCallback(BaseEventListener):

         self.traces[trace_key]["tool_uses"].append(tool_use)

-    def on_llm_call_start(self, messages: Union[str, List[Dict[str, Any]]], tools: Optional[List[Dict]] = None):
+    def on_llm_call_start(self, messages: str | Sequence[dict[str, Any]] | None, tools: Sequence[dict[str, Any]] | None = None):
         if not self.current_agent_id or not self.current_task_id:
             return
@@ -177,7 +179,7 @@ class EvaluationTraceCallback(BaseEventListener):
         self.traces[trace_key]["llm_calls"].append(llm_call)

         if hasattr(self, "current_llm_call"):
-            self.current_llm_call = None
+            self.current_llm_call = {}

     def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]:
         trace_key = f"{agent_id}_{task_id}"
diff --git a/src/crewai/evaluation/json_parser.py b/src/crewai/evaluation/json_parser.py
index a36084903..ce7303cde 100644
--- a/src/crewai/evaluation/json_parser.py
+++ b/src/crewai/evaluation/json_parser.py
@@ -2,10 +2,10 @@
 import json
 import re

-from typing import Dict, Any
+from typing import Any


-def extract_json_from_llm_response(text: str) -> Dict[str, Any]:
+def extract_json_from_llm_response(text: str) -> dict[str, Any]:
     try:
         return json.loads(text)
     except json.JSONDecodeError:
@@ -27,4 +27,4 @@ def extract_json_from_llm_response(text: str) -> Dict[str, Any]:
                 return json.loads(match.strip())
             except json.JSONDecodeError:
                 continue
-    return text
+    raise ValueError("No valid JSON found in the response")
diff --git a/src/crewai/evaluation/metrics/goal_metrics.py b/src/crewai/evaluation/metrics/goal_metrics.py
index c46070f6a..bc6c63801 100644
--- a/src/crewai/evaluation/metrics/goal_metrics.py
+++ b/src/crewai/evaluation/metrics/goal_metrics.py
@@ -46,13 +46,15 @@ Agent's final output:
 Evaluate how well the agent's output aligns with the assigned task goal.
 """}
         ]
-
+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
-            evaluation_data = extract_json_from_llm_response(response)
+            evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             return EvaluationScore(
-                score=float(evaluation_data.get("score", None)),
+                score=evaluation_data.get("score", 0),
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
diff --git a/src/crewai/evaluation/metrics/reasoning_metrics.py b/src/crewai/evaluation/metrics/reasoning_metrics.py
index 796be7cd3..e1ce06c23 100644
--- a/src/crewai/evaluation/metrics/reasoning_metrics.py
+++ b/src/crewai/evaluation/metrics/reasoning_metrics.py
@@ -11,6 +11,7 @@ import re
 from enum import Enum
 from typing import Any, Dict, List, Tuple
 import numpy as np
+from collections.abc import Sequence

 from crewai.agent import Agent
 from crewai.task import Task
@@ -136,6 +137,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
@@ -275,7 +277,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
             }
         }

-    def _calculate_trend(self, values: List[float]) -> float:
+    def _calculate_trend(self, values: Sequence[float | int]) -> float:
         if not values or len(values) < 2:
             return 0.0

@@ -295,7 +297,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
         except Exception:
             return 0.0

-    def _calculate_loop_likelihood(self, call_lengths: List[float], response_times: List[float]) -> float:
+    def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float:
         if not call_lengths or len(call_lengths) < 3:
             return 0.0
diff --git a/src/crewai/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
index 3e163bbab..a12c62ae3 100644
--- a/src/crewai/evaluation/metrics/semantic_quality_metrics.py
+++ b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
@@ -46,12 +46,14 @@ Evaluate the semantic quality and reasoning of this output.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
-            evaluation_data = extract_json_from_llm_response(response)
+            evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
             return EvaluationScore(
-                score=float(evaluation_data.get("score", None)),
+                score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None,
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
diff --git a/src/crewai/evaluation/metrics/tools_metrics.py b/src/crewai/evaluation/metrics/tools_metrics.py
index dcc01e6d7..00762fc76 100644
--- a/src/crewai/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/evaluation/metrics/tools_metrics.py
@@ -87,11 +87,13 @@ IMPORTANT:
 - DO NOT evaluate tool usage or results
 """}
         ]
-
+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             scores = evaluation_data.get("scores", {})
             relevance = scores.get("relevance", 5.0)
             coverage = scores.get("coverage", 5.0)
@@ -220,10 +222,13 @@ Evaluate the quality of the agent's parameter extraction for this task.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             scores = evaluation_data.get("scores", {})
             accuracy = scores.get("accuracy", 5.0)
             formatting = scores.get("formatting", 5.0)
@@ -359,10 +364,12 @@ Evaluate the quality of the agent's tool invocation structure during this task.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
             scores = evaluation_data.get("scores", {})
             structure = scores.get("structure", 5.0)
             error_handling = scores.get("error_handling", 5.0)
diff --git a/src/crewai/utilities/events/llm_events.py b/src/crewai/utilities/events/llm_events.py
index 0872bbc85..6dbb470b8 100644
--- a/src/crewai/utilities/events/llm_events.py
+++ b/src/crewai/utilities/events/llm_events.py
@@ -49,7 +49,7 @@ class LLMCallStartedEvent(LLMEventBase):
     type: str = "llm_call_started"

     messages: Optional[Union[str, List[Dict[str, Any]]]] = None
-    tools: Optional[List[dict]] = None
+    tools: Optional[List[dict[str, Any]]] = None
     callbacks: Optional[List[Any]] = None
     available_functions: Optional[Dict[str, Any]] = None