diff --git a/src/crewai/evaluation/experiment/runner.py b/src/crewai/evaluation/experiment/runner.py
index 4cd818ec3..52fe756b9 100644
--- a/src/crewai/evaluation/experiment/runner.py
+++ b/src/crewai/evaluation/experiment/runner.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from hashlib import md5
-from typing import List, Dict, Union, Any
+from typing import Any
 
 from crewai import Crew
 from crewai.evaluation import AgentEvaluator, create_default_evaluator
@@ -9,7 +9,7 @@ from crewai.evaluation.experiment.result import ExperimentResults, ExperimentRes
 from crewai.evaluation.evaluation_display import AgentAggregatedEvaluationResult
 
 class ExperimentRunner:
-    def __init__(self, dataset: List[Dict[str, Any]]):
+    def __init__(self, dataset: list[dict[str, Any]]):
         self.dataset = dataset or []
         self.evaluator: AgentEvaluator | None = None
         self.display = ExperimentResultsDisplay()
@@ -31,7 +31,7 @@ class ExperimentRunner:
 
         return experiment_results
 
-    def _run_test_case(self, test_case: Dict[str, Any], crew: Crew) -> ExperimentResult:
+    def _run_test_case(self, test_case: dict[str, Any], crew: Crew) -> ExperimentResult:
         inputs = test_case["inputs"]
         expected_score = test_case["expected_score"]
         identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()
@@ -41,6 +41,7 @@ class ExperimentRunner:
         self.display.console.print("\n")
 
         crew.kickoff(inputs=inputs)
+        assert self.evaluator is not None
         agent_evaluations = self.evaluator.get_agent_evaluation()
         actual_score = self._extract_scores(agent_evaluations)
 
@@ -65,8 +66,8 @@ class ExperimentRunner:
                 passed=False
             )
 
-    def _extract_scores(self, agent_evaluations: Dict[str, AgentAggregatedEvaluationResult]) -> Union[int, Dict[str, int]]:
-        all_scores = defaultdict(list)
+    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]:
+        all_scores: dict[str, list[float]] = defaultdict(list)
         for evaluation in agent_evaluations.values():
             for metric_name, score in evaluation.metrics.items():
                 if score.score is not None:
@@ -79,8 +80,8 @@ class ExperimentRunner:
 
         return avg_scores
 
-    def _assert_scores(self, expected: Union[int, Dict[str, int]],
-                       actual: Union[int, Dict[str, int]]) -> bool:
+    def _assert_scores(self, expected: float | dict[str, float],
+                       actual: float | dict[str, float]) -> bool:
         """
         Compare expected and actual scores, and return whether the test case passed.
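
Note for reviewers: the diff swaps typing.List/Dict/Union for PEP 585 builtin generics and PEP 604 "X | Y" unions, adds an assert to narrow the optional evaluator attribute before use, and gives the defaultdict an explicit annotation. Below is a minimal, self-contained sketch of those two patterns; Evaluator and Runner are illustrative stand-ins, not crewAI classes.

    from __future__ import annotations

    from collections import defaultdict


    class Evaluator:
        """Illustrative stand-in for AgentEvaluator."""

        def score(self) -> float:
            return 1.0


    class Runner:
        def __init__(self) -> None:
            # PEP 604 union: the attribute starts as None and is assigned
            # later, mirroring ExperimentRunner.evaluator in the diff above.
            self.evaluator: Evaluator | None = None

        def run(self) -> float:
            # Without this assert, strict type checkers (e.g. mypy) reject
            # the attribute access below, since self.evaluator may be None.
            assert self.evaluator is not None
            return self.evaluator.score()


    # PEP 585 builtin generics: the explicit annotation keeps the defaultdict
    # from being inferred as defaultdict[Any, list[Any]].
    all_scores: dict[str, list[float]] = defaultdict(list)
    all_scores["accuracy"].append(0.9)

The assert is a runtime-checked narrowing: it trades a possible AttributeError for an explicit AssertionError and satisfies the checker without widening the attribute's declared type.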