diff --git a/src/crewai/evaluation/agent_evaluator.py b/src/crewai/evaluation/agent_evaluator.py
index 78520417c..430196d14 100644
--- a/src/crewai/evaluation/agent_evaluator.py
+++ b/src/crewai/evaluation/agent_evaluator.py
@@ -3,9 +3,10 @@
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
-from typing import List, Optional, Dict, Any
+from typing import Any, Dict
 from collections import defaultdict
 from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
@@ -13,28 +14,29 @@ from crewai.utilities.events.utils.console_formatter import ConsoleFormatter

 class AgentEvaluator:
     def __init__(
         self,
-        evaluators: Optional[List[BaseEvaluator]] = None,
-        crew: Optional[Any] = None,
+        evaluators: Sequence[BaseEvaluator] | None = None,
+        crew: Crew | None = None,
     ):
-        self.crew: Crew = crew
-        self.evaluators = evaluators
+        self.crew: Crew | None = crew
+        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

-        self.agent_evaluators = {}
+        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
         if crew is not None:
+            assert crew and crew.agents is not None
             for agent in crew.agents:
-                self.agent_evaluators[agent.id] = self.evaluators.copy()
+                self.agent_evaluators[str(agent.id)] = self.evaluators

         self.callback = create_evaluation_callbacks()
         self.console_formatter = ConsoleFormatter()
         self.display_formatter = EvaluationDisplayFormatter()
         self.iteration = 1
-        self.iterations_results = {}
+        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}

     def set_iteration(self, iteration: int) -> None:
         self.iteration = iteration

-    def evaluate_current_iteration(self):
+    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
         if not self.crew:
             raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
@@ -42,12 +44,12 @@ class AgentEvaluator:
             raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

         from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
-        evaluation_results = defaultdict(list)
+        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)

         total_evals = 0
         for agent in self.crew.agents:
             for task in self.crew.tasks:
-                if task.agent.id == agent.id and self.agent_evaluators.get(agent.id):
+                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
                     total_evals += 1

         with Progress(
@@ -60,15 +62,16 @@ class AgentEvaluator:
             eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)

             for agent in self.crew.agents:
-                evaluator = self.agent_evaluators.get(agent.id)
+                evaluator = self.agent_evaluators.get(str(agent.id))
                 if not evaluator:
                     continue

                 for task in self.crew.tasks:
-                    if task.agent.id != agent.id:
+
+                    if task.agent and str(task.agent.id) != str(agent.id):
                         continue

-                    trace = self.callback.get_trace(agent.id, task.id)
+                    trace = self.callback.get_trace(str(agent.id), str(task.id))
                     if not trace:
                         self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
                         progress.update(eval_task, advance=1)
@@ -138,7 +141,7 @@ class AgentEvaluator:
                         agent_id=str(agent.id),
                         task_id=str(task.id)
                     )
-
+                    assert self.evaluators is not None
                     for evaluator in self.evaluators:
                         try:
                             score = evaluator.evaluate(
diff --git a/src/crewai/evaluation/base_evaluator.py b/src/crewai/evaluation/base_evaluator.py
index 848e08468..b11c61973 100644
--- a/src/crewai/evaluation/base_evaluator.py
+++ b/src/crewai/evaluation/base_evaluator.py
@@ -23,7 +23,7 @@ class MetricCategory(enum.Enum):


 class EvaluationScore(BaseModel):
-    score: Optional[float] = Field(
+    score: float | None = Field(
         default=5.0,
         description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
         ge=0.0,
@@ -33,7 +33,7 @@ class EvaluationScore(BaseModel):
         default="",
         description="Detailed feedback explaining the evaluation score"
     )
-    raw_response: Optional[str] = Field(
+    raw_response: str | None = Field(
         default=None,
         description="Raw response from the evaluator (e.g., LLM)"
     )
@@ -45,8 +45,8 @@ class EvaluationScore(BaseModel):


 class BaseEvaluator(abc.ABC):
-    def __init__(self, llm: Optional[BaseLLM] = None):
-        self.llm = create_llm(llm)
+    def __init__(self, llm: BaseLLM | None = None):
+        self.llm: BaseLLM | None = create_llm(llm)

     @property
     @abc.abstractmethod
diff --git a/src/crewai/evaluation/evaluation_display.py b/src/crewai/evaluation/evaluation_display.py
index cf3f82f6a..0e30c53f0 100644
--- a/src/crewai/evaluation/evaluation_display.py
+++ b/src/crewai/evaluation/evaluation_display.py
@@ -1,7 +1,9 @@
+from collections import defaultdict
 from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy
+from collections.abc import Sequence
+from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
 from crewai.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm
@@ -16,7 +18,7 @@ class EvaluationDisplayFormatter:
             return

         # Get all agent roles across all iterations
-        all_agent_roles = set()
+        all_agent_roles: set[str] = set()
         for iter_results in iterations_results.values():
             all_agent_roles.update(iter_results.keys())

@@ -50,9 +52,9 @@ class EvaluationDisplayFormatter:
                 # Add metrics to table
                 if aggregated_result.metrics:
                     for metric, evaluation_score in aggregated_result.metrics.items():
-                        score = evaluation_score.score if evaluation_score.score is not None else "N/A"
+                        score = evaluation_score.score

-                        if isinstance(score, (int, float)) and score is not None:
+                        if isinstance(score, (int, float)):
                             if score >= 8.0:
                                 score_text = f"[green]{score:.1f}[/green]"
                             elif score >= 6.0:
@@ -109,7 +111,7 @@ class EvaluationDisplayFormatter:
         table.add_column("Avg. Total", justify="center")

-        all_agent_roles = set()
+        all_agent_roles: set[str] = set()
         for results in iterations_results.values():
             all_agent_roles.update(results.keys())

@@ -173,7 +175,7 @@ class EvaluationDisplayFormatter:
             table.add_row(*row)

-        all_metrics = set()
+        all_metrics: set[Any] = set()
         for metrics in agent_metrics_by_iteration.values():
             all_metrics.update(metrics.keys())

@@ -185,18 +187,18 @@ class EvaluationDisplayFormatter:
             for iter_num in sorted(iterations_results.keys()):
                 if (iter_num in agent_metrics_by_iteration and
                         metric in agent_metrics_by_iteration[iter_num]):
-                    score = agent_metrics_by_iteration[iter_num][metric].score
-                    if score is not None:
-                        metric_scores.append(score)
-                        if score >= 8.0:
+                    metric_score = agent_metrics_by_iteration[iter_num][metric].score
+                    if metric_score is not None:
+                        metric_scores.append(metric_score)
+                        if metric_score >= 8.0:
                             color = "green"
-                        elif score >= 6.0:
+                        elif metric_score >= 6.0:
                             color = "cyan"
-                        elif score >= 4.0:
+                        elif metric_score >= 4.0:
                             color = "yellow"
                         else:
                             color = "red"
-                        row.append(f"[{color}]{score:.1f}[/]")
+                        row.append(f"[{color}]{metric_score:.1f}[/]")
                     else:
                         row.append("[dim]N/A[/dim]")
                 else:
@@ -227,34 +229,29 @@ class EvaluationDisplayFormatter:
         self,
         agent_id: str,
         agent_role: str,
-        results: List[Any],
+        results: Sequence[AgentEvaluationResult],
         strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE,
     ) -> AgentAggregatedEvaluationResult:
-        metrics_by_category = {}
+        metrics_by_category: dict[MetricCategory, list[EvaluationScore]] = defaultdict(list)

         for result in results:
             for metric_name, evaluation_score in result.metrics.items():
-                if metric_name not in metrics_by_category:
-                    metrics_by_category[metric_name] = []
                 metrics_by_category[metric_name].append(evaluation_score)

-        aggregated_metrics = {}
+        aggregated_metrics: dict[MetricCategory, EvaluationScore] = {}
         for category, scores in metrics_by_category.items():
-            valid_scores = [s for s in scores if s.score is not None]
+            valid_scores = [s.score for s in scores if s.score is not None]
+            avg_score = sum(valid_scores) / len(valid_scores) if valid_scores else None

-            avg_score = sum(s.score for s in valid_scores) / len(valid_scores) if valid_scores else None
-
-            # Extract all feedback text from scores
             feedbacks = [s.feedback for s in scores if s.feedback]

-            # Process feedback based on number of entries
             feedback_summary = None
             if feedbacks:
                 if len(feedbacks) > 1:
                     # Use the summarization method for multiple feedbacks
                     feedback_summary = self._summarize_feedbacks(
                         agent_role=agent_role,
-                        metric=category,
+                        metric=category.title(),
                         feedbacks=feedbacks,
                         scores=[s.score for s in scores],
                         strategy=strategy
@@ -269,9 +266,9 @@ class EvaluationDisplayFormatter:

         overall_score = None
         if aggregated_metrics:
-            scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
-            if scores:
-                overall_score = sum(scores) / len(scores)
+            valid_scores = [m.score for m in aggregated_metrics.values() if m.score is not None]
+            if valid_scores:
+                overall_score = sum(valid_scores) / len(valid_scores)

         return AgentAggregatedEvaluationResult(
             agent_id=agent_id,
@@ -287,7 +284,7 @@ class EvaluationDisplayFormatter:
         agent_role: str,
         metric: str,
         feedbacks: List[str],
-        scores: List[float],
+        scores: List[float | None],
         strategy: AggregationStrategy
     ) -> str:
         if len(feedbacks) <= 2 and all(len(fb) < 200 for fb in feedbacks):
@@ -335,7 +332,7 @@ class EvaluationDisplayFormatter:
 {all_feedbacks}
 """}
         ]
-
+        assert llm is not None
         response = llm.call(prompt)

         return response
diff --git a/src/crewai/evaluation/evaluation_listener.py b/src/crewai/evaluation/evaluation_listener.py
index 91a79c051..7da1c65a8 100644
--- a/src/crewai/evaluation/evaluation_listener.py
+++ b/src/crewai/evaluation/evaluation_listener.py
@@ -1,6 +1,8 @@
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Union

+from collections.abc import Sequence
+
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.utilities.events.base_event_listener import BaseEventListener
@@ -40,7 +42,7 @@ class EvaluationTraceCallback(BaseEventListener):
     def __init__(self):
         if not hasattr(self, "_initialized") or not self._initialized:
             super().__init__()
-            self.traces: Dict[str, Dict[str, Any]] = {}
+            self.traces = {}
             self.current_agent_id = None
             self.current_task_id = None
             self._initialized = True
@@ -109,8 +111,8 @@ class EvaluationTraceCallback(BaseEventListener):
         self.current_agent_id = None
         self.current_task_id = None

-    def on_tool_use(self, tool_name: str, tool_args: Dict[str, Any], result: Any,
-                    success: bool = True, error_type: Optional[str] = None):
+    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
+                    success: bool = True, error_type: str | None = None):
         if not self.current_agent_id or not self.current_task_id:
             return

@@ -131,7 +133,7 @@ class EvaluationTraceCallback(BaseEventListener):

         self.traces[trace_key]["tool_uses"].append(tool_use)

-    def on_llm_call_start(self, messages: Union[str, List[Dict[str, Any]]], tools: Optional[List[Dict]] = None):
+    def on_llm_call_start(self, messages: str | Sequence[dict[str, Any]] | None, tools: Sequence[dict[str, Any]] | None = None):
         if not self.current_agent_id or not self.current_task_id:
             return
@@ -177,7 +179,7 @@ class EvaluationTraceCallback(BaseEventListener):
         self.traces[trace_key]["llm_calls"].append(llm_call)

         if hasattr(self, "current_llm_call"):
-            self.current_llm_call = None
+            self.current_llm_call = {}

     def get_trace(self, agent_id: str, task_id: str) -> Optional[Dict[str, Any]]:
         trace_key = f"{agent_id}_{task_id}"
diff --git a/src/crewai/evaluation/json_parser.py b/src/crewai/evaluation/json_parser.py
index a36084903..ce7303cde 100644
--- a/src/crewai/evaluation/json_parser.py
+++ b/src/crewai/evaluation/json_parser.py
@@ -2,10 +2,10 @@
 import json
 import re

-from typing import Dict, Any
+from typing import Any


-def extract_json_from_llm_response(text: str) -> Dict[str, Any]:
+def extract_json_from_llm_response(text: str) -> dict[str, Any]:
     try:
         return json.loads(text)
     except json.JSONDecodeError:
@@ -27,4 +27,4 @@ def extract_json_from_llm_response(text: str) -> Dict[str, Any]:
                 return json.loads(match.strip())
             except json.JSONDecodeError:
                 continue
-    return text
+    raise ValueError("No valid JSON found in the response")
diff --git a/src/crewai/evaluation/metrics/goal_metrics.py b/src/crewai/evaluation/metrics/goal_metrics.py
index c46070f6a..bc6c63801 100644
--- a/src/crewai/evaluation/metrics/goal_metrics.py
+++ b/src/crewai/evaluation/metrics/goal_metrics.py
@@ -46,13 +46,15 @@ Agent's final output:
 Evaluate how well the agent's output aligns with the assigned task goal.
 """}
         ]
-
+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
-            evaluation_data = extract_json_from_llm_response(response)
+            evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             return EvaluationScore(
-                score=float(evaluation_data.get("score", None)),
+                score=evaluation_data.get("score", 0),
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
diff --git a/src/crewai/evaluation/metrics/reasoning_metrics.py b/src/crewai/evaluation/metrics/reasoning_metrics.py
index 796be7cd3..e1ce06c23 100644
--- a/src/crewai/evaluation/metrics/reasoning_metrics.py
+++ b/src/crewai/evaluation/metrics/reasoning_metrics.py
@@ -11,6 +11,7 @@ import re
 from enum import Enum
 from typing import Any, Dict, List, Tuple
 import numpy as np
+from collections.abc import Sequence

 from crewai.agent import Agent
 from crewai.task import Task
@@ -136,6 +137,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
@@ -275,7 +277,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
             }
         }

-    def _calculate_trend(self, values: List[float]) -> float:
+    def _calculate_trend(self, values: Sequence[float | int]) -> float:
         if not values or len(values) < 2:
             return 0.0

@@ -295,7 +297,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
         except Exception:
             return 0.0

-    def _calculate_loop_likelihood(self, call_lengths: List[float], response_times: List[float]) -> float:
+    def _calculate_loop_likelihood(self, call_lengths: Sequence[float], response_times: Sequence[float]) -> float:
         if not call_lengths or len(call_lengths) < 3:
             return 0.0
diff --git a/src/crewai/evaluation/metrics/semantic_quality_metrics.py b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
index 3e163bbab..a12c62ae3 100644
--- a/src/crewai/evaluation/metrics/semantic_quality_metrics.py
+++ b/src/crewai/evaluation/metrics/semantic_quality_metrics.py
@@ -46,12 +46,14 @@ Evaluate the semantic quality and reasoning of this output.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
-            evaluation_data = extract_json_from_llm_response(response)
+            evaluation_data: dict[str, Any] = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
             return EvaluationScore(
-                score=float(evaluation_data.get("score", None)),
+                score=float(evaluation_data["score"]) if evaluation_data.get("score") is not None else None,
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
diff --git a/src/crewai/evaluation/metrics/tools_metrics.py b/src/crewai/evaluation/metrics/tools_metrics.py
index dcc01e6d7..00762fc76 100644
--- a/src/crewai/evaluation/metrics/tools_metrics.py
+++ b/src/crewai/evaluation/metrics/tools_metrics.py
@@ -87,11 +87,13 @@ IMPORTANT:
 - DO NOT evaluate tool usage or results
 """}
         ]
-
+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             scores = evaluation_data.get("scores", {})
             relevance = scores.get("relevance", 5.0)
             coverage = scores.get("coverage", 5.0)
@@ -220,10 +222,13 @@ Evaluate the quality of the agent's parameter extraction for this task.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
+
             scores = evaluation_data.get("scores", {})
             accuracy = scores.get("accuracy", 5.0)
             formatting = scores.get("formatting", 5.0)
@@ -359,10 +364,12 @@ Evaluate the quality of the agent's tool invocation structure during this task.
 """}
         ]

+        assert self.llm is not None
         response = self.llm.call(prompt)

         try:
             evaluation_data = extract_json_from_llm_response(response)
+            assert evaluation_data is not None
             scores = evaluation_data.get("scores", {})
             structure = scores.get("structure", 5.0)
             error_handling = scores.get("error_handling", 5.0)
diff --git a/src/crewai/utilities/events/llm_events.py b/src/crewai/utilities/events/llm_events.py
index 0872bbc85..6dbb470b8 100644
--- a/src/crewai/utilities/events/llm_events.py
+++ b/src/crewai/utilities/events/llm_events.py
@@ -49,7 +49,7 @@ class LLMCallStartedEvent(LLMEventBase):
     type: str = "llm_call_started"

     messages: Optional[Union[str, List[Dict[str, Any]]]] = None
-    tools: Optional[List[dict]] = None
+    tools: Optional[List[dict[str, Any]]] = None
     callbacks: Optional[List[Any]] = None
     available_functions: Optional[Dict[str, Any]] = None