mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-10 00:28:31 +00:00
fix: use create_default_llm when llm is None in BaseEvaluator
@@ -1,14 +1,15 @@
 import abc
 import enum
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field
 
 from crewai.agent import Agent
-from crewai.task import Task
 from crewai.llm import BaseLLM
-from crewai.utilities.llm_utils import create_llm
+from crewai.task import Task
+from crewai.utilities.llm_utils import create_default_llm, create_llm
 
 
 class MetricCategory(enum.Enum):
     GOAL_ALIGNMENT = "goal_alignment"
@@ -19,7 +20,7 @@ class MetricCategory(enum.Enum):
     TOOL_INVOCATION = "tool_invocation"
 
     def title(self):
-        return self.value.replace('_', ' ').title()
+        return self.value.replace("_", " ").title()
 
 
 class EvaluationScore(BaseModel):
@@ -27,15 +28,13 @@ class EvaluationScore(BaseModel):
         default=5.0,
         description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
         ge=0.0,
-        le=10.0
+        le=10.0,
     )
     feedback: str = Field(
-        default="",
-        description="Detailed feedback explaining the evaluation score"
+        default="", description="Detailed feedback explaining the evaluation score"
     )
     raw_response: str | None = Field(
-        default=None,
-        description="Raw response from the evaluator (e.g., LLM)"
+        default=None, description="Raw response from the evaluator (e.g., LLM)"
     )
 
     def __str__(self) -> str:
@@ -46,7 +45,9 @@ class EvaluationScore(BaseModel):
 
 class BaseEvaluator(abc.ABC):
     def __init__(self, llm: BaseLLM | None = None):
-        self.llm: BaseLLM | None = create_llm(llm)
+        self.llm: BaseLLM | None = (
+            create_llm(llm) if llm is not None else create_default_llm()
+        )
 
     @property
     @abc.abstractmethod
@@ -57,7 +58,7 @@ class BaseEvaluator(abc.ABC):
     def evaluate(
         self,
         agent: Agent,
-        execution_trace: Dict[str, Any],
+        execution_trace: dict[str, Any],
         final_output: Any,
         task: Task | None = None,
     ) -> EvaluationScore:
@@ -67,9 +68,8 @@
 class AgentEvaluationResult(BaseModel):
     agent_id: str = Field(description="ID of the evaluated agent")
     task_id: str = Field(description="ID of the task that was executed")
-    metrics: Dict[MetricCategory, EvaluationScore] = Field(
-        default_factory=dict,
-        description="Evaluation scores for each metric category"
+    metrics: dict[MetricCategory, EvaluationScore] = Field(
+        default_factory=dict, description="Evaluation scores for each metric category"
     )
 
 
@@ -81,33 +81,23 @@ class AggregationStrategy(Enum):
 
 
 class AgentAggregatedEvaluationResult(BaseModel):
-    agent_id: str = Field(
-        default="",
-        description="ID of the agent"
-    )
-    agent_role: str = Field(
-        default="",
-        description="Role of the agent"
-    )
+    agent_id: str = Field(default="", description="ID of the agent")
+    agent_role: str = Field(default="", description="Role of the agent")
     task_count: int = Field(
-        default=0,
-        description="Number of tasks included in this aggregation"
+        default=0, description="Number of tasks included in this aggregation"
     )
     aggregation_strategy: AggregationStrategy = Field(
         default=AggregationStrategy.SIMPLE_AVERAGE,
-        description="Strategy used for aggregation"
+        description="Strategy used for aggregation",
     )
-    metrics: Dict[MetricCategory, EvaluationScore] = Field(
-        default_factory=dict,
-        description="Aggregated metrics across all tasks"
+    metrics: dict[MetricCategory, EvaluationScore] = Field(
+        default_factory=dict, description="Aggregated metrics across all tasks"
    )
-    task_results: List[str] = Field(
-        default_factory=list,
-        description="IDs of tasks included in this aggregation"
+    task_results: list[str] = Field(
+        default_factory=list, description="IDs of tasks included in this aggregation"
    )
     overall_score: Optional[float] = Field(
-        default=None,
-        description="Overall score for this agent"
+        default=None, description="Overall score for this agent"
     )
 
     def __str__(self) -> str:
@@ -119,7 +109,7 @@ class AgentAggregatedEvaluationResult(BaseModel):
        result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
 
        if score.feedback:
-            detailed_feedback = "\n ".join(score.feedback.split('\n'))
+            detailed_feedback = "\n ".join(score.feedback.split("\n"))
            result += f"  {detailed_feedback}\n"
 
     return result
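For illustration, here is the changed constructor logic in isolation: a minimal sketch, not part of the commit. ExampleEvaluator is a hypothetical stand-in for a concrete BaseEvaluator subclass; the __init__ body and both imports are taken verbatim from the diff above.

from crewai.llm import BaseLLM
from crewai.utilities.llm_utils import create_default_llm, create_llm


class ExampleEvaluator:  # hypothetical stand-in for a BaseEvaluator subclass
    def __init__(self, llm: BaseLLM | None = None):
        # Before this commit the constructor always called create_llm(llm),
        # even when llm was None; now None is routed to create_default_llm().
        self.llm: BaseLLM | None = (
            create_llm(llm) if llm is not None else create_default_llm()
        )


evaluator = ExampleEvaluator()  # no llm given: the create_default_llm() branch runs

The guard keeps explicit LLM configuration on the create_llm path, while an evaluator built with no arguments is expected to end up with a concrete default model instead of whatever create_llm(None) produced.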