Files
crewAI/lib/crewai/src/crewai/experimental/evaluation/base_evaluator.py
Lorenze Jay b9dd166a6b
Some checks failed
Build uv cache / build-cache (3.10) (push) Has been cancelled
Build uv cache / build-cache (3.11) (push) Has been cancelled
Build uv cache / build-cache (3.12) (push) Has been cancelled
Build uv cache / build-cache (3.13) (push) Has been cancelled
CodeQL Advanced / Analyze (actions) (push) Has been cancelled
CodeQL Advanced / Analyze (python) (push) Has been cancelled
Notify Downstream / notify-downstream (push) Has been cancelled
Mark stale issues and pull requests / stale (push) Has been cancelled
Lorenze/agent executor flow pattern (#3975)
* WIP gh pr refactor: update agent executor handling and introduce flow-based executor

* wip

* refactor: clean up comments and improve code clarity in agent executor flow

- Removed outdated comments and unnecessary explanations in  and  classes to enhance code readability.
- Simplified parameter updates in the agent executor to avoid confusion regarding executor recreation.
- Improved clarity in the  method to ensure proper handling of non-final answers without raising errors.

* bumping pytest-randomly numpy

* also bump versions of anthropic sdk

* ensure flow logs are not passed if its on executor

* revert anthropic bump

* fix

* refactor: update dependency markers in uv.lock for platform compatibility

- Enhanced dependency markers for , , , and others to ensure compatibility across different platforms (Linux, Darwin, and architecture-specific conditions).
- Removed unnecessary event emission in the  class during kickoff.
- Cleaned up commented-out code in the  class for better readability and maintainability.

* drop dupllicate

* test: enhance agent executor creation and stop word assertions

- Added calls to create_agent_executor in multiple test cases to ensure proper agent execution setup.
- Updated assertions for stop words in the agent tests to remove unnecessary checks and improve clarity.
- Ensured consistency in task handling by invoking create_agent_executor with the appropriate task parameter.

* refactor: reorganize agent executor imports and introduce CrewAgentExecutorFlow

- Removed the old import of CrewAgentExecutorFlow and replaced it with the new import from the experimental module.
- Updated relevant references in the codebase to ensure compatibility with the new structure.
- Enhanced the organization of imports in core.py and base_agent.py for better clarity and maintainability.

* updating name

* dropped usage of printer here for rich console and dropped non-added value logging

* address i18n

* Enhance concurrency control in CrewAgentExecutorFlow by introducing a threading lock to prevent concurrent executions. This change ensures that the executor instance cannot be invoked while already running, improving stability and reliability during flow execution.

* string literal returns

* string literal returns

* Enhance CrewAgentExecutor initialization by allowing optional i18n parameter for improved internationalization support. This change ensures that the executor can utilize a provided i18n instance or fallback to the default, enhancing flexibility in multilingual contexts.

---------

Co-authored-by: Greyson LaLonde <greyson.r.lalonde@gmail.com>
2025-12-28 10:21:32 -08:00

120 lines
3.8 KiB
Python

from __future__ import annotations
import abc
import enum
from enum import Enum
from typing import TYPE_CHECKING, Any
from pydantic import BaseModel, Field
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.llm import BaseLLM
from crewai.task import Task
from crewai.utilities.llm_utils import create_llm
if TYPE_CHECKING:
from crewai.agent import Agent
class MetricCategory(enum.Enum):
GOAL_ALIGNMENT = "goal_alignment"
SEMANTIC_QUALITY = "semantic_quality"
REASONING_EFFICIENCY = "reasoning_efficiency"
TOOL_SELECTION = "tool_selection"
PARAMETER_EXTRACTION = "parameter_extraction"
TOOL_INVOCATION = "tool_invocation"
def title(self):
return self.value.replace("_", " ").title()
class EvaluationScore(BaseModel):
score: float | None = Field(
default=5.0,
description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
ge=0.0,
le=10.0,
)
feedback: str = Field(
default="", description="Detailed feedback explaining the evaluation score"
)
raw_response: str | None = Field(
default=None, description="Raw response from the evaluator (e.g., LLM)"
)
def __str__(self) -> str:
if self.score is None:
return f"Score: N/A - {self.feedback}"
return f"Score: {self.score:.1f}/10 - {self.feedback}"
class BaseEvaluator(abc.ABC):
def __init__(self, llm: BaseLLM | None = None):
self.llm: BaseLLM | None = create_llm(llm)
@property
@abc.abstractmethod
def metric_category(self) -> MetricCategory:
pass
@abc.abstractmethod
def evaluate(
self,
agent: Agent | BaseAgent,
execution_trace: dict[str, Any],
final_output: Any,
task: Task | None = None,
) -> EvaluationScore:
pass
class AgentEvaluationResult(BaseModel):
agent_id: str = Field(description="ID of the evaluated agent")
task_id: str = Field(description="ID of the task that was executed")
metrics: dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict, description="Evaluation scores for each metric category"
)
class AggregationStrategy(Enum):
SIMPLE_AVERAGE = "simple_average" # Equal weight to all tasks
WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity" # Weight by task complexity
BEST_PERFORMANCE = "best_performance" # Use best scores across tasks
WORST_PERFORMANCE = "worst_performance" # Use worst scores across tasks
class AgentAggregatedEvaluationResult(BaseModel):
agent_id: str = Field(default="", description="ID of the agent")
agent_role: str = Field(default="", description="Role of the agent")
task_count: int = Field(
default=0, description="Number of tasks included in this aggregation"
)
aggregation_strategy: AggregationStrategy = Field(
default=AggregationStrategy.SIMPLE_AVERAGE,
description="Strategy used for aggregation",
)
metrics: dict[MetricCategory, EvaluationScore] = Field(
default_factory=dict, description="Aggregated metrics across all tasks"
)
task_results: list[str] = Field(
default_factory=list, description="IDs of tasks included in this aggregation"
)
overall_score: float | None = Field(
default=None, description="Overall score for this agent"
)
def __str__(self) -> str:
result = f"Agent Evaluation: {self.agent_role}\n"
result += f"Strategy: {self.aggregation_strategy.value}\n"
result += f"Tasks evaluated: {self.task_count}\n"
for category, score in self.metrics.items():
result += f"\n\n- {category.value.upper()}: {score.score}/10\n"
if score.feedback:
detailed_feedback = "\n ".join(score.feedback.split("\n"))
result += f" {detailed_feedback}\n"
return result