Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-10 16:48:30 +00:00)
* WIP gh pr refactor: update agent executor handling and introduce flow-based executor
* wip
* refactor: clean up comments and improve code clarity in agent executor flow
  - Removed outdated comments and unnecessary explanations in the affected classes to enhance code readability.
  - Simplified parameter updates in the agent executor to avoid confusion regarding executor recreation.
  - Improved clarity in the relevant method to ensure proper handling of non-final answers without raising errors.
* bump pytest-randomly and numpy
* also bump versions of anthropic sdk
* ensure flow logs are not passed if it's on the executor
* revert anthropic bump
* fix
* refactor: update dependency markers in uv.lock for platform compatibility
  - Enhanced dependency markers for several packages to ensure compatibility across platforms (Linux, Darwin, and architecture-specific conditions).
  - Removed unnecessary event emission during kickoff.
  - Cleaned up commented-out code for better readability and maintainability.
* drop duplicate
* test: enhance agent executor creation and stop word assertions
  - Added calls to create_agent_executor in multiple test cases to ensure proper agent execution setup.
  - Updated assertions for stop words in the agent tests to remove unnecessary checks and improve clarity.
  - Ensured consistency in task handling by invoking create_agent_executor with the appropriate task parameter.
* refactor: reorganize agent executor imports and introduce CrewAgentExecutorFlow
  - Removed the old import of CrewAgentExecutorFlow and replaced it with the new import from the experimental module.
  - Updated relevant references in the codebase to ensure compatibility with the new structure.
  - Enhanced the organization of imports in core.py and base_agent.py for better clarity and maintainability.
* updating name
* dropped usage of printer here in favor of the Rich console and removed logging that added no value
* address i18n
* Enhance concurrency control in CrewAgentExecutorFlow by introducing a threading lock to prevent concurrent executions. This ensures the executor instance cannot be invoked while already running, improving stability and reliability during flow execution.
* string literal returns
* string literal returns
* Enhance CrewAgentExecutor initialization by allowing an optional i18n parameter for improved internationalization support. The executor can use a provided i18n instance or fall back to the default, adding flexibility in multilingual contexts.

---------

Co-authored-by: Greyson LaLonde <greyson.r.lalonde@gmail.com>
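The concurrency-control commit above describes guarding the executor with a threading lock so that a second invocation is rejected while a run is already in flight. A minimal sketch of that pattern follows; the class and method names are illustrative only, not the actual CrewAgentExecutorFlow implementation:

import threading


class GuardedExecutor:
    """Illustrative only: refuses to start a second run while one is in progress."""

    def __init__(self) -> None:
        self._execution_lock = threading.Lock()

    def invoke(self, inputs: dict) -> dict:
        # Non-blocking acquire: fail fast instead of silently queueing a second run.
        if not self._execution_lock.acquire(blocking=False):
            raise RuntimeError("Executor is already running; concurrent execution is not allowed.")
        try:
            return self._run(inputs)
        finally:
            self._execution_lock.release()

    def _run(self, inputs: dict) -> dict:
        # Placeholder for the real flow execution.
        return inputs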
120 lines · 3.8 KiB · Python
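"""Base classes and data models for agent evaluation.

Defines the metric categories, the EvaluationScore and result models, the
BaseEvaluator interface, and the aggregation strategies used to combine
per-task scores into an agent-level result.
"""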
from __future__ import annotations

import abc
import enum
from enum import Enum
from typing import TYPE_CHECKING, Any

from pydantic import BaseModel, Field

from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.llm import BaseLLM
from crewai.task import Task
from crewai.utilities.llm_utils import create_llm


if TYPE_CHECKING:
    from crewai.agent import Agent


class MetricCategory(enum.Enum):
    """Categories of agent behavior that an evaluator can score."""

    GOAL_ALIGNMENT = "goal_alignment"
    SEMANTIC_QUALITY = "semantic_quality"
    REASONING_EFFICIENCY = "reasoning_efficiency"
    TOOL_SELECTION = "tool_selection"
    PARAMETER_EXTRACTION = "parameter_extraction"
    TOOL_INVOCATION = "tool_invocation"

    def title(self) -> str:
        """Return the category as a human-readable title, e.g. 'Goal Alignment'."""
        return self.value.replace("_", " ").title()


class EvaluationScore(BaseModel):
    """A single metric score with explanatory feedback."""

    score: float | None = Field(
        default=5.0,
        description="Numeric score from 0-10 where 0 is worst and 10 is best, None if not applicable",
        ge=0.0,
        le=10.0,
    )
    feedback: str = Field(
        default="", description="Detailed feedback explaining the evaluation score"
    )
    raw_response: str | None = Field(
        default=None, description="Raw response from the evaluator (e.g., LLM)"
    )

    def __str__(self) -> str:
        if self.score is None:
            return f"Score: N/A - {self.feedback}"
        return f"Score: {self.score:.1f}/10 - {self.feedback}"


class BaseEvaluator(abc.ABC):
    """Abstract base class for evaluators that score one metric category."""

    def __init__(self, llm: BaseLLM | None = None):
        self.llm: BaseLLM | None = create_llm(llm)

    @property
    @abc.abstractmethod
    def metric_category(self) -> MetricCategory:
        """The metric category this evaluator scores."""

    @abc.abstractmethod
    def evaluate(
        self,
        agent: Agent | BaseAgent,
        execution_trace: dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        """Evaluate an agent's execution trace and final output for this metric."""


class AgentEvaluationResult(BaseModel):
    """Evaluation scores for a single agent on a single task."""

    agent_id: str = Field(description="ID of the evaluated agent")
    task_id: str = Field(description="ID of the task that was executed")
    metrics: dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict, description="Evaluation scores for each metric category"
    )


class AggregationStrategy(Enum):
    """How per-task scores are combined into an agent-level result."""

    SIMPLE_AVERAGE = "simple_average"  # Equal weight to all tasks
    WEIGHTED_BY_COMPLEXITY = "weighted_by_complexity"  # Weight by task complexity
    BEST_PERFORMANCE = "best_performance"  # Use best scores across tasks
    WORST_PERFORMANCE = "worst_performance"  # Use worst scores across tasks


class AgentAggregatedEvaluationResult(BaseModel):
    """Evaluation results for an agent aggregated across multiple tasks."""

    agent_id: str = Field(default="", description="ID of the agent")
    agent_role: str = Field(default="", description="Role of the agent")
    task_count: int = Field(
        default=0, description="Number of tasks included in this aggregation"
    )
    aggregation_strategy: AggregationStrategy = Field(
        default=AggregationStrategy.SIMPLE_AVERAGE,
        description="Strategy used for aggregation",
    )
    metrics: dict[MetricCategory, EvaluationScore] = Field(
        default_factory=dict, description="Aggregated metrics across all tasks"
    )
    task_results: list[str] = Field(
        default_factory=list, description="IDs of tasks included in this aggregation"
    )
    overall_score: float | None = Field(
        default=None, description="Overall score for this agent"
    )

    def __str__(self) -> str:
        result = f"Agent Evaluation: {self.agent_role}\n"
        result += f"Strategy: {self.aggregation_strategy.value}\n"
        result += f"Tasks evaluated: {self.task_count}\n"

        for category, score in self.metrics.items():
            result += f"\n\n- {category.value.upper()}: {score.score}/10\n"

            if score.feedback:
                detailed_feedback = "\n ".join(score.feedback.split("\n"))
                result += f" {detailed_feedback}\n"

        return result
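For illustration, here is a minimal sketch of how this interface is meant to be used: a concrete evaluator subclasses BaseEvaluator, declares its metric category, and returns an EvaluationScore, which can then be collected into an AgentAggregatedEvaluationResult. The GoalAlignmentEvaluator class, its heuristic scoring, and the agent/task objects are assumptions for the example, not part of this module.

# Hypothetical evaluator; a real one would typically prompt self.llm with the execution trace.
class GoalAlignmentEvaluator(BaseEvaluator):
    @property
    def metric_category(self) -> MetricCategory:
        return MetricCategory.GOAL_ALIGNMENT

    def evaluate(
        self,
        agent: Agent | BaseAgent,
        execution_trace: dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        # Toy heuristic for the sketch: reward any non-empty final output.
        if final_output:
            return EvaluationScore(score=8.0, feedback="Output addresses the stated goal")
        return EvaluationScore(score=None, feedback="No output produced; not applicable")


# Example wiring (agent/task objects omitted; constructing the evaluator may require LLM credentials):
# evaluator = GoalAlignmentEvaluator()
# score = evaluator.evaluate(agent=my_agent, execution_trace={}, final_output="Final report text")
# aggregated = AgentAggregatedEvaluationResult(
#     agent_id="agent-1",
#     agent_role="Researcher",
#     task_count=1,
#     metrics={evaluator.metric_category: score},
#     task_results=["task-1"],
#     overall_score=score.score,
# )
# print(aggregated)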