Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-08 15:48:29 +00:00

Compare commits: devin/1739 ... devin/1739 (9 commits)
| SHA1 |
|---|
| dd38554b70 |
| 5e528416ec |
| a097d933f6 |
| 7c2c7575ed |
| 5205021e94 |
| 4af5d0801b |
| 2086a4b530 |
| 16e558056a |
| 0068137974 |
@@ -1077,33 +1077,41 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
-        openai_model_name: Optional[str] = None,
-        llm: Optional[Union[str, LLM]] = None,
+        llm: Optional[Union[str, InstanceOf[LLM], Any]] = None,
+        openai_model_name: Optional[str] = None,  # For backward compatibility
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
-
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        This method runs tests to evaluate the performance of the crew using the specified
+        language model. It supports both string model names and LLM instances for flexibility.
+
         Args:
-            n_iterations: Number of iterations to run the test
-            openai_model_name: Name of OpenAI model to use (deprecated, use llm instead)
-            llm: LLM instance or model name to use for evaluation
-            inputs: Optional inputs to pass to the crew
+            n_iterations: Number of test iterations to run
+            llm: Language model configuration (preferred). Can be:
+                - A string model name (e.g., "gpt-4")
+                - An LLM instance
+                - Any object with model_name or deployment_name attributes
+            openai_model_name: Legacy parameter for backward compatibility.
+                Deprecated: Will be removed in future versions. Use `llm` instead.
+            inputs: Optional dictionary of inputs to be used during testing
+
+        Note:
+            The `openai_model_name` parameter is deprecated and will be removed in
+            future versions. Use the more flexible `llm` parameter instead, which
+            supports any LLM implementation.
         """
         test_crew = self.copy()

-        # Convert string to LLM instance if needed
-        if isinstance(llm, str):
-            llm = LLM(model=llm)
-        elif openai_model_name:
-            llm = LLM(model=openai_model_name)
-
+        # For backward compatibility, convert openai_model_name to llm
+        model_name = llm or openai_model_name or "gpt-4o-mini"
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            getattr(llm, "model", openai_model_name),
+            model_name,
         )
-        evaluator = CrewEvaluator(test_crew, llm)
+        evaluator = CrewEvaluator(test_crew, llm=model_name)

         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
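For orientation, a minimal usage sketch of the updated `Crew.test` signature described in the docstring above: `llm` (a model-name string or an `LLM` instance) is the preferred way to pick the evaluation model, while `openai_model_name` is kept only for backward compatibility. The agent, task, and model names below are illustrative, not taken from the repository, and actually running these calls requires configured model credentials.

```python
from crewai import Agent, Crew, Task
from crewai.llm import LLM

# Illustrative agent/task; purely for demonstration.
researcher = Agent(role="Researcher", goal="Research {topic}", backstory="An experienced analyst.")
task = Task(
    description="Write a short brief about {topic}.",
    expected_output="A one-paragraph brief.",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])

# Preferred: pass the evaluation model through `llm`, either as a string...
crew.test(n_iterations=2, llm="gpt-4o-mini", inputs={"topic": "AI"})

# ...or as an LLM instance.
crew.test(n_iterations=2, llm=LLM(model="gpt-4o-mini"), inputs={"topic": "AI"})

# Legacy spelling, still accepted but flagged as deprecated in the docstring.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
```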
@@ -1,7 +1,13 @@
 from collections import defaultdict
-from typing import Union
+from typing import Any, Dict, List, Union

-from pydantic import BaseModel, Field
+from pydantic import (
+    BaseModel,
+    Field,
+    InstanceOf,
+    PrivateAttr,
+    model_validator,
+)
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
@@ -19,27 +25,74 @@ class TaskEvaluationPydanticOutput(BaseModel):
     )


-class CrewEvaluator:
+class CrewEvaluator(BaseModel):
     """
     A class to evaluate the performance of the agents in the crew based on the tasks they have performed.

     Attributes:
         crew (Crew): The crew of agents to evaluate.
-        openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
+        llm (Union[str, InstanceOf[LLM], Any]): The language model to use for evaluating the performance of the agents.
         tasks_scores (defaultdict): A dictionary to store the scores of the agents for each task.
         iteration (int): The current iteration of the evaluation.
     """

-    tasks_scores: defaultdict = defaultdict(list)
-    run_execution_times: defaultdict = defaultdict(list)
-    iteration: int = 0
+    crew: Any = Field(description="The crew of agents to evaluate.")
+    llm: Union[str, InstanceOf[LLM], Any] = Field(
+        description="Language model that will run the evaluation."
+    )
+    tasks_scores: Dict[int, List[float]] = Field(
+        default_factory=lambda: defaultdict(list),
+        description="Dictionary to store the scores of the agents for each task."
+    )
+    run_execution_times: Dict[int, List[int]] = Field(
+        default_factory=lambda: defaultdict(list),
+        description="Dictionary to store execution times for each run."
+    )
+    iteration: int = Field(
+        default=0,
+        description="Current iteration of the evaluation."
+    )

-    def __init__(self, crew, llm: Union[str, LLM]):
-        self.crew = crew
-        self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)
-        self._telemetry = Telemetry()
+    @model_validator(mode="after")
+    def validate_llm(self):
+        """Validates that the LLM is properly configured."""
+        if not self.llm:
+            raise ValueError("LLM configuration is required")
+        return self
+
+    _telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
+
+    def __init__(self, crew, llm: Union[str, InstanceOf[LLM], Any]):
+        # Initialize Pydantic model with validated fields
+        super().__init__(crew=crew, llm=llm)
+        self._setup_for_evaluating()
+
+    @model_validator(mode="before")
+    def init_llm(cls, values):
+        """Initialize LLM before Pydantic validation."""
+        llm = values.get("llm")
+        try:
+            if isinstance(llm, str):
+                values["llm"] = LLM(model=llm)
+            elif isinstance(llm, LLM):
+                values["llm"] = llm
+            else:
+                # For any other type, attempt to extract relevant attributes
+                llm_params = {
+                    "model": getattr(llm, "model_name", None)
+                    or getattr(llm, "deployment_name", None)
+                    or str(llm),
+                    "temperature": getattr(llm, "temperature", None),
+                    "max_tokens": getattr(llm, "max_tokens", None),
+                    "timeout": getattr(llm, "timeout", None),
+                }
+                # Remove None values
+                llm_params = {k: v for k, v in llm_params.items() if v is not None}
+                values["llm"] = LLM(**llm_params)
+        except Exception as e:
+            raise ValueError(f"Invalid LLM configuration: {str(e)}") from e
+        return values

     def _setup_for_evaluating(self) -> None:
         """Sets up the crew for evaluating."""
         for task in self.crew.tasks:
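A rough, self-contained sketch of how the `init_llm` before-validator above is meant to normalize the `llm` field: a string becomes `LLM(model=...)`, an `LLM` instance passes through, and any other object is reduced to its `model_name`/`deployment_name`, `temperature`, `max_tokens`, and `timeout` attributes (dropping `None` values) before an `LLM` is rebuilt from them. `MyCustomLLM` is a hypothetical stand-in; the agent and task values mirror the `test_custom_llm_support` test added further down.

```python
from crewai import Agent, Crew, Task
from crewai.llm import LLM
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator


class MyCustomLLM:
    """Hypothetical third-party wrapper: not a crewai LLM, but it exposes familiar attributes."""
    model_name = "gpt-4o-mini"
    temperature = 0.3


agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# A plain string should be wrapped by the validator as LLM(model="gpt-4o-mini").
evaluator = CrewEvaluator(crew, llm="gpt-4o-mini")
assert isinstance(evaluator.llm, LLM)

# A duck-typed object should be reduced to its recognised attributes and rebuilt as an LLM.
evaluator = CrewEvaluator(crew, llm=MyCustomLLM())
assert evaluator.llm.model == "gpt-4o-mini"
```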
@@ -183,7 +236,7 @@ class CrewEvaluator:
                 self.crew,
                 evaluation_result.pydantic.quality,
                 current_task._execution_time,
-                getattr(self.llm, "model", None),
+                self.llm.model if isinstance(self.llm, LLM) else self.llm,
             )
             self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
             self.run_execution_times[self.iteration].append(
@@ -14,7 +14,6 @@ from crewai.agent import Agent
 from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
 from crewai.crews.crew_output import CrewOutput
-from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.process import Process
 from crewai.task import Task
@@ -301,6 +300,15 @@ def test_hierarchical_process():
     )

+@mock.patch("crewai.crew.CrewEvaluator")
+@mock.patch("crewai.crew.Crew.copy")
+def test_crew_test_backward_compatibility(mock_copy, mock_evaluator):
+    crew = Crew(agents=[researcher], tasks=[Task(description="test", expected_output="test output", agent=researcher)])
+    crew.test(2, openai_model_name="gpt-4")
+    mock_evaluator.assert_called_once()
+    _, kwargs = mock_evaluator.call_args
+    assert kwargs["llm"] == "gpt-4"
+

 def test_manager_llm_requirement_for_hierarchical_process():
     task = Task(
         description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
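Assuming the `model_name` fallback from the crew.py hunk near the top of this compare is what the branch ships, the backward-compatibility test above reduces to the mapping below; `resolve_eval_llm` is a hypothetical helper written only to restate that logic, not a function in the codebase.

```python
from typing import Optional, Union

from crewai.llm import LLM


def resolve_eval_llm(llm: Optional[Union[str, LLM]], openai_model_name: Optional[str]) -> Union[str, LLM]:
    """Restates the `llm or openai_model_name or "gpt-4o-mini"` fallback used by Crew.test."""
    # Whatever survives this chain is forwarded to CrewEvaluator as its `llm` argument,
    # which is why test(2, openai_model_name="gpt-4") leads to CrewEvaluator(..., llm="gpt-4").
    return llm or openai_model_name or "gpt-4o-mini"


assert resolve_eval_llm(None, "gpt-4") == "gpt-4"
assert resolve_eval_llm("gpt-4o", None) == "gpt-4o"
assert resolve_eval_llm(None, None) == "gpt-4o-mini"
```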
@@ -2813,11 +2821,10 @@ def test_conditional_should_execute():
 @mock.patch("crewai.crew.CrewEvaluator")
 @mock.patch("crewai.crew.Crew.copy")
 @mock.patch("crewai.crew.Crew.kickoff")
-def test_crew_testing_function_with_openai_model_name(kickoff_mock, copy_mock, crew_evaluator):
-    """Test backward compatibility with openai_model_name parameter."""
+def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
     task = Task(
-        description="Test task",
-        expected_output="Test output",
+        description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
+        expected_output="5 bullet points with a paragraph for each idea.",
         agent=researcher,
     )

@@ -2826,87 +2833,20 @@ def test_crew_testing_function_with_openai_model_name(kickoff_mock, copy_mock, c
         tasks=[task],
     )

+    # Create a mock for the copied crew
     copy_mock.return_value = crew

     n_iterations = 2
     crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})

+    # Ensure kickoff is called on the copied crew
     kickoff_mock.assert_has_calls(
         [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
     )

     crew_evaluator.assert_has_calls(
         [
-            mock.call(crew, mock.ANY),  # ANY because we convert to LLM instance
-            mock.call().set_iteration(1),
-            mock.call().set_iteration(2),
-            mock.call().print_crew_evaluation_result(),
-        ]
-    )
-
-@mock.patch("crewai.crew.CrewEvaluator")
-@mock.patch("crewai.crew.Crew.copy")
-@mock.patch("crewai.crew.Crew.kickoff")
-def test_crew_testing_function_with_llm_instance(kickoff_mock, copy_mock, crew_evaluator):
-    """Test using LLM instance parameter."""
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        agent=researcher,
-    )
-
-    crew = Crew(
-        agents=[researcher],
-        tasks=[task],
-    )
-
-    copy_mock.return_value = crew
-    llm = LLM(model="gpt-4o-mini")
-
-    n_iterations = 2
-    crew.test(n_iterations, llm=llm, inputs={"topic": "AI"})
-
-    kickoff_mock.assert_has_calls(
-        [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
-    )
-
-    crew_evaluator.assert_has_calls(
-        [
-            mock.call(crew, llm),
-            mock.call().set_iteration(1),
-            mock.call().set_iteration(2),
-            mock.call().print_crew_evaluation_result(),
-        ]
-    )
-
-@mock.patch("crewai.crew.CrewEvaluator")
-@mock.patch("crewai.crew.Crew.copy")
-@mock.patch("crewai.crew.Crew.kickoff")
-def test_crew_testing_function_with_llm_string(kickoff_mock, copy_mock, crew_evaluator):
-    """Test using LLM string parameter."""
-    task = Task(
-        description="Test task",
-        expected_output="Test output",
-        agent=researcher,
-    )
-
-    crew = Crew(
-        agents=[researcher],
-        tasks=[task],
-    )
-
-    copy_mock.return_value = crew
-
-    n_iterations = 2
-    crew.test(n_iterations, llm="gpt-4o-mini", inputs={"topic": "AI"})
-
-    kickoff_mock.assert_has_calls(
-        [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
-    )
-
-    crew_evaluator.assert_has_calls(
-        [
-            mock.call(crew, mock.ANY),  # ANY because we don't care about the LLM instance details
+            mock.call(crew, llm="gpt-4o-mini"),
             mock.call().set_iteration(1),
             mock.call().set_iteration(2),
             mock.call().print_crew_evaluation_result(),
@@ -4,6 +4,7 @@ import pytest

 from crewai.agent import Agent
 from crewai.crew import Crew
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.utilities.evaluators.crew_evaluator_handler import (
@@ -23,7 +24,7 @@ class TestCrewEvaluator:
         )
         crew = Crew(agents=[agent], tasks=[task])

-        return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
+        return CrewEvaluator(crew, llm="gpt-4o-mini")

     def test_setup_for_evaluating(self, crew_planner):
         crew_planner._setup_for_evaluating()
@@ -46,6 +47,7 @@ class TestCrewEvaluator:
         )
         assert agent.verbose is False
         assert agent.llm.model == "gpt-4o-mini"
+        assert isinstance(agent.llm, LLM)

     def test_evaluation_task(self, crew_planner):
         evaluator_agent = Agent(
@@ -131,6 +133,17 @@ class TestCrewEvaluator:
         # Ensure the console prints the table
         console.assert_has_calls([mock.call(), mock.call().print(table())])

+    def test_custom_llm_support(self):
+        agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
+        task = Task(description="Task 1", expected_output="Output 1", agent=agent)
+        crew = Crew(agents=[agent], tasks=[task])
+
+        custom_llm = LLM(model="custom-model")
+        evaluator = CrewEvaluator(crew, llm=custom_llm)
+
+        assert evaluator.llm.model == "custom-model"
+        assert isinstance(evaluator.llm, LLM)
+
     def test_evaluate(self, crew_planner):
         task_output = TaskOutput(
             description="Task 1", agent=str(crew_planner.crew.agents[0])