Compare commits


9 Commits

Author    SHA1        Message (Date)
Devin AI  dd38554b70  fix: update type hints and imports for defaultdict (2025-02-09 21:47:50 +00:00)
Devin AI  5e528416ec  feat: improve llm handling and error validation (2025-02-09 21:46:23 +00:00)
Devin AI  a097d933f6  fix: remove duplicate LLM import (2025-02-09 21:37:08 +00:00)
Devin AI  7c2c7575ed  chore: update test cassettes and lock file (2025-02-09 21:36:18 +00:00)
Devin AI  5205021e94  test: update test assertions to use llm parameter (2025-02-09 21:35:08 +00:00)
Devin AI  4af5d0801b  fix: reorder model_name definition (2025-02-09 21:33:57 +00:00)
Devin AI  2086a4b530  fix: update backward compatibility test (2025-02-09 21:32:44 +00:00)
Devin AI  16e558056a  test: fix telemetry and task validation (2025-02-09 21:31:57 +00:00)
Devin AI  0068137974  test: fix test assertions for llm parameter (2025-02-09 21:31:08 +00:00)

All nine commits are Co-Authored-By: Joe Moura <joao@crewai.com>.
4 changed files with 118 additions and 104 deletions

View File

@@ -1077,33 +1077,41 @@ class Crew(BaseModel):
def test(
self,
n_iterations: int,
openai_model_name: Optional[str] = None,
llm: Optional[Union[str, LLM]] = None,
llm: Optional[Union[str, InstanceOf[LLM], Any]] = None,
openai_model_name: Optional[str] = None, # For backward compatibility
inputs: Optional[Dict[str, Any]] = None,
) -> None:
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
"""Test and evaluate the Crew with the given inputs for n iterations.
This method runs tests to evaluate the performance of the crew using the specified
language model. It supports both string model names and LLM instances for flexibility.
Args:
n_iterations: Number of iterations to run the test
openai_model_name: Name of OpenAI model to use (deprecated, use llm instead)
llm: LLM instance or model name to use for evaluation
inputs: Optional inputs to pass to the crew
n_iterations: Number of test iterations to run
llm: Language model configuration (preferred). Can be:
- A string model name (e.g., "gpt-4")
- An LLM instance
- Any object with model_name or deployment_name attributes
openai_model_name: Legacy parameter for backward compatibility.
Deprecated: Will be removed in future versions. Use `llm` instead.
inputs: Optional dictionary of inputs to be used during testing
Note:
The `openai_model_name` parameter is deprecated and will be removed in
future versions. Use the more flexible `llm` parameter instead, which
supports any LLM implementation.
"""
test_crew = self.copy()
# Convert string to LLM instance if needed
if isinstance(llm, str):
llm = LLM(model=llm)
elif openai_model_name:
llm = LLM(model=openai_model_name)
# For backward compatibility, convert openai_model_name to llm
model_name = llm or openai_model_name or "gpt-4o-mini"
self._test_execution_span = test_crew._telemetry.test_execution_span(
test_crew,
n_iterations,
inputs,
getattr(llm, "model", openai_model_name),
model_name,
)
evaluator = CrewEvaluator(test_crew, llm)
evaluator = CrewEvaluator(test_crew, llm=model_name)
for i in range(1, n_iterations + 1):
evaluator.set_iteration(i)

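For reference, the reworked `Crew.test()` above keeps `openai_model_name` working while preferring the new `llm` parameter. A minimal usage sketch of the call forms implied by the signature and docstring (the agent, task, and inputs are illustrative placeholders; running this for real requires model credentials):

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task

researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="Placeholder backstory")
task = Task(description="Write a short summary", expected_output="A short summary", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])

# Preferred: pass `llm` as a model-name string or as an LLM instance.
crew.test(n_iterations=2, llm="gpt-4o-mini", inputs={"topic": "AI"})
crew.test(n_iterations=2, llm=LLM(model="gpt-4o-mini"), inputs={"topic": "AI"})

# Still accepted for backward compatibility, but deprecated.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})

If neither parameter is given, the `model_name = llm or openai_model_name or "gpt-4o-mini"` line above falls back to "gpt-4o-mini" for the evaluation.
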
View File

@@ -1,7 +1,13 @@
from collections import defaultdict
from typing import Union
from typing import Any, Dict, List, Union
from pydantic import BaseModel, Field
from pydantic import (
BaseModel,
Field,
InstanceOf,
PrivateAttr,
model_validator,
)
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table
@@ -19,27 +25,74 @@ class TaskEvaluationPydanticOutput(BaseModel):
)
class CrewEvaluator:
class CrewEvaluator(BaseModel):
"""
A class to evaluate the performance of the agents in the crew based on the tasks they have performed.
Attributes:
crew (Crew): The crew of agents to evaluate.
openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
llm (Union[str, InstanceOf[LLM], Any]): The language model to use for evaluating the performance of the agents.
tasks_scores (defaultdict): A dictionary to store the scores of the agents for each task.
iteration (int): The current iteration of the evaluation.
"""
tasks_scores: defaultdict = defaultdict(list)
run_execution_times: defaultdict = defaultdict(list)
iteration: int = 0
crew: Any = Field(description="The crew of agents to evaluate.")
llm: Union[str, InstanceOf[LLM], Any] = Field(
description="Language model that will run the evaluation."
)
tasks_scores: Dict[int, List[float]] = Field(
default_factory=lambda: defaultdict(list),
description="Dictionary to store the scores of the agents for each task."
)
run_execution_times: Dict[int, List[int]] = Field(
default_factory=lambda: defaultdict(list),
description="Dictionary to store execution times for each run."
)
iteration: int = Field(
default=0,
description="Current iteration of the evaluation."
)
def __init__(self, crew, llm: Union[str, LLM]):
self.crew = crew
self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)
self._telemetry = Telemetry()
@model_validator(mode="after")
def validate_llm(self):
"""Validates that the LLM is properly configured."""
if not self.llm:
raise ValueError("LLM configuration is required")
return self
_telemetry: Telemetry = PrivateAttr(default_factory=Telemetry)
def __init__(self, crew, llm: Union[str, InstanceOf[LLM], Any]):
# Initialize Pydantic model with validated fields
super().__init__(crew=crew, llm=llm)
self._setup_for_evaluating()
@model_validator(mode="before")
def init_llm(cls, values):
"""Initialize LLM before Pydantic validation."""
llm = values.get("llm")
try:
if isinstance(llm, str):
values["llm"] = LLM(model=llm)
elif isinstance(llm, LLM):
values["llm"] = llm
else:
# For any other type, attempt to extract relevant attributes
llm_params = {
"model": getattr(llm, "model_name", None)
or getattr(llm, "deployment_name", None)
or str(llm),
"temperature": getattr(llm, "temperature", None),
"max_tokens": getattr(llm, "max_tokens", None),
"timeout": getattr(llm, "timeout", None),
}
# Remove None values
llm_params = {k: v for k, v in llm_params.items() if v is not None}
values["llm"] = LLM(**llm_params)
except Exception as e:
raise ValueError(f"Invalid LLM configuration: {str(e)}") from e
return values
def _setup_for_evaluating(self) -> None:
"""Sets up the crew for evaluating."""
for task in self.crew.tasks:
@@ -183,7 +236,7 @@ class CrewEvaluator:
self.crew,
evaluation_result.pydantic.quality,
current_task._execution_time,
getattr(self.llm, "model", None),
self.llm.model if isinstance(self.llm, LLM) else self.llm,
)
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self.run_execution_times[self.iteration].append(

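The `init_llm` validator above normalizes three input shapes into an `LLM` before Pydantic validation runs. A short sketch of what that looks like from the caller's side (the `FakeProviderLLM` class is a hypothetical stand-in for a third-party client that only exposes `model_name` and `temperature`; the agent, task, and crew mirror the fixtures used in the tests below):

from dataclasses import dataclass

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# 1. A plain model name is wrapped into LLM(model=...).
string_eval = CrewEvaluator(crew, llm="gpt-4o-mini")

# 2. An existing LLM instance passes through unchanged.
instance_eval = CrewEvaluator(crew, llm=LLM(model="gpt-4o-mini"))

# 3. Any other object is inspected for model_name/deployment_name (plus optional
#    temperature, max_tokens, timeout) and converted into an LLM.
@dataclass
class FakeProviderLLM:  # hypothetical duck-typed client, not part of the diff
    model_name: str = "gpt-4o-mini"
    temperature: float = 0.2

duck_eval = CrewEvaluator(crew, llm=FakeProviderLLM())

# In every case the evaluator ends up holding a concrete LLM.
assert isinstance(string_eval.llm, LLM)
assert isinstance(duck_eval.llm, LLM)

A conversion failure surfaces as `ValueError("Invalid LLM configuration: ...")` rather than a raw Pydantic error, per the except branch of `init_llm`.
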
View File

@@ -14,7 +14,6 @@ from crewai.agent import Agent
from crewai.agents.cache import CacheHandler
from crewai.crew import Crew
from crewai.crews.crew_output import CrewOutput
from crewai.llm import LLM
from crewai.memory.contextual.contextual_memory import ContextualMemory
from crewai.process import Process
from crewai.task import Task
@@ -301,6 +300,15 @@ def test_hierarchical_process():
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
def test_crew_test_backward_compatibility(mock_copy, mock_evaluator):
crew = Crew(agents=[researcher], tasks=[Task(description="test", expected_output="test output", agent=researcher)])
crew.test(2, openai_model_name="gpt-4")
mock_evaluator.assert_called_once()
_, kwargs = mock_evaluator.call_args
assert kwargs["llm"] == "gpt-4"
def test_manager_llm_requirement_for_hierarchical_process():
task = Task(
description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
@@ -2813,11 +2821,10 @@ def test_conditional_should_execute():
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function_with_openai_model_name(kickoff_mock, copy_mock, crew_evaluator):
"""Test backward compatibility with openai_model_name parameter."""
def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
task = Task(
description="Test task",
expected_output="Test output",
description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
expected_output="5 bullet points with a paragraph for each idea.",
agent=researcher,
)
@@ -2826,87 +2833,20 @@ def test_crew_testing_function_with_openai_model_name(kickoff_mock, copy_mock, c
tasks=[task],
)
# Create a mock for the copied crew
copy_mock.return_value = crew
n_iterations = 2
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
# Ensure kickoff is called on the copied crew
kickoff_mock.assert_has_calls(
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
)
crew_evaluator.assert_has_calls(
[
mock.call(crew, mock.ANY), # ANY because we convert to LLM instance
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),
]
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function_with_llm_instance(kickoff_mock, copy_mock, crew_evaluator):
"""Test using LLM instance parameter."""
task = Task(
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(
agents=[researcher],
tasks=[task],
)
copy_mock.return_value = crew
llm = LLM(model="gpt-4o-mini")
n_iterations = 2
crew.test(n_iterations, llm=llm, inputs={"topic": "AI"})
kickoff_mock.assert_has_calls(
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
)
crew_evaluator.assert_has_calls(
[
mock.call(crew, llm),
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),
]
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function_with_llm_string(kickoff_mock, copy_mock, crew_evaluator):
"""Test using LLM string parameter."""
task = Task(
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(
agents=[researcher],
tasks=[task],
)
copy_mock.return_value = crew
n_iterations = 2
crew.test(n_iterations, llm="gpt-4o-mini", inputs={"topic": "AI"})
kickoff_mock.assert_has_calls(
[mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
)
crew_evaluator.assert_has_calls(
[
mock.call(crew, mock.ANY), # ANY because we don't care about the LLM instance details
mock.call(crew, llm="gpt-4o-mini"),
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),

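Not part of this diff, but the "object with model_name or deployment_name attributes" form described in the new docstring has no direct coverage in crew_test.py. A hypothetical companion test in the same mocked style as the cases above could look like this (the test name and FakeProviderLLM are invented for illustration; `researcher`, `Task`, `Crew`, and `mock` come from the existing module):

@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function_with_custom_llm_object(kickoff_mock, copy_mock, crew_evaluator):
    """Hypothetical: pass a duck-typed object exposing model_name to crew.test()."""
    task = Task(
        description="Test task",
        expected_output="Test output",
        agent=researcher,
    )
    crew = Crew(agents=[researcher], tasks=[task])
    copy_mock.return_value = crew

    class FakeProviderLLM:  # hypothetical stand-in for a third-party client
        model_name = "gpt-4o-mini"

    crew.test(2, llm=FakeProviderLLM(), inputs={"topic": "AI"})

    # The copied crew is kicked off once per iteration, and the evaluator is
    # constructed with whatever object was handed to `llm`.
    kickoff_mock.assert_has_calls(
        [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
    )
    crew_evaluator.assert_called_once()
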
View File

@@ -4,6 +4,7 @@ import pytest
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
from crewai.utilities.evaluators.crew_evaluator_handler import (
@@ -23,7 +24,7 @@ class TestCrewEvaluator:
)
crew = Crew(agents=[agent], tasks=[task])
return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
return CrewEvaluator(crew, llm="gpt-4o-mini")
def test_setup_for_evaluating(self, crew_planner):
crew_planner._setup_for_evaluating()
@@ -46,6 +47,7 @@ class TestCrewEvaluator:
)
assert agent.verbose is False
assert agent.llm.model == "gpt-4o-mini"
assert isinstance(agent.llm, LLM)
def test_evaluation_task(self, crew_planner):
evaluator_agent = Agent(
@@ -131,6 +133,17 @@ class TestCrewEvaluator:
# Ensure the console prints the table
console.assert_has_calls([mock.call(), mock.call().print(table())])
def test_custom_llm_support(self):
agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
task = Task(description="Task 1", expected_output="Output 1", agent=agent)
crew = Crew(agents=[agent], tasks=[task])
custom_llm = LLM(model="custom-model")
evaluator = CrewEvaluator(crew, llm=custom_llm)
assert evaluator.llm.model == "custom-model"
assert isinstance(evaluator.llm, LLM)
def test_evaluate(self, crew_planner):
task_output = TaskOutput(
description="Task 1", agent=str(crew_planner.crew.agents[0])