Compare commits

..

2 Commits

Author: Devin AI | SHA1: 306f6e1e8f | Date: 2025-05-15 16:51:35 +00:00
  Address PR review feedback: Move model list to class constant, optimize methods, add docstrings, enhance tests
  Co-Authored-By: Joe Moura <joao@crewai.com>

Author: Devin AI | SHA1: 52a471609a | Date: 2025-05-15 16:46:33 +00:00
  Fix issue #2843: Exclude stop parameter for models that don't support it
  Co-Authored-By: Joe Moura <joao@crewai.com>
6 changed files with 71 additions and 130 deletions

View File

@@ -4,7 +4,6 @@ import uuid
import warnings
from concurrent.futures import Future
from hashlib import md5
from crewai.llm import LLM
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from pydantic import (
@@ -1076,36 +1075,19 @@ class Crew(BaseModel):
def test(
self,
n_iterations: int,
llm: Union[str, LLM],
openai_model_name: Optional[str] = None,
inputs: Optional[Dict[str, Any]] = None,
) -> None:
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
Args:
n_iterations: Number of test iterations to run
llm: Language model to use for evaluation. Can be either a model name string (e.g. "gpt-4")
or an LLM instance for custom implementations
inputs: Optional dictionary of input values to use for task execution
Example:
```python
# Using model name string
crew.test(n_iterations=3, llm="gpt-4")
# Using custom LLM implementation
custom_llm = LLM(model="custom-model")
crew.test(n_iterations=3, llm=custom_llm)
```
"""
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
test_crew = self.copy()
self._test_execution_span = test_crew._telemetry.test_execution_span(
test_crew,
n_iterations,
inputs,
str(llm) if isinstance(llm, LLM) else llm,
)
evaluator = CrewEvaluator(test_crew, llm)
openai_model_name, # type: ignore[arg-type]
) # type: ignore[arg-type]
evaluator = CrewEvaluator(test_crew, openai_model_name) # type: ignore[arg-type]
for i in range(1, n_iterations + 1):
evaluator.set_iteration(i)

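The hunk above reverts `Crew.test()` from accepting an `llm` argument (string or `LLM` instance) back to a plain `openai_model_name` string. A minimal usage sketch of the reverted signature follows; the agent role, task text, and model name are illustrative placeholders, not taken from the diff, and running it would require a configured model backend:

```python
from crewai import Agent, Crew, Task

# Illustrative agent/task/crew; names and prompts are placeholders.
researcher = Agent(
    role="Researcher",
    goal="Summarize a topic",
    backstory="An analyst who writes short summaries.",
)
task = Task(
    description="Summarize the topic: {topic}",
    expected_output="A short summary",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])

# Per the reverted signature, the evaluation model is passed as a plain
# model-name string rather than an LLM instance.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
```
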
View File

@@ -92,6 +92,8 @@ def suppress_warnings():
class LLM:
MODELS_WITHOUT_STOP_SUPPORT = ["o3", "o3-mini", "o4-mini"]
def __init__(
self,
model: str,
@@ -155,7 +157,7 @@ class LLM:
"temperature": self.temperature,
"top_p": self.top_p,
"n": self.n,
"stop": self.stop,
"stop": self.stop if self.supports_stop_words() else None,
"max_tokens": self.max_tokens or self.max_completion_tokens,
"presence_penalty": self.presence_penalty,
"frequency_penalty": self.frequency_penalty,
@@ -193,6 +195,19 @@ class LLM:
return False
def supports_stop_words(self) -> bool:
"""
Determines whether the current model supports the 'stop' parameter.
This method checks if the model is in the list of models known not to support
stop words, and if not, it queries the litellm library to determine if the
model supports the 'stop' parameter.
Returns:
bool: True if the model supports stop words, False otherwise.
"""
if any(self.model.startswith(model) for model in self.MODELS_WITHOUT_STOP_SUPPORT):
return False
try:
params = get_supported_openai_params(model=self.model)
return "stop" in params

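The new `supports_stop_words()` method first consults the `MODELS_WITHOUT_STOP_SUPPORT` prefix list and only then falls back to litellm's capability lookup; the hunk above ends before the `except` branch, so the fallback below is an assumption. A standalone sketch of the same pattern, assuming `litellm` is installed and that an unresolvable model should keep the old behavior of passing `stop` through:

```python
from litellm import get_supported_openai_params

# Mirrors the class constant introduced in the diff.
MODELS_WITHOUT_STOP_SUPPORT = ["o3", "o3-mini", "o4-mini"]

def supports_stop_words(model: str) -> bool:
    """Return True if the model is believed to accept a 'stop' parameter."""
    # Known exceptions are short-circuited before asking litellm.
    if any(model.startswith(prefix) for prefix in MODELS_WITHOUT_STOP_SUPPORT):
        return False
    try:
        params = get_supported_openai_params(model=model)
        return params is not None and "stop" in params
    except Exception:
        # Assumption: if litellm cannot resolve the model, default to
        # supporting 'stop' rather than silently dropping it.
        return True

print(supports_stop_words("o3-mini"))  # False: matched by the prefix list
print(supports_stop_words("gpt-4"))    # True: litellm reports 'stop' support
```
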
View File

@@ -1,16 +1,10 @@
from collections import defaultdict
from typing import Any, Dict, List, Optional, TypeVar, Union
from typing import DefaultDict # Separate import to avoid circular imports
from pydantic import BaseModel, Field
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table
from crewai.llm import LLM
T = TypeVar('T', bound=LLM)
from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
@@ -34,47 +28,14 @@ class CrewEvaluator:
iteration (int): The current iteration of the evaluation.
"""
_tasks_scores: DefaultDict[int, List[float]] = Field(
default_factory=lambda: defaultdict(list))
_run_execution_times: DefaultDict[int, List[float]] = Field(
default_factory=lambda: defaultdict(list))
tasks_scores: defaultdict = defaultdict(list)
run_execution_times: defaultdict = defaultdict(list)
iteration: int = 0
@property
def tasks_scores(self) -> DefaultDict[int, List[float]]:
return self._tasks_scores
@tasks_scores.setter
def tasks_scores(self, value: Dict[int, List[float]]) -> None:
self._tasks_scores = defaultdict(list, value)
@property
def run_execution_times(self) -> DefaultDict[int, List[float]]:
return self._run_execution_times
@run_execution_times.setter
def run_execution_times(self, value: Dict[int, List[float]]) -> None:
self._run_execution_times = defaultdict(list, value)
def __init__(self, crew, llm: Union[str, T]):
"""Initialize the CrewEvaluator.
Args:
crew: The Crew instance to evaluate
llm: Language model to use for evaluation. Can be either a model name string
or an LLM instance for custom implementations
Raises:
ValueError: If llm is None or invalid
"""
if not llm:
raise ValueError("Invalid LLM configuration")
def __init__(self, crew, openai_model_name: str):
self.crew = crew
self.llm = LLM(model=llm) if isinstance(llm, str) else llm
self.openai_model_name = openai_model_name
self._telemetry = Telemetry()
self._tasks_scores = defaultdict(list)
self._run_execution_times = defaultdict(list)
self._setup_for_evaluating()
def _setup_for_evaluating(self) -> None:
@@ -90,7 +51,7 @@ class CrewEvaluator:
),
backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
verbose=False,
llm=self.llm,
llm=self.openai_model_name,
)
def _evaluation_task(
@@ -220,19 +181,11 @@ class CrewEvaluator:
self.crew,
evaluation_result.pydantic.quality,
current_task._execution_time,
self._get_llm_identifier(),
self.openai_model_name,
)
self._tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self._run_execution_times[self.iteration].append(
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self.run_execution_times[self.iteration].append(
current_task._execution_time
)
else:
raise ValueError("Evaluation result is not in the expected format")
def _get_llm_identifier(self) -> str:
"""Get a string identifier for the LLM instance.
Returns:
String representation of the LLM for telemetry
"""
return str(self.llm) if isinstance(self.llm, LLM) else self.llm

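With the constructor reverted above, `CrewEvaluator` again takes a model-name string instead of an `LLM` instance, and the `_get_llm_identifier()` helper and property wrappers go away. A brief wiring sketch mirroring the call sites in the diff; the import path and the minimal crew below are assumptions for illustration:

```python
from crewai import Agent, Crew, Task
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

# Minimal illustrative crew, as in the earlier sketch.
agent = Agent(role="Researcher", goal="Summarize a topic", backstory="Writes short summaries.")
task = Task(description="Summarize {topic}", expected_output="A short summary", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# The evaluator now receives the model name as a string, matching the diff.
evaluator = CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
evaluator.set_iteration(1)
```
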
View File

@@ -10,7 +10,6 @@ import instructor
import pydantic_core
import pytest
from crewai.llm import LLM
from crewai.agent import Agent
from crewai.agents.cache import CacheHandler
from crewai.crew import Crew
@@ -1124,7 +1123,7 @@ def test_kickoff_for_each_empty_input():
assert results == []
@pytest.mark.vcr(filter_headeruvs=["authorization"])
@pytest.mark.vcr(filter_headers=["authorization"])
def test_kickoff_for_each_invalid_input():
"""Tests if kickoff_for_each raises TypeError for invalid input types."""
@@ -2829,7 +2828,7 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
copy_mock.return_value = crew
n_iterations = 2
crew.test(n_iterations, llm="gpt-4o-mini", inputs={"topic": "AI"})
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
# Ensure kickoff is called on the copied crew
kickoff_mock.assert_has_calls(
@@ -2845,32 +2844,6 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
]
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_with_custom_llm(kickoff_mock, copy_mock, crew_evaluator):
task = Task(
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])
copy_mock.return_value = crew
custom_llm = LLM(model="gpt-4")
crew.test(2, llm=custom_llm, inputs={"topic": "AI"})
kickoff_mock.assert_has_calls([
mock.call(inputs={"topic": "AI"}),
mock.call(inputs={"topic": "AI"})
])
crew_evaluator.assert_has_calls([
mock.call(crew, custom_llm),
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),
])
@pytest.mark.vcr(filter_headers=["authorization"])
def test_hierarchical_verbose_manager_agent():
@@ -3152,4 +3125,4 @@ def test_multimodal_agent_live_image_analysis():
# Verify we got a meaningful response
assert isinstance(result.raw, str)
assert len(result.raw) > 100 # Expecting a detailed analysis
assert "error" not in result.raw.lower() # No error messages in response
assert "error" not in result.raw.lower() # No error messages in response

View File

@@ -28,3 +28,41 @@ def test_llm_callback_replacement():
assert usage_metrics_1.successful_requests == 1
assert usage_metrics_2.successful_requests == 1
assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()
class TestLLMStopWords:
"""Tests for LLM stop words functionality."""
def test_supports_stop_words_for_o3_model(self):
"""Test that supports_stop_words returns False for o3 model."""
llm = LLM(model="o3")
assert not llm.supports_stop_words()
def test_supports_stop_words_for_o4_mini_model(self):
"""Test that supports_stop_words returns False for o4-mini model."""
llm = LLM(model="o4-mini")
assert not llm.supports_stop_words()
def test_supports_stop_words_for_supported_model(self):
"""Test that supports_stop_words returns True for models that support stop words."""
llm = LLM(model="gpt-4")
assert llm.supports_stop_words()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_excludes_stop_parameter_for_unsupported_models(self, monkeypatch):
"""Test that the LLM.call method excludes the stop parameter for models that don't support it."""
def mock_completion(**kwargs):
assert 'stop' not in kwargs, "Stop parameter should be excluded for o3 model"
assert 'model' in kwargs, "Model parameter should be included"
assert 'messages' in kwargs, "Messages parameter should be included"
return {"choices": [{"message": {"content": "Hello, World!"}}]}
monkeypatch.setattr("litellm.completion", mock_completion)
llm = LLM(model="o3")
llm.stop = ["STOP"]
messages = [{"role": "user", "content": "Say 'Hello, World!'"}]
response = llm.call(messages)
assert response == "Hello, World!"

View File

@@ -2,7 +2,6 @@ from unittest import mock
import pytest
from crewai.llm import LLM
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.task import Task
@@ -24,7 +23,7 @@ class TestCrewEvaluator:
)
crew = Crew(agents=[agent], tasks=[task])
return CrewEvaluator(crew, llm="gpt-4o-mini")
return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
def test_setup_for_evaluating(self, crew_planner):
crew_planner._setup_for_evaluating()
@@ -48,25 +47,6 @@ class TestCrewEvaluator:
assert agent.verbose is False
assert agent.llm.model == "gpt-4o-mini"
@pytest.mark.parametrize("llm_input,expected_model", [
(LLM(model="gpt-4"), "gpt-4"),
("gpt-4", "gpt-4"),
])
def test_evaluator_with_llm_types(self, crew_planner, llm_input, expected_model):
evaluator = CrewEvaluator(crew_planner.crew, llm_input)
agent = evaluator._evaluator_agent()
assert agent.llm.model == expected_model
def test_evaluator_with_invalid_llm(self, crew_planner):
with pytest.raises(ValueError, match="Invalid LLM configuration"):
CrewEvaluator(crew_planner.crew, None)
def test_evaluator_with_string_llm(self, crew_planner):
evaluator = CrewEvaluator(crew_planner.crew, "gpt-4")
agent = evaluator._evaluator_agent()
assert isinstance(agent.llm, LLM)
assert agent.llm.model == "gpt-4"
def test_evaluation_task(self, crew_planner):
evaluator_agent = Agent(
role="Evaluator Agent",