Compare commits

..

2 Commits

Author: Devin AI | SHA1: 306f6e1e8f | Date: 2025-05-15 16:51:35 +00:00
  Address PR review feedback: Move model list to class constant, optimize methods, add docstrings, enhance tests
  Co-Authored-By: Joe Moura <joao@crewai.com>

Author: Devin AI | SHA1: 52a471609a | Date: 2025-05-15 16:46:33 +00:00
  Fix issue #2843: Exclude stop parameter for models that don't support it
  Co-Authored-By: Joe Moura <joao@crewai.com>
6 changed files with 71 additions and 130 deletions

View File

@@ -4,7 +4,6 @@ import uuid
import warnings
from concurrent.futures import Future
from hashlib import md5
from crewai.llm import LLM
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from pydantic import (
@@ -1076,36 +1075,19 @@ class Crew(BaseModel):
def test(
self,
n_iterations: int,
llm: Union[str, LLM],
openai_model_name: Optional[str] = None,
inputs: Optional[Dict[str, Any]] = None,
) -> None:
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures.
Args:
n_iterations: Number of test iterations to run
llm: Language model to use for evaluation. Can be either a model name string (e.g. "gpt-4")
or an LLM instance for custom implementations
inputs: Optional dictionary of input values to use for task execution
Example:
```python
# Using model name string
crew.test(n_iterations=3, llm="gpt-4")
# Using custom LLM implementation
custom_llm = LLM(model="custom-model")
crew.test(n_iterations=3, llm=custom_llm)
```
"""
"""Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
test_crew = self.copy()
self._test_execution_span = test_crew._telemetry.test_execution_span(
test_crew,
n_iterations,
inputs,
str(llm) if isinstance(llm, LLM) else llm,
)
evaluator = CrewEvaluator(test_crew, llm)
openai_model_name, # type: ignore[arg-type]
) # type: ignore[arg-type]
evaluator = CrewEvaluator(test_crew, openai_model_name) # type: ignore[arg-type]
for i in range(1, n_iterations + 1):
evaluator.set_iteration(i)

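The hunk above reverts `Crew.test()` from accepting an `llm` argument (string or `LLM` instance) back to a plain `openai_model_name` string. A minimal usage sketch of the reverted signature follows; the agent role, task text, and model name are illustrative placeholders, not taken from the diff, and running it would require a configured model backend:

```python
from crewai import Agent, Crew, Task

# Illustrative agent/task/crew; names and prompts are placeholders.
researcher = Agent(
    role="Researcher",
    goal="Summarize a topic",
    backstory="An analyst who writes short summaries.",
)
task = Task(
    description="Summarize the topic: {topic}",
    expected_output="A short summary",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])

# Per the reverted signature, the evaluation model is passed as a plain
# model-name string rather than an LLM instance.
crew.test(n_iterations=2, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
```
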
View File

@@ -92,6 +92,8 @@ def suppress_warnings():
class LLM:
MODELS_WITHOUT_STOP_SUPPORT = ["o3", "o3-mini", "o4-mini"]
def __init__(
self,
model: str,
@@ -155,7 +157,7 @@ class LLM:
"temperature": self.temperature,
"top_p": self.top_p,
"n": self.n,
"stop": self.stop,
"stop": self.stop if self.supports_stop_words() else None,
"max_tokens": self.max_tokens or self.max_completion_tokens,
"presence_penalty": self.presence_penalty,
"frequency_penalty": self.frequency_penalty,
@@ -193,6 +195,19 @@ class LLM:
return False
def supports_stop_words(self) -> bool:
"""
Determines whether the current model supports the 'stop' parameter.
This method checks if the model is in the list of models known not to support
stop words, and if not, it queries the litellm library to determine if the
model supports the 'stop' parameter.
Returns:
bool: True if the model supports stop words, False otherwise.
"""
if any(self.model.startswith(model) for model in self.MODELS_WITHOUT_STOP_SUPPORT):
return False
try:
params = get_supported_openai_params(model=self.model)
return "stop" in params

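The new `supports_stop_words()` method first consults the `MODELS_WITHOUT_STOP_SUPPORT` prefix list and only then falls back to litellm's capability lookup; the hunk above ends before the `except` branch, so the fallback below is an assumption. A standalone sketch of the same pattern, assuming `litellm` is installed and that an unresolvable model should keep the old behavior of passing `stop` through:

```python
from litellm import get_supported_openai_params

# Mirrors the class constant introduced in the diff.
MODELS_WITHOUT_STOP_SUPPORT = ["o3", "o3-mini", "o4-mini"]

def supports_stop_words(model: str) -> bool:
    """Return True if the model is believed to accept a 'stop' parameter."""
    # Known exceptions are short-circuited before asking litellm.
    if any(model.startswith(prefix) for prefix in MODELS_WITHOUT_STOP_SUPPORT):
        return False
    try:
        params = get_supported_openai_params(model=model)
        return params is not None and "stop" in params
    except Exception:
        # Assumption: if litellm cannot resolve the model, default to
        # supporting 'stop' rather than silently dropping it.
        return True

print(supports_stop_words("o3-mini"))  # False: matched by the prefix list
print(supports_stop_words("gpt-4"))    # True: litellm reports 'stop' support
```
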
View File

@@ -1,16 +1,10 @@
from collections import defaultdict
from typing import Any, Dict, List, Optional, TypeVar, Union
from typing import DefaultDict # Separate import to avoid circular imports
from pydantic import BaseModel, Field
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.table import Table
from crewai.llm import LLM
T = TypeVar('T', bound=LLM)
from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
@@ -34,47 +28,14 @@ class CrewEvaluator:
iteration (int): The current iteration of the evaluation.
"""
_tasks_scores: DefaultDict[int, List[float]] = Field(
default_factory=lambda: defaultdict(list))
_run_execution_times: DefaultDict[int, List[float]] = Field(
default_factory=lambda: defaultdict(list))
tasks_scores: defaultdict = defaultdict(list)
run_execution_times: defaultdict = defaultdict(list)
iteration: int = 0
@property
def tasks_scores(self) -> DefaultDict[int, List[float]]:
return self._tasks_scores
@tasks_scores.setter
def tasks_scores(self, value: Dict[int, List[float]]) -> None:
self._tasks_scores = defaultdict(list, value)
@property
def run_execution_times(self) -> DefaultDict[int, List[float]]:
return self._run_execution_times
@run_execution_times.setter
def run_execution_times(self, value: Dict[int, List[float]]) -> None:
self._run_execution_times = defaultdict(list, value)
def __init__(self, crew, llm: Union[str, T]):
"""Initialize the CrewEvaluator.
Args:
crew: The Crew instance to evaluate
llm: Language model to use for evaluation. Can be either a model name string
or an LLM instance for custom implementations
Raises:
ValueError: If llm is None or invalid
"""
if not llm:
raise ValueError("Invalid LLM configuration")
def __init__(self, crew, openai_model_name: str):
self.crew = crew
self.llm = LLM(model=llm) if isinstance(llm, str) else llm
self.openai_model_name = openai_model_name
self._telemetry = Telemetry()
self._tasks_scores = defaultdict(list)
self._run_execution_times = defaultdict(list)
self._setup_for_evaluating()
def _setup_for_evaluating(self) -> None:
@@ -90,7 +51,7 @@ class CrewEvaluator:
),
backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
verbose=False,
llm=self.llm,
llm=self.openai_model_name,
)
def _evaluation_task(
@@ -220,19 +181,11 @@ class CrewEvaluator:
self.crew,
evaluation_result.pydantic.quality,
current_task._execution_time,
self._get_llm_identifier(),
self.openai_model_name,
)
self._tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self._run_execution_times[self.iteration].append(
self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
self.run_execution_times[self.iteration].append(
current_task._execution_time
)
else:
raise ValueError("Evaluation result is not in the expected format")
def _get_llm_identifier(self) -> str:
"""Get a string identifier for the LLM instance.
Returns:
String representation of the LLM for telemetry
"""
return str(self.llm) if isinstance(self.llm, LLM) else self.llm

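With the constructor reverted above, `CrewEvaluator` again takes a model-name string instead of an `LLM` instance, and the `_get_llm_identifier()` helper and property wrappers go away. A brief wiring sketch mirroring the call sites in the diff; the import path and the minimal crew below are assumptions for illustration:

```python
from crewai import Agent, Crew, Task
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

# Minimal illustrative crew, as in the earlier sketch.
agent = Agent(role="Researcher", goal="Summarize a topic", backstory="Writes short summaries.")
task = Task(description="Summarize {topic}", expected_output="A short summary", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# The evaluator now receives the model name as a string, matching the diff.
evaluator = CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
evaluator.set_iteration(1)
```
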
View File

@@ -10,7 +10,6 @@ import instructor
import pydantic_core
import pytest
from crewai.llm import LLM
from crewai.agent import Agent
from crewai.agents.cache import CacheHandler
from crewai.crew import Crew
@@ -1124,7 +1123,7 @@ def test_kickoff_for_each_empty_input():
assert results == []
@pytest.mark.vcr(filter_headeruvs=["authorization"])
@pytest.mark.vcr(filter_headers=["authorization"])
def test_kickoff_for_each_invalid_input():
"""Tests if kickoff_for_each raises TypeError for invalid input types."""
@@ -2829,7 +2828,7 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
copy_mock.return_value = crew
n_iterations = 2
crew.test(n_iterations, llm="gpt-4o-mini", inputs={"topic": "AI"})
crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
# Ensure kickoff is called on the copied crew
kickoff_mock.assert_has_calls(
@@ -2845,32 +2844,6 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
]
)
@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.copy")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_with_custom_llm(kickoff_mock, copy_mock, crew_evaluator):
task = Task(
description="Test task",
expected_output="Test output",
agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])
copy_mock.return_value = crew
custom_llm = LLM(model="gpt-4")
crew.test(2, llm=custom_llm, inputs={"topic": "AI"})
kickoff_mock.assert_has_calls([
mock.call(inputs={"topic": "AI"}),
mock.call(inputs={"topic": "AI"})
])
crew_evaluator.assert_has_calls([
mock.call(crew, custom_llm),
mock.call().set_iteration(1),
mock.call().set_iteration(2),
mock.call().print_crew_evaluation_result(),
])
@pytest.mark.vcr(filter_headers=["authorization"])
def test_hierarchical_verbose_manager_agent():
@@ -3152,4 +3125,4 @@ def test_multimodal_agent_live_image_analysis():
# Verify we got a meaningful response
assert isinstance(result.raw, str)
assert len(result.raw) > 100 # Expecting a detailed analysis
assert "error" not in result.raw.lower() # No error messages in response
assert "error" not in result.raw.lower() # No error messages in response

View File

@@ -28,3 +28,41 @@ def test_llm_callback_replacement():
assert usage_metrics_1.successful_requests == 1
assert usage_metrics_2.successful_requests == 1
assert usage_metrics_1 == calc_handler_1.token_cost_process.get_summary()
class TestLLMStopWords:
"""Tests for LLM stop words functionality."""
def test_supports_stop_words_for_o3_model(self):
"""Test that supports_stop_words returns False for o3 model."""
llm = LLM(model="o3")
assert not llm.supports_stop_words()
def test_supports_stop_words_for_o4_mini_model(self):
"""Test that supports_stop_words returns False for o4-mini model."""
llm = LLM(model="o4-mini")
assert not llm.supports_stop_words()
def test_supports_stop_words_for_supported_model(self):
"""Test that supports_stop_words returns True for models that support stop words."""
llm = LLM(model="gpt-4")
assert llm.supports_stop_words()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_llm_call_excludes_stop_parameter_for_unsupported_models(self, monkeypatch):
"""Test that the LLM.call method excludes the stop parameter for models that don't support it."""
def mock_completion(**kwargs):
assert 'stop' not in kwargs, "Stop parameter should be excluded for o3 model"
assert 'model' in kwargs, "Model parameter should be included"
assert 'messages' in kwargs, "Messages parameter should be included"
return {"choices": [{"message": {"content": "Hello, World!"}}]}
monkeypatch.setattr("litellm.completion", mock_completion)
llm = LLM(model="o3")
llm.stop = ["STOP"]
messages = [{"role": "user", "content": "Say 'Hello, World!'"}]
response = llm.call(messages)
assert response == "Hello, World!"

View File

@@ -2,7 +2,6 @@ from unittest import mock
import pytest
from crewai.llm import LLM
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.task import Task
@@ -24,7 +23,7 @@ class TestCrewEvaluator:
)
crew = Crew(agents=[agent], tasks=[task])
return CrewEvaluator(crew, llm="gpt-4o-mini")
return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")
def test_setup_for_evaluating(self, crew_planner):
crew_planner._setup_for_evaluating()
@@ -48,25 +47,6 @@ class TestCrewEvaluator:
assert agent.verbose is False
assert agent.llm.model == "gpt-4o-mini"
@pytest.mark.parametrize("llm_input,expected_model", [
(LLM(model="gpt-4"), "gpt-4"),
("gpt-4", "gpt-4"),
])
def test_evaluator_with_llm_types(self, crew_planner, llm_input, expected_model):
evaluator = CrewEvaluator(crew_planner.crew, llm_input)
agent = evaluator._evaluator_agent()
assert agent.llm.model == expected_model
def test_evaluator_with_invalid_llm(self, crew_planner):
with pytest.raises(ValueError, match="Invalid LLM configuration"):
CrewEvaluator(crew_planner.crew, None)
def test_evaluator_with_string_llm(self, crew_planner):
evaluator = CrewEvaluator(crew_planner.crew, "gpt-4")
agent = evaluator._evaluator_agent()
assert isinstance(agent.llm, LLM)
assert agent.llm.model == "gpt-4"
def test_evaluation_task(self, crew_planner):
evaluator_agent = Agent(
role="Evaluator Agent",