crewAI/tests/utilities/evaluators/test_custom_llm_support.py

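"""Tests for custom LLM support in ``Crew.test`` and ``CrewEvaluator``.

Covers passing either a model-name string or an ``LLM`` instance, backward
compatibility with the ``openai_model_name`` parameter, and the error raised
when neither is provided. Uses the ``mocker`` fixture from the pytest-mock
plugin to stub out real LLM calls.
"""
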
import pytest
from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator


@pytest.mark.parametrize("model_input", [
    "gpt-4",  # Test string model name
    LLM(model="gpt-4"),  # Test LLM instance
])
def test_crew_test_with_custom_llm(model_input, mocker):
    # Mock LLM call to return valid JSON
    mocker.patch('crewai.llm.LLM.call', return_value='{"quality": 9.0}')

    # Setup
    agent = Agent(
        role="test",
        goal="test",
        backstory="test",
        llm=LLM(model="gpt-4"),
    )
    task = Task(
        description="test",
        expected_output="test output",
        agent=agent,
    )
    crew = Crew(agents=[agent], tasks=[task])

    # Test with provided model input
    crew.test(n_iterations=1, llm=model_input)

    # Test backward compatibility
    crew.test(n_iterations=1, openai_model_name="gpt-4")

    # Test error when neither parameter is provided
    with pytest.raises(ValueError, match="Either llm or openai_model_name must be provided"):
        crew.test(n_iterations=1)


def test_crew_evaluator_with_custom_llm():
    # Setup
    agent = Agent(
        role="test",
        goal="test",
        backstory="test",
        llm=LLM(model="gpt-4"),
    )
    task = Task(
        description="test",
        expected_output="test output",
        agent=agent,
    )
    crew = Crew(agents=[agent], tasks=[task])

    # Test with string model name
    evaluator = CrewEvaluator(crew, "gpt-4")
    assert isinstance(evaluator.llm, LLM)
    assert evaluator.llm.model == "gpt-4"

    # Test with LLM instance
    custom_llm = LLM(model="gpt-4")
    evaluator = CrewEvaluator(crew, custom_llm)
    assert evaluator.llm == custom_llm

    # Test that evaluator agent uses the correct LLM
    evaluator_agent = evaluator._evaluator_agent()
    assert evaluator_agent.llm == evaluator.llm
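
# The assertions above imply that CrewEvaluator normalizes its second
# argument roughly like this (a sketch of the assumed constructor logic,
# not the actual implementation):
#
#     self.llm = llm if isinstance(llm, LLM) else LLM(model=llm)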