Added functionality to have any llm run test functionality (#2071)

* Added functionality to have any llm run test functionality

* Fixed lint issues

* Fixed Linting issues

* Fixed unit test case

* Fixed unit test

* Fixed test case

* Fixed unit test case

---------

Co-authored-by: Brandon Hancock (bhancock_ai) <109994880+bhancockio@users.noreply.github.com>
Author: Vidit Ostwal
Date: 2025-02-18 22:15:26 +05:30
Committed by: GitHub
Parent: b6d668fc66
Commit: ac819bcb6e
3 changed files with 19 additions and 11 deletions

View File

@@ -1148,19 +1148,24 @@ class Crew(BaseModel):
     def test(
         self,
         n_iterations: int,
-        openai_model_name: Optional[str] = None,
+        eval_llm: Union[str, InstanceOf[LLM]],
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
         test_crew = self.copy()
+        eval_llm = create_llm(eval_llm)
+        if not eval_llm:
+            raise ValueError("Failed to create LLM instance.")
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
+            eval_llm.model,  # type: ignore[arg-type]
         )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+        evaluator = CrewEvaluator(test_crew, eval_llm)  # type: ignore[arg-type]
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
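
For reference, a minimal usage sketch of the updated signature (the agent, task, and crew below are hypothetical; only the test(...) calls reflect this diff). eval_llm now accepts either a model-name string or a configured LLM instance, and create_llm normalizes either form before evaluation:

    from crewai import Agent, Crew, Task
    from crewai.llm import LLM

    # Hypothetical crew setup, just to make the example self-contained.
    researcher = Agent(role="Researcher", goal="Research {topic}", backstory="Curious analyst")
    summary = Task(description="Summarize {topic}", expected_output="A short summary", agent=researcher)
    demo_crew = Crew(agents=[researcher], tasks=[summary])

    # A plain model-name string still works; create_llm wraps it in an LLM instance...
    demo_crew.test(n_iterations=2, eval_llm="gpt-4o-mini", inputs={"topic": "AI"})

    # ...and any pre-configured LLM (not only OpenAI) can now drive the evaluation.
    demo_crew.test(n_iterations=2, eval_llm=LLM(model="gpt-4o-mini", temperature=0), inputs={"topic": "AI"})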

View File

@@ -1,11 +1,12 @@
 from collections import defaultdict
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, InstanceOf
 from rich.box import HEAVY_EDGE
 from rich.console import Console
 from rich.table import Table
 
 from crewai.agent import Agent
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
@@ -23,7 +24,7 @@ class CrewEvaluator:
     Attributes:
         crew (Crew): The crew of agents to evaluate.
-        openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
+        eval_llm (LLM): Language model instance to use for evaluations
         tasks_scores (defaultdict): A dictionary to store the scores of the agents for each task.
         iteration (int): The current iteration of the evaluation.
     """
@@ -32,9 +33,9 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, eval_llm: InstanceOf[LLM]):
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = eval_llm
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
@@ -51,7 +52,7 @@ class CrewEvaluator:
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )
 
     def _evaluation_task(
@@ -181,7 +182,7 @@ class CrewEvaluator:
             self.crew,
             evaluation_result.pydantic.quality,
             current_task.execution_duration,
-            self.openai_model_name,
+            self.llm.model,
         )
         self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
         self.run_execution_times[self.iteration].append(
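
A minimal sketch of the evaluator wiring after this change, assuming an existing Crew instance named my_crew (CrewEvaluator is normally constructed for you inside Crew.test; the import path below is an assumption based on the crewAI source layout):

    from crewai.llm import LLM
    # Import path assumed; CrewEvaluator lives in crewAI's evaluator utilities.
    from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

    eval_llm = LLM(model="gpt-4o-mini")           # any provider supported by LLM, not only OpenAI
    evaluator = CrewEvaluator(my_crew, eval_llm)  # evaluator agent now runs with llm=eval_llm
    evaluator.set_iteration(1)

The evaluator no longer receives a bare model-name string; it stores the LLM object and passes it straight to the evaluator agent (llm=self.llm) and to telemetry as self.llm.model.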

View File

@@ -15,6 +15,7 @@ from crewai.agents.cache import CacheHandler
 from crewai.crew import Crew
 from crewai.crews.crew_output import CrewOutput
 from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
+from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
 from crewai.process import Process
 from crewai.project import crew
@@ -3341,7 +3342,8 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
     copy_mock.return_value = crew
 
     n_iterations = 2
-    crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
+    llm_instance = LLM('gpt-4o-mini')
+    crew.test(n_iterations, llm_instance, inputs={"topic": "AI"})
 
     # Ensure kickoff is called on the copied crew
     kickoff_mock.assert_has_calls(
@@ -3350,7 +3352,7 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
     crew_evaluator.assert_has_calls(
         [
-            mock.call(crew, "gpt-4o-mini"),
+            mock.call(crew, llm_instance),
             mock.call().set_iteration(1),
             mock.call().set_iteration(2),
             mock.call().print_crew_evaluation_result(),