Mirror of https://github.com/crewAIInc/crewAI.git (synced 2025-12-16 20:38:29 +00:00)

Compare commits: v0.41.0 ... feat/testi (14 commits)

| SHA1 |
|---|
| 616ffe2aba |
| a6bce1089a |
| cb8fbf61de |
| 4d2cdc3d96 |
| 890c03a0a6 |
| e4b419d5be |
| 8ffc4f79fa |
| c05ef3c8cf |
| cf600c1a43 |
| 2a88d1d462 |
| 660a2ae837 |
| 6930656897 |
| 349753a013 |
| f53a3a00e1 |
@@ -220,7 +220,7 @@ These methods provide flexibility in how you manage and execute tasks within you

### Replaying from a specific task

You can now replay from a specific task using our CLI command `replay`.

-The replay_from_tasks feature in CrewAI allows you to replay from a specific task using the command-line interface (CLI). By running the command `crewai replay -t <task_id>`, you can specify the `task_id` for the replay process.
+The replay feature in CrewAI allows you to replay from a specific task using the command-line interface (CLI). By running the command `crewai replay -t <task_id>`, you can specify the `task_id` for the replay process.

Kickoffs will now save the latest kickoff's returned task outputs locally so that you can replay from them.
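For example, after a crew has been kicked off at least once, a replay can be started from one of its saved task outputs (replace `<task_id>` with a real id from the saved kickoff):

```bash
crewai replay -t <task_id>
```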

docs/core-concepts/Testing.md (new file, 41 lines)

@@ -0,0 +1,41 @@
---
title: crewAI Testing
description: Learn how to test your crewAI Crew and evaluate their performance.
---

## Introduction

Testing is a crucial part of the development process, and it is essential to ensure that your crew is performing as expected. With crewAI, you can easily test your crew and evaluate its performance using the built-in testing capabilities.

### Using the Testing Feature

We added the CLI command `crewai test` to make it easy to test your crew. This command runs your crew for a specified number of iterations and provides detailed performance metrics.

The parameters are `n_iterations` and `model`, both optional, defaulting to 3 and `gpt-4o-mini` respectively (matching the CLI options shown later in this diff). For now, the only provider available is OpenAI.

```bash
crewai test
```

If you want to run more iterations or use a different model, you can specify the parameters like this:

```bash
crewai test --n_iterations 5 --model gpt-4o
```
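The options also have short flags, `-n` and `-m`, defined on the CLI command (see the `cli.py` hunk later in this diff), so the run above can equivalently be written as:

```bash
crewai test -n 5 -m gpt-4o
```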

When you run the `crewai test` command, the crew is executed for the specified number of iterations and the performance metrics are displayed at the end of the run.

A table of scores at the end shows the performance of the crew:

```
Task Scores
(1-10 Higher is better)
┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
│ Task 1     │ 10.0  │ 9.0   │ 9.5        │
│ Task 2     │ 9.0   │ 9.0   │ 9.0        │
│ Crew       │ 9.5   │ 9.0   │ 9.2        │
└────────────┴───────┴───────┴────────────┘
```

The example above shows the test results for two runs of the crew with two tasks, with the average total score for each task and the crew as a whole.
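The same evaluation can also be triggered programmatically through the `Crew.test()` method added in this release. A minimal sketch, assuming your project exposes a `YourCrewName_Crew` class as in the replay example below (the import path is hypothetical):

```python
from my_project.crew import YourCrewName_Crew  # hypothetical import path for your generated project


def run_test():
    """Kick the crew off twice and print the evaluation table."""
    inputs = {"topic": "CrewAI Training"}  # optional inputs forwarded to each kickoff
    try:
        YourCrewName_Crew().crew().test(
            n_iterations=2, openai_model_name="gpt-4o-mini", inputs=inputs
        )
    except Exception as e:
        raise Exception(f"An error occurred while testing the crew: {e}")
```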

@@ -36,14 +36,14 @@ To replay from a task programmatically, use the following steps:

2. Execute the replay command within a try-except block to handle potential errors.

```python
-def replay_from_task():
+def replay():
    """
    Replay the crew execution from a specific task.
    """
    task_id = '<task_id>'
    inputs = {"topic": "CrewAI Training"}  # optional: inputs to use for the replay; otherwise the previous kickoff's inputs are used
    try:
-        YourCrewName_Crew().crew().replay_from_task(task_id=task_id, inputs=inputs)
+        YourCrewName_Crew().crew().replay(task_id=task_id, inputs=inputs)

    except Exception as e:
        raise Exception(f"An error occurred while replaying the crew: {e}")

@@ -129,6 +129,7 @@ nav:
- Training: 'core-concepts/Training-Crew.md'
- Memory: 'core-concepts/Memory.md'
- Planning: 'core-concepts/Planning.md'
+- Testing: 'core-concepts/Testing.md'
- Using LangChain Tools: 'core-concepts/Using-LangChain-Tools.md'
- Using LlamaIndex Tools: 'core-concepts/Using-LlamaIndex-Tools.md'
- How to Guides:

poetry.lock (generated, 688 lines changed): diff suppressed because it is too large.

@@ -1,6 +1,6 @@
[tool.poetry]
name = "crewai"
-version = "0.41.0"
+version = "0.41.1"
description = "Cutting-edge framework for orchestrating role-playing, autonomous AI agents. By fostering collaborative intelligence, CrewAI empowers agents to work together seamlessly, tackling complex tasks."
authors = ["Joao Moura <joao@crewai.com>"]
readme = "README.md"

@@ -5,11 +5,11 @@ from crewai.memory.storage.kickoff_task_outputs_storage import (
    KickoffTaskOutputsSQLiteStorage,
)

from .create_crew import create_crew
-from .train_crew import train_crew
from .replay_from_task import replay_task_command
from .reset_memories_command import reset_memories_command
+from .test_crew import test_crew
+from .train_crew import train_crew


@click.group()

@@ -126,5 +126,26 @@ def reset_memories(long, short, entities, kickoff_outputs, all):
        click.echo(f"An error occurred while resetting memories: {e}", err=True)


@crewai.command()
@click.option(
    "-n",
    "--n_iterations",
    type=int,
    default=3,
    help="Number of iterations to Test the crew",
)
@click.option(
    "-m",
    "--model",
    type=str,
    default="gpt-4o-mini",
    help="LLM Model to run the tests on the Crew. For now only accepting only OpenAI models.",
)
def test(n_iterations: int, model: str):
    """Test the crew and evaluate the results."""
    click.echo(f"Testing the crew for {n_iterations} iterations with model {model}")
    test_crew(n_iterations, model)


if __name__ == "__main__":
    crewai()
@@ -39,3 +39,16 @@ def replay():

    except Exception as e:
        raise Exception(f"An error occurred while replaying the crew: {e}")


def test():
    """
    Test the crew execution and returns the results.
    """
    inputs = {
        "topic": "AI LLMs"
    }
    try:
        {{crew_name}}Crew().crew().test(n_iterations=int(sys.argv[1]), openai_model_name=sys.argv[2], inputs=inputs)

    except Exception as e:
        raise Exception(f"An error occurred while replaying the crew: {e}")

@@ -6,12 +6,13 @@ authors = ["Your Name <you@example.com>"]

[tool.poetry.dependencies]
python = ">=3.10,<=3.13"
-crewai = { extras = ["tools"], version = "^0.41.0" }
+crewai = { extras = ["tools"], version = "^0.41.1" }

[tool.poetry.scripts]
{{folder_name}} = "{{folder_name}}.main:run"
train = "{{folder_name}}.main:train"
replay = "{{folder_name}}.main:replay"
+test = "{{folder_name}}.main:test"

[build-system]
requires = ["poetry-core"]

src/crewai/cli/test_crew.py (new file, 30 lines)

@@ -0,0 +1,30 @@
import subprocess

import click


def test_crew(n_iterations: int, model: str) -> None:
    """
    Test the crew by running a command in the Poetry environment.

    Args:
        n_iterations (int): The number of iterations to test the crew.
        model (str): The model to test the crew with.
    """
    command = ["poetry", "run", "test", str(n_iterations), model]

    try:
        if n_iterations <= 0:
            raise ValueError("The number of iterations must be a positive integer.")

        result = subprocess.run(command, capture_output=False, text=True, check=True)

        if result.stderr:
            click.echo(result.stderr, err=True)

    except subprocess.CalledProcessError as e:
        click.echo(f"An error occurred while testing the crew: {e}", err=True)
        click.echo(e.output, err=True)

    except Exception as e:
        click.echo(f"An unexpected error occurred: {e}", err=True)

@@ -37,6 +37,7 @@ from crewai.utilities.constants import (
    TRAINED_AGENTS_DATA_FILE,
    TRAINING_DATA_FILE,
)
+from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
from crewai.utilities.evaluators.task_evaluator import TaskEvaluator
from crewai.utilities.formatter import (
    aggregate_raw_outputs_from_task_outputs,

@@ -561,8 +562,13 @@ class Crew(BaseModel):
        self._logger.log("info", "Planning the crew execution")
        result = CrewPlanner(self.tasks)._handle_crew_planning()

-        for task, step_plan in zip(self.tasks, result.list_of_plans_per_task):
-            task.description += step_plan
+        if result is not None and hasattr(result, "list_of_plans_per_task"):
+            for task, step_plan in zip(self.tasks, result.list_of_plans_per_task):
+                task.description += step_plan
+        else:
+            self._logger.log(
+                "info", "Something went wrong with the planning process of the Crew"
+            )

    def _store_execution_log(
        self,

@@ -961,5 +967,20 @@ class Crew(BaseModel):

        return total_usage_metrics

    def test(
        self,
        n_iterations: int,
        openai_model_name: str,
        inputs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Test and evaluate the Crew with the given inputs for n iterations."""
        evaluator = CrewEvaluator(self, openai_model_name)

        for i in range(1, n_iterations + 1):
            evaluator.set_iteration(i)
            self.kickoff(inputs=inputs)

        evaluator.print_crew_evaluation_result()

    def __repr__(self):
        return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"

src/crewai/utilities/evaluators/crew_evaluator_handler.py (new file, 149 lines)

@@ -0,0 +1,149 @@
from collections import defaultdict

from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table

from crewai.agent import Agent
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput


class TaskEvaluationPydanticOutput(BaseModel):
    quality: float = Field(
        description="A score from 1 to 10 evaluating on completion, quality, and overall performance from the task_description and task_expected_output to the actual Task Output."
    )


class CrewEvaluator:
    """
    A class to evaluate the performance of the agents in the crew based on the tasks they have performed.

    Attributes:
        crew (Crew): The crew of agents to evaluate.
        openai_model_name (str): The model to use for evaluating the performance of the agents (for now ONLY OpenAI accepted).
        tasks_scores (defaultdict): A dictionary to store the scores of the agents for each task.
        iteration (int): The current iteration of the evaluation.
    """

    tasks_scores: defaultdict = defaultdict(list)
    iteration: int = 0

    def __init__(self, crew, openai_model_name: str):
        self.crew = crew
        self.openai_model_name = openai_model_name
        self._setup_for_evaluating()

    def _setup_for_evaluating(self) -> None:
        """Sets up the crew for evaluating."""
        for task in self.crew.tasks:
            task.callback = self.evaluate

    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration

    def _evaluator_agent(self):
        return Agent(
            role="Task Execution Evaluator",
            goal=(
                "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
            ),
            backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
            verbose=False,
            llm=ChatOpenAI(model=self.openai_model_name),
        )

    def _evaluation_task(
        self, evaluator_agent: Agent, task_to_evaluate: Task, task_output: str
    ) -> Task:
        return Task(
            description=(
                "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
                f"task_description: {task_to_evaluate.description} "
                f"task_expected_output: {task_to_evaluate.expected_output} "
                f"agent: {task_to_evaluate.agent.role if task_to_evaluate.agent else None} "
                f"agent_goal: {task_to_evaluate.agent.goal if task_to_evaluate.agent else None} "
                f"Task Output: {task_output}"
            ),
            expected_output="Evaluation Score from 1 to 10 based on the performance of the agents on the tasks",
            agent=evaluator_agent,
            output_pydantic=TaskEvaluationPydanticOutput,
        )

    def print_crew_evaluation_result(self) -> None:
        """
        Prints the evaluation result of the crew in a table.
        A Crew with 2 tasks using the command crewai test -n 2
        will output the following table:

                 Task Scores
            (1-10 Higher is better)
        ┏━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━┓
        ┃ Tasks/Crew ┃ Run 1 ┃ Run 2 ┃ Avg. Total ┃
        ┡━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━┩
        │ Task 1     │ 10.0  │ 9.0   │ 9.5        │
        │ Task 2     │ 9.0   │ 9.0   │ 9.0        │
        │ Crew       │ 9.5   │ 9.0   │ 9.2        │
        └────────────┴───────┴───────┴────────────┘
        """
        task_averages = [
            sum(scores) / len(scores) for scores in zip(*self.tasks_scores.values())
        ]
        crew_average = sum(task_averages) / len(task_averages)

        # Create a table
        table = Table(title="Tasks Scores \n (1-10 Higher is better)")

        # Add columns for the table
        table.add_column("Tasks/Crew")
        for run in range(1, len(self.tasks_scores) + 1):
            table.add_column(f"Run {run}")
        table.add_column("Avg. Total")

        # Add rows for each task
        for task_index in range(len(task_averages)):
            task_scores = [
                self.tasks_scores[run][task_index]
                for run in range(1, len(self.tasks_scores) + 1)
            ]
            avg_score = task_averages[task_index]
            table.add_row(
                f"Task {task_index + 1}", *map(str, task_scores), f"{avg_score:.1f}"
            )

        # Add a row for the crew average
        crew_scores = [
            sum(self.tasks_scores[run]) / len(self.tasks_scores[run])
            for run in range(1, len(self.tasks_scores) + 1)
        ]
        table.add_row("Crew", *map(str, crew_scores), f"{crew_average:.1f}")

        # Display the table in the terminal
        console = Console()
        console.print(table)

    def evaluate(self, task_output: TaskOutput):
        """Evaluates the performance of the agents in the crew based on the tasks they have performed."""
        current_task = None
        for task in self.crew.tasks:
            if task.description == task_output.description:
                current_task = task
                break

        if not current_task or not task_output:
            raise ValueError(
                "Task to evaluate and task output are required for evaluation"
            )

        evaluator_agent = self._evaluator_agent()
        evaluation_task = self._evaluation_task(
            evaluator_agent, current_task, task_output.raw
        )

        evaluation_result = evaluation_task.execute_sync()

        if isinstance(evaluation_result.pydantic, TaskEvaluationPydanticOutput):
            self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
        else:
            raise ValueError("Evaluation result is not in the expected format")
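To make the scoring flow above concrete, here is an illustrative sketch (not part of the source) of the `tasks_scores` structure that `evaluate()` builds up and `print_crew_evaluation_result()` renders, using the numbers from the docstring table:

```python
# Keys are iteration numbers (set via set_iteration); values are the per-task
# quality scores appended by evaluate(), in task order.
tasks_scores = {
    1: [10.0, 9.0],  # Run 1: Task 1, Task 2
    2: [9.0, 9.0],   # Run 2: Task 1, Task 2
}
# Column-wise averages give each task's "Avg. Total" (9.5 and 9.0), and the mean of
# those task averages is the Crew total (9.25, rendered as 9.2 by the ":.1f" format).
```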

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional

from pydantic import BaseModel

@@ -14,14 +14,14 @@ class CrewPlanner:
    def __init__(self, tasks: List[Task]):
        self.tasks = tasks

-    def _handle_crew_planning(self):
+    def _handle_crew_planning(self) -> Optional[BaseModel]:
        """Handles the Crew planning by creating detailed step-by-step plans for each task."""
        planning_agent = self._create_planning_agent()
        tasks_summary = self._create_tasks_summary()

        planner_task = self._create_planner_task(planning_agent, tasks_summary)

-        return planner_task.execute_sync()
+        return planner_task.execute_sync().pydantic

    def _create_planning_agent(self) -> Agent:
        """Creates the planning agent for the crew planning."""

tests/cassettes/test_replay_setup_context.yaml (new file, 163 lines)
@@ -0,0 +1,163 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages": [{"content": "You are test_agent. Test Description\nYour personal
|
||||
goal is: Test GoalTo give my best complete final answer to the task use the
|
||||
exact following format:\n\nThought: I now can give a great answer\nFinal Answer:
|
||||
my best complete final answer to the task.\nYour final answer must be the great
|
||||
and the most complete as possible, it must be outcome described.\n\nI MUST use
|
||||
these formats, my job depends on it!\nCurrent Task: Test Task\n\nThis is the
|
||||
expect criteria for your final answer: Say Hi to John \n you MUST return the
|
||||
actual complete content as the final answer, not a summary.\n\nThis is the context
|
||||
you''re working with:\ncontext raw output\n\nBegin! This is VERY important to
|
||||
you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:\n", "role": "user"}], "model": "gpt-4o", "logprobs": false,
|
||||
"n": 1, "stop": ["\nObservation"], "stream": true, "temperature": 0.7}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '937'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.36.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.36.0
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: 'data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"Thought"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":":"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
I"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
now"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
can"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
give"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
a"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
great"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
answer"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"\n"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"Final"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
Answer"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":":"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
Hi"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{"content":"
|
||||
John"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wQ7bzZKcXAmiNgs4nn5Of0EFiM","object":"chat.completion.chunk","created":1721491782,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_400f27fa1f","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
|
||||
|
||||
|
||||
data: [DONE]
|
||||
|
||||
|
||||
'
|
||||
headers:
|
||||
CF-Cache-Status:
|
||||
- DYNAMIC
|
||||
CF-RAY:
|
||||
- 8a643794fe0341e9-EWR
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- text/event-stream; charset=utf-8
|
||||
Date:
|
||||
- Sat, 20 Jul 2024 16:09:42 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=7kfE3khl2E.6zM44yel5nToHzdtz0QeQ4wkLuGYyqSs-1721491782-1.0.1.1-XUb95eXTriHvSUSCH.TCyAmCGCbPK6L7p_tRTDBon8Fo6ns8TDbDoDGA.wVCFI4MTXSxkqrjD0GpYDj4GBTeSQ;
|
||||
path=/; expires=Sat, 20-Jul-24 16:39:42 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=iN41lAEk.DjpRMAtG.K0NEvIN0xB9eS0CUCU2iWmjv4-1721491782137-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '104'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=15552000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '30000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '29999791'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_4d90924dd28a0fb48c857f03515f0ca8
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||

tests/cassettes/test_replay_with_context.yaml (new file, 159 lines)
@@ -0,0 +1,159 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages": [{"content": "You are test_agent. Test Description\nYour personal
|
||||
goal is: Test GoalTo give my best complete final answer to the task use the
|
||||
exact following format:\n\nThought: I now can give a great answer\nFinal Answer:
|
||||
my best complete final answer to the task.\nYour final answer must be the great
|
||||
and the most complete as possible, it must be outcome described.\n\nI MUST use
|
||||
these formats, my job depends on it!\nCurrent Task: Test Task\n\nThis is the
|
||||
expect criteria for your final answer: Say Hi \n you MUST return the actual
|
||||
complete content as the final answer, not a summary.\n\nThis is the context
|
||||
you''re working with:\ncontext raw output\n\nBegin! This is VERY important to
|
||||
you, use the tools available and give your best Final Answer, your job depends
|
||||
on it!\n\nThought:\n", "role": "user"}], "model": "gpt-4o", "logprobs": false,
|
||||
"n": 1, "stop": ["\nObservation"], "stream": true, "temperature": 0.7}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '929'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.36.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.36.0
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.7
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: 'data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"Thought"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":":"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
I"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
now"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
can"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
give"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
a"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
great"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
answer"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"\n"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"Final"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
Answer"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":":"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{"content":"
|
||||
Hi"},"logprobs":null,"finish_reason":null}]}
|
||||
|
||||
|
||||
data: {"id":"chatcmpl-9n6wPAClsh4tUGoLYKLh3VoX1vlAx","object":"chat.completion.chunk","created":1721491781,"model":"gpt-4o-2024-05-13","system_fingerprint":"fp_c4e5b6fa31","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}
|
||||
|
||||
|
||||
data: [DONE]
|
||||
|
||||
|
||||
'
|
||||
headers:
|
||||
CF-Cache-Status:
|
||||
- DYNAMIC
|
||||
CF-RAY:
|
||||
- 8a643791a80e8c96-EWR
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- text/event-stream; charset=utf-8
|
||||
Date:
|
||||
- Sat, 20 Jul 2024 16:09:41 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=cam5sECdaTzbttLIOaiuvh9flDIAXp_FLPODnDEOn6k-1721491781-1.0.1.1-hyFl43P7HIWZsGueyWuDeO579sZ41as2mvrM.cQS1E8KSLG2ZZ0DxDGbVvHYRO0eflTUJohgZu6CGltvjQfMtQ;
|
||||
path=/; expires=Sat, 20-Jul-24 16:39:41 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=nmlgS.bqXAu0rZ.OlHPfXrIrdnVgrBSW3e0UuU3N5ng-1721491781661-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '126'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=15552000; includeSubDomains; preload
|
||||
x-ratelimit-limit-requests:
|
||||
- '10000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '30000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '9999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '29999794'
|
||||
x-ratelimit-reset-requests:
|
||||
- 6ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_31484eeb0af939af4e0d9c47441ba2db
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||

@@ -3,7 +3,7 @@ from unittest import mock
import pytest
from click.testing import CliRunner

-from crewai.cli.cli import train, version, reset_memories
+from crewai.cli.cli import reset_memories, test, train, version


@pytest.fixture

@@ -133,3 +133,33 @@ def test_version_command_with_tools(runner):
        "crewai tools version:" in result.output
        or "crewai tools not installed" in result.output
    )


@mock.patch("crewai.cli.cli.test_crew")
def test_test_default_iterations(test_crew, runner):
    result = runner.invoke(test)

    test_crew.assert_called_once_with(3, "gpt-4o-mini")
    assert result.exit_code == 0
    assert "Testing the crew for 3 iterations with model gpt-4o-mini" in result.output


@mock.patch("crewai.cli.cli.test_crew")
def test_test_custom_iterations(test_crew, runner):
    result = runner.invoke(test, ["--n_iterations", "5", "--model", "gpt-4o"])

    test_crew.assert_called_once_with(5, "gpt-4o")
    assert result.exit_code == 0
    assert "Testing the crew for 5 iterations with model gpt-4o" in result.output


@mock.patch("crewai.cli.cli.test_crew")
def test_test_invalid_string_iterations(test_crew, runner):
    result = runner.invoke(test, ["--n_iterations", "invalid"])

    test_crew.assert_not_called()
    assert result.exit_code == 2
    assert (
        "Usage: test [OPTIONS]\nTry 'test --help' for help.\n\nError: Invalid value for '-n' / '--n_iterations': 'invalid' is not a valid integer.\n"
        in result.output
    )

tests/cli/test_crew_test.py (new file, 97 lines)

@@ -0,0 +1,97 @@
import subprocess
from unittest import mock

import pytest

from crewai.cli import test_crew


@pytest.mark.parametrize(
    "n_iterations,model",
    [
        (1, "gpt-4o"),
        (5, "gpt-3.5-turbo"),
        (10, "gpt-4"),
    ],
)
@mock.patch("crewai.cli.test_crew.subprocess.run")
def test_crew_success(mock_subprocess_run, n_iterations, model):
    """Test the crew function for successful execution."""
    mock_subprocess_run.return_value = subprocess.CompletedProcess(
        args=f"poetry run test {n_iterations} {model}", returncode=0
    )
    result = test_crew.test_crew(n_iterations, model)

    mock_subprocess_run.assert_called_once_with(
        ["poetry", "run", "test", str(n_iterations), model],
        capture_output=False,
        text=True,
        check=True,
    )
    assert result is None


@mock.patch("crewai.cli.test_crew.click")
def test_test_crew_zero_iterations(click):
    test_crew.test_crew(0, "gpt-4o")
    click.echo.assert_called_once_with(
        "An unexpected error occurred: The number of iterations must be a positive integer.",
        err=True,
    )


@mock.patch("crewai.cli.test_crew.click")
def test_test_crew_negative_iterations(click):
    test_crew.test_crew(-2, "gpt-4o")
    click.echo.assert_called_once_with(
        "An unexpected error occurred: The number of iterations must be a positive integer.",
        err=True,
    )


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.test_crew.subprocess.run")
def test_test_crew_called_process_error(mock_subprocess_run, click):
    n_iterations = 5
    mock_subprocess_run.side_effect = subprocess.CalledProcessError(
        returncode=1,
        cmd=["poetry", "run", "test", str(n_iterations), "gpt-4o"],
        output="Error",
        stderr="Some error occurred",
    )
    test_crew.test_crew(n_iterations, "gpt-4o")

    mock_subprocess_run.assert_called_once_with(
        ["poetry", "run", "test", "5", "gpt-4o"],
        capture_output=False,
        text=True,
        check=True,
    )
    click.echo.assert_has_calls(
        [
            mock.call.echo(
                "An error occurred while testing the crew: Command '['poetry', 'run', 'test', '5', 'gpt-4o']' returned non-zero exit status 1.",
                err=True,
            ),
            mock.call.echo("Error", err=True),
        ]
    )


@mock.patch("crewai.cli.test_crew.click")
@mock.patch("crewai.cli.test_crew.subprocess.run")
def test_test_crew_unexpected_exception(mock_subprocess_run, click):
    # Arrange
    n_iterations = 5
    mock_subprocess_run.side_effect = Exception("Unexpected error")
    test_crew.test_crew(n_iterations, "gpt-4o")

    mock_subprocess_run.assert_called_once_with(
        ["poetry", "run", "test", "5", "gpt-4o"],
        capture_output=False,
        text=True,
        check=True,
    )
    click.echo.assert_called_once_with(
        "An unexpected error occurred: Unexpected error", err=True
    )

@@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch

import pydantic_core
import pytest

from crewai.agent import Agent
from crewai.agents.cache import CacheHandler
from crewai.crew import Crew

@@ -1917,13 +1918,13 @@ def test_replay_feature():
    )

        crew.kickoff()
-        crew.replay_from_task(str(write.id))
+        crew.replay(str(write.id))
        # Ensure context was passed correctly
        assert mock_execute_task.call_count == 3


@pytest.mark.vcr(filter_headers=["authorization"])
-def test_crew_replay_from_task_error():
+def test_crew_replay_error():
    task = Task(
        description="Come up with a list of 5 interesting ideas to explore for an article",
        expected_output="5 bullet points with a paragraph for each idea.",

@@ -1936,7 +1937,7 @@ def test_crew_replay_from_task_error():
    )

    with pytest.raises(TypeError) as e:
-        crew.replay_from_task()  # type: ignore purposefully throwing err
+        crew.replay()  # type: ignore purposefully throwing err
        assert "task_id is required" in str(e)

@@ -2071,14 +2072,14 @@ def test_replay_task_with_context():
    with patch.object(Task, "execute_sync") as mock_replay_task:
        mock_replay_task.return_value = mock_task_output4

-        replayed_output = crew.replay_from_task(str(task4.id))
+        replayed_output = crew.replay(str(task4.id))
        assert replayed_output.raw == "Presentation on AI advancements..."

    db_handler.reset()


@pytest.mark.vcr(filter_headers=["authorization"])
-def test_replay_from_task_with_context():
+def test_replay_with_context():
    agent = Agent(role="test_agent", backstory="Test Description", goal="Test Goal")
    task1 = Task(
        description="Context Task", expected_output="Say Task Output", agent=agent

@@ -2130,7 +2131,7 @@ def test_replay_from_task_with_context():
            },
        ],
    ):
-        crew.replay_from_task(str(task2.id))
+        crew.replay(str(task2.id))

    assert crew.tasks[1].context[0].output.raw == "context raw output"

@@ -2192,7 +2193,7 @@ def test_replay_with_invalid_task_id():
        ValueError,
        match="Task with id bf5b09c9-69bd-4eb8-be12-f9e5bae31c2d not found in the crew's tasks.",
    ):
-        crew.replay_from_task("bf5b09c9-69bd-4eb8-be12-f9e5bae31c2d")
+        crew.replay("bf5b09c9-69bd-4eb8-be12-f9e5bae31c2d")


@pytest.mark.vcr(filter_headers=["authorization"])

@@ -2251,13 +2252,13 @@ def test_replay_interpolates_inputs_properly(mock_interpolate_inputs):
            },
        ],
    ):
-        crew.replay_from_task(str(task2.id))
+        crew.replay(str(task2.id))
        assert crew._inputs == {"name": "John"}
        assert mock_interpolate_inputs.call_count == 2


@pytest.mark.vcr(filter_headers=["authorization"])
-def test_replay_from_task_setup_context():
+def test_replay_setup_context():
    agent = Agent(role="test_agent", backstory="Test Description", goal="Test Goal")
    task1 = Task(description="Context Task", expected_output="Say {name}", agent=agent)
    task2 = Task(

@@ -2306,7 +2307,7 @@ def test_replay_from_task_setup_context():
            },
        ],
    ):
-        crew.replay_from_task(str(task2.id))
+        crew.replay(str(task2.id))

    # Check if the first task's output was set correctly
    assert crew.tasks[0].output is not None

@@ -2499,3 +2500,34 @@ def test_conditional_should_execute():
    assert condition_mock.call_count == 1
    assert condition_mock() is True
    assert mock_execute_sync.call_count == 2


@mock.patch("crewai.crew.CrewEvaluator")
@mock.patch("crewai.crew.Crew.kickoff")
def test_crew_testing_function(mock_kickoff, crew_evaluator):
    task = Task(
        description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
        expected_output="5 bullet points with a paragraph for each idea.",
        agent=researcher,
    )

    crew = Crew(
        agents=[researcher],
        tasks=[task],
    )
    n_iterations = 2
    crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})

    assert len(mock_kickoff.mock_calls) == n_iterations
    mock_kickoff.assert_has_calls(
        [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
    )

    crew_evaluator.assert_has_calls(
        [
            mock.call(crew, "gpt-4o-mini"),
            mock.call().set_iteration(1),
            mock.call().set_iteration(2),
            mock.call().print_crew_evaluation_result(),
        ]
    )

tests/utilities/evaluators/test_crew_evaluator_handler.py (new file, 113 lines)

@@ -0,0 +1,113 @@
from unittest import mock

import pytest

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
from crewai.utilities.evaluators.crew_evaluator_handler import (
    CrewEvaluator,
    TaskEvaluationPydanticOutput,
)


class TestCrewEvaluator:
    @pytest.fixture
    def crew_planner(self):
        agent = Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1")
        task = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=agent,
        )
        crew = Crew(agents=[agent], tasks=[task])

        return CrewEvaluator(crew, openai_model_name="gpt-4o-mini")

    def test_setup_for_evaluating(self, crew_planner):
        crew_planner._setup_for_evaluating()
        assert crew_planner.crew.tasks[0].callback == crew_planner.evaluate

    def test_set_iteration(self, crew_planner):
        crew_planner.set_iteration(1)
        assert crew_planner.iteration == 1

    def test_evaluator_agent(self, crew_planner):
        agent = crew_planner._evaluator_agent()
        assert agent.role == "Task Execution Evaluator"
        assert (
            agent.goal
            == "Your goal is to evaluate the performance of the agents in the crew based on the tasks they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
        )
        assert (
            agent.backstory
            == "Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed"
        )
        assert agent.verbose is False
        assert agent.llm.model_name == "gpt-4o-mini"

    def test_evaluation_task(self, crew_planner):
        evaluator_agent = Agent(
            role="Evaluator Agent",
            goal="Evaluate the performance of the agents in the crew",
            backstory="Master in Evaluation",
        )
        task_to_evaluate = Task(
            description="Task 1",
            expected_output="Output 1",
            agent=Agent(role="Agent 1", goal="Goal 1", backstory="Backstory 1"),
        )
        task_output = "Task Output 1"
        task = crew_planner._evaluation_task(
            evaluator_agent, task_to_evaluate, task_output
        )

        assert task.description.startswith(
            "Based on the task description and the expected output, compare and evaluate the performance of the agents in the crew based on the Task Output they have performed using score from 1 to 10 evaluating on completion, quality, and overall performance."
        )

        assert task.agent == evaluator_agent
        assert (
            task.description
            == "Based on the task description and the expected output, compare and evaluate "
            "the performance of the agents in the crew based on the Task Output they have "
            "performed using score from 1 to 10 evaluating on completion, quality, and overall "
            "performance.task_description: Task 1 task_expected_output: Output 1 "
            "agent: Agent 1 agent_goal: Goal 1 Task Output: Task Output 1"
        )

    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Console")
    @mock.patch("crewai.utilities.evaluators.crew_evaluator_handler.Table")
    def test_print_crew_evaluation_result(self, table, console, crew_planner):
        crew_planner.tasks_scores = {
            1: [10, 9, 8],
            2: [9, 8, 7],
        }

        crew_planner.print_crew_evaluation_result()

        table.assert_has_calls(
            [
                mock.call(title="Tasks Scores \n (1-10 Higher is better)"),
                mock.call().add_column("Tasks/Crew"),
                mock.call().add_column("Run 1"),
                mock.call().add_column("Run 2"),
                mock.call().add_column("Avg. Total"),
                mock.call().add_row("Task 1", "10", "9", "9.5"),
                mock.call().add_row("Task 2", "9", "8", "8.5"),
                mock.call().add_row("Task 3", "8", "7", "7.5"),
                mock.call().add_row("Crew", "9.0", "8.0", "8.5"),
            ]
        )
        console.assert_has_calls([mock.call(), mock.call().print(table())])

    def test_evaluate(self, crew_planner):
        task_output = TaskOutput(
            description="Task 1", agent=str(crew_planner.crew.agents[0])
        )

        with mock.patch.object(Task, "execute_sync") as execute:
            execute().pydantic = TaskEvaluationPydanticOutput(quality=9.5)
            crew_planner.evaluate(task_output)
            assert crew_planner.tasks_scores[0] == [9.5]

@@ -1,4 +1,5 @@
from unittest.mock import patch
+from crewai.tasks.task_output import TaskOutput

import pytest

@@ -31,10 +32,15 @@ class TestCrewPlanner:

    def test_handle_crew_planning(self, crew_planner):
        with patch.object(Task, "execute_sync") as execute:
-            execute.return_value = PlannerTaskPydanticOutput(
-                list_of_plans_per_task=["Plan 1", "Plan 2", "Plan 3"]
+            execute.return_value = TaskOutput(
+                description="Description",
+                agent="agent",
+                pydantic=PlannerTaskPydanticOutput(
+                    list_of_plans_per_task=["Plan 1", "Plan 2", "Plan 3"]
+                ),
            )
            result = crew_planner._handle_crew_planning()

            assert isinstance(result, PlannerTaskPydanticOutput)
            assert len(result.list_of_plans_per_task) == len(crew_planner.tasks)
            execute.assert_called_once()