Compare commits


15 Commits

Author SHA1 Message Date
Devin AI
10af8e35fd fix: sort imports using ruff --fix
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 23:06:51 +00:00
Devin AI
e343017414 fix: update test assertions and sort imports
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 23:05:31 +00:00
Devin AI
dad121a692 fix: sort imports in test_custom_llm_support.py
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 23:03:57 +00:00
Devin AI
8dc07febc7 fix: sort imports and remove duplicate assertions
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 23:01:20 +00:00
Devin AI
09240a7b62 fix: remove duplicate assertions in test_crew_testing_function
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:59:56 +00:00
Devin AI
0423dd8134 fix: improve error handling and import order
- Add better error handling in _get_llm_instance
- Fix import order in test_custom_llm_support.py

Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:57:14 +00:00
Devin AI
f4efdc55e2 refactor: implement review suggestions
- Extract model conversion logic to _get_llm_instance helper method
- Improve error message clarity
- Simplify LLM instance creation in CrewEvaluator

Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:52:13 +00:00
Devin AI
0ab66041da fix: address type-checker and lint issues
- Add proper type hints in Crew.test()
- Fix import sorting in test file

Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:50:06 +00:00
Devin AI
b8f2603bf3 test: add VCR cassettes for custom LLM support tests
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:47:50 +00:00
Devin AI
ff620f0ad6 test: switch to VCR for test recording
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:47:00 +00:00
Devin AI
1caf45ad9b test: improve test reliability by mocking LLM responses
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:44:50 +00:00
Devin AI
4a216d1f15 fix: use llm.model instead of openai_model_name in CrewEvaluator
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:43:56 +00:00
Devin AI
5f5a1b3687 fix: add expected_output field to Task in tests
Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:43:22 +00:00
Devin AI
f3704a44b3 test: add tests for custom LLM support in Crew.test() and CrewEvaluator
- Add tests for string model name input
- Add tests for LLM instance input
- Add tests for backward compatibility
- Add tests for error cases
- Add tests for CrewEvaluator LLM handling

Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:42:44 +00:00
Devin AI
9bd39464cc feat: enable custom LLM support for Crew.test()
- Add llm parameter to Crew.test() that accepts string or LLM instance
- Maintain backward compatibility with openai_model_name parameter
- Update CrewEvaluator to handle any LLM implementation
- Improve docstrings and type hints

Fixes #2080

Co-Authored-By: Joe Moura <joao@crewai.com>
2025-02-09 22:42:05 +00:00
5 changed files with 1073 additions and 22 deletions
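Taken together, these commits let Crew.test() accept either a model-name string or an LLM instance through the new llm parameter, while the deprecated openai_model_name path keeps working. A minimal usage sketch of the resulting API, assuming a configured crew (the agent and task values below are placeholders, not taken from this PR):

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task

# Placeholder crew setup; any valid agent/task definitions would do.
agent = Agent(role="writer", goal="write", backstory="test", llm=LLM(model="gpt-4"))
task = Task(description="write one paragraph", expected_output="a paragraph", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

crew.test(n_iterations=2, llm="gpt-4")                # string model name
crew.test(n_iterations=2, llm=LLM(model="gpt-4"))     # any LLM instance
crew.test(n_iterations=2, openai_model_name="gpt-4")  # deprecated, still accepted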

View File: src/crewai/crew.py

@@ -1147,20 +1147,32 @@ class Crew(BaseModel):
     def test(
         self,
-        n_iterations: int,
+        n_iterations: int = 1,
         openai_model_name: Optional[str] = None,
+        llm: Optional[Union[str, LLM]] = None,
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        Args:
+            n_iterations: Number of iterations to run the test
+            openai_model_name: OpenAI model name to use for evaluation (deprecated)
+            llm: LLM instance or model name to use for evaluation
+            inputs: Optional dictionary of inputs to pass to the crew
+        """
+        if not llm and not openai_model_name:
+            raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
+
+        model_to_use = self._get_llm_instance(llm, openai_model_name)
         test_crew = self.copy()
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
-        )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+            str(model_to_use.model),
+        )
+        evaluator = CrewEvaluator(test_crew, model_to_use)

         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
@@ -1168,6 +1180,28 @@ class Crew(BaseModel):
         evaluator.print_crew_evaluation_result()

+    def _get_llm_instance(self, llm: Optional[Union[str, LLM]], openai_model_name: Optional[str]) -> LLM:
+        """Get an LLM instance from either llm or openai_model_name parameter.
+
+        Args:
+            llm: LLM instance or model name
+            openai_model_name: OpenAI model name (deprecated)
+
+        Returns:
+            LLM instance
+
+        Raises:
+            ValueError: If neither llm nor openai_model_name is provided
+        """
+        model = llm if llm is not None else openai_model_name
+        if model is None:
+            raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
+        if isinstance(model, str):
+            return LLM(model=model)
+        if not isinstance(model, LLM):
+            raise ValueError("Model must be either a string or an LLM instance")
+        return model
+
     def __repr__(self):
         return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"

View File: src/crewai/utilities/evaluators/crew_evaluator_handler.py

@@ -1,4 +1,5 @@
 from collections import defaultdict
+from typing import Union

 from pydantic import BaseModel, Field
 from rich.box import HEAVY_EDGE
@@ -6,6 +7,7 @@ from rich.console import Console
 from rich.table import Table

 from crewai.agent import Agent
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
@@ -32,9 +34,9 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0

-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, LLM]):
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = LLM(model=llm) if isinstance(llm, str) else llm
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
@@ -51,7 +53,7 @@
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )

     def _evaluation_task(
@@ -181,7 +183,7 @@
             self.crew,
             evaluation_result.pydantic.quality,
             current_task.execution_duration,
-            self.openai_model_name,
+            self.llm.model,
         )
         self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
         self.run_execution_times[self.iteration].append(
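
Because __init__ now normalizes its argument, downstream code (the evaluator agent and the telemetry call) can rely on self.llm always being an LLM and read the model name from self.llm.model. A short sketch of that rule, again assuming a constructed crew:

from crewai.llm import LLM
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

evaluator = CrewEvaluator(crew, "gpt-4")             # str is wrapped: LLM(model="gpt-4")
assert isinstance(evaluator.llm, LLM)
assert evaluator.llm.model == "gpt-4"                # what telemetry now reports

evaluator = CrewEvaluator(crew, LLM(model="gpt-4"))  # instances pass through untouched
assert evaluator.llm.model == "gpt-4"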

View File (existing crew test suite)

@@ -3306,8 +3306,7 @@ def test_conditional_should_execute():
 @mock.patch("crewai.crew.CrewEvaluator")
 @mock.patch("crewai.crew.Crew.copy")
-@mock.patch("crewai.crew.Crew.kickoff")
-def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
+def test_crew_testing_function(copy_mock, crew_evaluator_mock):
     task = Task(
         description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
         expected_output="5 bullet points with a paragraph for each idea.",
@@ -3319,25 +3318,28 @@
         tasks=[task],
     )

-    # Create a mock for the copied crew
-    copy_mock.return_value = crew
+    # Create a mock for the copied crew with a mock kickoff method
+    copied_crew = MagicMock()
+    copy_mock.return_value = copied_crew
+
+    # Create a mock for the CrewEvaluator instance
+    evaluator_instance = MagicMock()
+    crew_evaluator_mock.return_value = evaluator_instance

     n_iterations = 2
     crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})

     # Ensure kickoff is called on the copied crew
-    kickoff_mock.assert_has_calls(
+    copied_crew.kickoff.assert_has_calls(
         [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
     )

-    crew_evaluator.assert_has_calls(
-        [
-            mock.call(crew, "gpt-4o-mini"),
-            mock.call().set_iteration(1),
-            mock.call().set_iteration(2),
-            mock.call().print_crew_evaluation_result(),
-        ]
-    )
+    # Verify CrewEvaluator interactions
+    # We don't check the exact LLM object since it's created internally
+    assert len(crew_evaluator_mock.mock_calls) == 4
+    assert crew_evaluator_mock.mock_calls[1] == mock.call().set_iteration(1)
+    assert crew_evaluator_mock.mock_calls[2] == mock.call().set_iteration(2)
+    assert crew_evaluator_mock.mock_calls[3] == mock.call().print_crew_evaluation_result()

 @pytest.mark.vcr(filter_headers=["authorization"])
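
The rewritten assertions index into mock_calls instead of matching the constructor call, because the LLM argument is now built inside test() and has no stable object to compare against. A standalone sketch (names are illustrative) of how MagicMock records that four-call sequence:

from unittest import mock

m = mock.MagicMock()
instance = m("some-crew", "internally-built-llm")  # mock_calls[0]: the constructor call
instance.set_iteration(1)                          # mock_calls[1]
instance.set_iteration(2)                          # mock_calls[2]
instance.print_crew_evaluation_result()            # mock_calls[3]

assert len(m.mock_calls) == 4
assert m.mock_calls[1] == mock.call().set_iteration(1)
assert m.mock_calls[3] == mock.call().print_crew_evaluation_result()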

View File (new VCR cassette for the custom LLM support tests)

@@ -0,0 +1,942 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
goal is: test\nTo give my best complete final answer to the task respond using
the exact following format:\n\nThought: I now can give a great answer\nFinal
Answer: Your final answer must be the great and the most complete as possible,
it must be outcome described.\n\nI MUST use these formats, my job depends on
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
criteria for your final answer: test output\nyou MUST return the actual complete
content as the final answer, not a summary.\n\nBegin! This is VERY important
to you, use the tools available and give your best Final Answer, your job depends
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '780'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzAMqMWFX8hC1szIKxWNSyXm0SPFi\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141224,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"I am prepared to conduct the test efficiently.\\n\\nFinal
Answer: The test output that aligns with the given criteria is a detailed description
of the testing process, providing a thorough understanding for anyone reviewing
it. The output not only contains the raw data or results but also includes step-by-step
documentation of the process employed, thoughts and reasoning behind each step,
deviations if any from the original plan, and how these deviations impacted
the results. In addition, it captures any errors or unexpected occurrences during
the course of the test, and proposes possible explanations or solutions for
these. It is detailed yet comprehensible, catering to both technical and non-technical
audiences. It is a result of meticulous planning, diligent execution, and robust
post-test analysis, making it a complete content.\",\n \"refusal\": null\n
\ },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n
\ ],\n \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\":
151,\n \"total_tokens\": 300,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-RAY:
- 90f7662cab11ba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:09 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
path=/; expires=Sun, 09-Feb-25 23:17:09 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '4585'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999822'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 10ms
x-request-id:
- req_1ba81a80018602119b871a7a42d7becf
http_version: HTTP/1.1
status_code: 200
- request:
body: !!binary |
Ct8LCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkStgsKEgoQY3Jld2FpLnRl
bGVtZXRyeRL5AQoQT8uAOJ+suOhFs22RW56o6BII0Ob64+TP3XQqE0NyZXcgVGVzdCBFeGVjdXRp
b24wATk1IlKguqsiGEGGgWOguqsiGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
M2Q1MGJkYWItZDI1NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShEKCml0ZXJhdGlvbnMSAwoBMUoV
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQgzrB2KxaHe9FwPNktJHbFRIIT8gM
r7rSvJUqDENyZXcgQ3JlYXRlZDABOW4kaqC6qyIYQe+yeaC6qyIYShsKDmNyZXdhaV92ZXJzaW9u
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokM2Q1MGJkYWItZDI1
NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiN2E3NmZjNmYtZTI5YS00MDBlLWI0NGEt
NzAyMDNlMzg1Y2RmIiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
MTZiY2VlMGUiLCAiaWQiOiAiNTcwYmJlYjQtYzkzNi00NTNkLTg2MjktYzhjMDM0ODA5NDhjIiwg
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChAus8hZAJcezzXdP2XqhVyF
Egi1wnliqIdQdSoMVGFzayBDcmVhdGVkMAE5nneIoLqrIhhBKDyJoLqrIhhKLgoIY3Jld19rZXkS
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQzZDUwYmRh
Yi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiQ1NzBiYmViNC1jOTM2LTQ1M2QtODYy
OS1jOGMwMzQ4MDk0OGN6AhgBhQEAAQAA
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '1506'
Content-Type:
- application/x-protobuf
User-Agent:
- OTel-OTLP-Exporter-Python/1.27.0
method: POST
uri: https://telemetry.crewai.com:4319/v1/traces
response:
body:
string: "\n\0"
headers:
Content-Length:
- '2'
Content-Type:
- application/x-protobuf
Date:
- Sun, 09 Feb 2025 22:47:09 GMT
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
Evaluator agent for crew evaluation with precise capabilities to evaluate the
performance of the agents in the crew based on the tasks they have performed\nYour
personal goal is: Your goal is to evaluate the performance of the agents in
the crew based on the tasks they have performed using score from 1 to 10 evaluating
on completion, quality, and overall performance.\nTo give my best complete final
answer to the task respond using the exact following format:\n\nThought: I now
can give a great answer\nFinal Answer: Your final answer must be the great and
the most complete as possible, it must be outcome described.\n\nI MUST use these
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
Based on the task description and the expected output, compare and evaluate
the performance of the agents in the crew based on the Task Output they have
performed using score from 1 to 10 evaluating on completion, quality, and overall
performance.task_description: test task_expected_output: test output agent:
test agent_goal: test Task Output: The test output that aligns with the given
criteria is a detailed description of the testing process, providing a thorough
understanding for anyone reviewing it. The output not only contains the raw
data or results but also includes step-by-step documentation of the process
employed, thoughts and reasoning behind each step, deviations if any from the
original plan, and how these deviations impacted the results. In addition, it
captures any errors or unexpected occurrences during the course of the test,
and proposes possible explanations or solutions for these. It is detailed yet
comprehensible, catering to both technical and non-technical audiences. It is
a result of meticulous planning, diligent execution, and robust post-test analysis,
making it a complete content.\n\nThis is the expected criteria for your final
answer: Evaluation Score from 1 to 10 based on the performance of the agents
on the tasks\nyou MUST return the actual complete content as the final answer,
not a summary.\nEnsure your final answer contains only the content in the following
format: {\n \"quality\": float\n}\n\nEnsure the final output does not include
any code block markers like ```json or ```python.\n\nBegin! This is VERY important
to you, use the tools available and give your best Final Answer, your job depends
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '2523'
content-type:
- application/json
cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzAMvAOSw5847reo2vh61focjnyK2\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141229,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"Based on the given task output, I can
determine that the test agent has performed impressively well. Their work is
comprehensive, catering to both non-technical and technical audiences and includes
complete and detailed process documentation. Further, the way they detect and
elaborate deviances and errors shows their meticulousness and efficiency. Their
planning, execution, and analysis are sound.\\n\\nFinal Answer: The quality
of this task is admirable, paying attention to details and meticulously planning
and reasoning behind each step. Considering all these, on a scale from 1 to
10, I would rate the task performed by the test agent as follows:\\n\\n{\\n
\ \\\"quality\\\": 9.5\\n}\",\n \"refusal\": null\n },\n \"logprobs\":
null,\n \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
471,\n \"completion_tokens\": 133,\n \"total_tokens\": 604,\n \"prompt_tokens_details\":
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 90f7664a581eba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:14 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '4884'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999388'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 36ms
x-request-id:
- req_0335ff13c1777c1bcdbee89879bc132c
http_version: HTTP/1.1
status_code: 200
- request:
body: !!binary |
CvsNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0g0KEgoQY3Jld2FpLnRl
bGVtZXRyeRKZAgoQJQy0LAglHxA7Ok+n0Gmi9hIIybabc5KDQlkqG0NyZXcgSW5kaXZpZHVhbCBU
ZXN0IFJlc3VsdDABOQK5tem8qyIYQVTw0+m8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
d19pZBImCiQzZDUwYmRhYi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKEAoHcXVhbGl0eRIF
CgM5LjVKFwoJZXhlY190aW1lEgoKCDQuODA5MTc4ShUKCm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgB
hQEAAQAAEvkBChCnePSvJJg/cFeYF3HlEvyVEgh3YAewpHkssyoTQ3JldyBUZXN0IEV4ZWN1dGlv
bjABOeiBSOq8qyIYQQWTV+q8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAwLjFKLgoIY3Jl
d19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2
YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoKaXRlcmF0aW9ucxIDCgExShUK
Cm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgBhQEAAQAAEpIHChB0Gz3vlppAjams1hbMI/RQEggPCufR
e9thfSoMQ3JldyBDcmVhdGVkMAE5X3td6ryrIhhBWNhr6ryrIhhKGwoOY3Jld2FpX3ZlcnNpb24S
CQoHMC4xMDAuMUoaCg5weXRob25fdmVyc2lvbhIICgYzLjEyLjdKLgoIY3Jld19rZXkSIgogZmVi
MWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2YzYyNmEyZi05OGRl
LTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKHAoMY3Jld19wcm9jZXNzEgwKCnNlcXVlbnRpYWxKEQoL
Y3Jld19tZW1vcnkSAhAAShoKFGNyZXdfbnVtYmVyX29mX3Rhc2tzEgIYAUobChVjcmV3X251bWJl
cl9vZl9hZ2VudHMSAhgBSsUCCgtjcmV3X2FnZW50cxK1AgqyAlt7ImtleSI6ICI5NzZmOGY1MGFj
Y2ZlYmEyMjNlNDljNDJiMTZlOTllNiIsICJpZCI6ICJkYTA4M2Q5ZS0xOWU5LTQyMzAtYjZmNC0y
NjlhNzM1NzViOWQiLCAicm9sZSI6ICJ0ZXN0IiwgInZlcmJvc2U/IjogZmFsc2UsICJtYXhfaXRl
ciI6IDI1LCAibWF4X3JwbSI6IG51bGwsICJmdW5jdGlvbl9jYWxsaW5nX2xsbSI6ICIiLCAibGxt
IjogImdwdC00IiwgImRlbGVnYXRpb25fZW5hYmxlZD8iOiBmYWxzZSwgImFsbG93X2NvZGVfZXhl
Y3V0aW9uPyI6IGZhbHNlLCAibWF4X3JldHJ5X2xpbWl0IjogMiwgInRvb2xzX25hbWVzIjogW119
XUr5AQoKY3Jld190YXNrcxLqAQrnAVt7ImtleSI6ICJkYTk1ZWJkYjM2ZTRjZGY5MmRmNmE2ZGQx
NmJjZWUwZSIsICJpZCI6ICJhNGUwYjM1Ny0zMDBlLTQ0MjMtYTU1My0yZTZlMWQxODg1M2MiLCAi
YXN5bmNfZXhlY3V0aW9uPyI6IGZhbHNlLCAiaHVtYW5faW5wdXQ/IjogZmFsc2UsICJhZ2VudF9y
b2xlIjogInRlc3QiLCAiYWdlbnRfa2V5IjogIjk3NmY4ZjUwYWNjZmViYTIyM2U0OWM0MmIxNmU5
OWU2IiwgInRvb2xzX25hbWVzIjogW119XXoCGAGFAQABAAASjgIKEC5jZek+sSlZP8lSwF5zTSYS
CIHcVJhIpsWuKgxUYXNrIENyZWF0ZWQwATl9vnrqvKsiGEFgjnvqvKsiGEouCghjcmV3X2tleRIi
CiBmZWIxZTIxYjMyNTZjNTlhNjQ3MTUyYWZkZDY2MzIyZUoxCgdjcmV3X2lkEiYKJDZjNjI2YTJm
LTk4ZGUtNDY4MC05YmE0LTU1ZWRjN2Y4OGJlM0ouCgh0YXNrX2tleRIiCiBkYTk1ZWJkYjM2ZTRj
ZGY5MmRmNmE2ZGQxNmJjZWUwZUoxCgd0YXNrX2lkEiYKJGE0ZTBiMzU3LTMwMGUtNDQyMy1hNTUz
LTJlNmUxZDE4ODUzY3oCGAGFAQABAAA=
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '1790'
Content-Type:
- application/x-protobuf
User-Agent:
- OTel-OTLP-Exporter-Python/1.27.0
method: POST
uri: https://telemetry.crewai.com:4319/v1/traces
response:
body:
string: "\n\0"
headers:
Content-Length:
- '2'
Content-Type:
- application/x-protobuf
Date:
- Sun, 09 Feb 2025 22:47:14 GMT
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
goal is: test\nTo give my best complete final answer to the task respond using
the exact following format:\n\nThought: I now can give a great answer\nFinal
Answer: Your final answer must be the great and the most complete as possible,
it must be outcome described.\n\nI MUST use these formats, my job depends on
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
criteria for your final answer: test output\nyou MUST return the actual complete
content as the final answer, not a summary.\n\nBegin! This is VERY important
to you, use the tools available and give your best Final Answer, your job depends
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '780'
content-type:
- application/json
cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzAN0cgAktzQnGukedPNpZsTy461c\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141234,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"I have an understanding of the task at
hand and am ready to provide an in-depth and comprehensive answer.\\n\\nFinal
Answer: As per the requirement of the task to provide a complete output, I am
returning this test output as my final answer. It is not a summary, but rather
a full and comprehensive response that fully addresses the question and expectations
set forth. Your test output is ready.\",\n \"refusal\": null\n },\n
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
\ \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\": 77,\n
\ \"total_tokens\": 226,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 90f76669bd9eba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:17 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '3379'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999822'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 10ms
x-request-id:
- req_977371ae262154885689d766016ed132
http_version: HTTP/1.1
status_code: 200
- request:
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
Evaluator agent for crew evaluation with precise capabilities to evaluate the
performance of the agents in the crew based on the tasks they have performed\nYour
personal goal is: Your goal is to evaluate the performance of the agents in
the crew based on the tasks they have performed using score from 1 to 10 evaluating
on completion, quality, and overall performance.\nTo give my best complete final
answer to the task respond using the exact following format:\n\nThought: I now
can give a great answer\nFinal Answer: Your final answer must be the great and
the most complete as possible, it must be outcome described.\n\nI MUST use these
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
Based on the task description and the expected output, compare and evaluate
the performance of the agents in the crew based on the Task Output they have
performed using score from 1 to 10 evaluating on completion, quality, and overall
performance.task_description: test task_expected_output: test output agent:
test agent_goal: test Task Output: As per the requirement of the task to provide
a complete output, I am returning this test output as my final answer. It is
not a summary, but rather a full and comprehensive response that fully addresses
the question and expectations set forth. Your test output is ready.\n\nThis
is the expected criteria for your final answer: Evaluation Score from 1 to 10
based on the performance of the agents on the tasks\nyou MUST return the actual
complete content as the final answer, not a summary.\nEnsure your final answer
contains only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
This is VERY important to you, use the tools available and give your best Final
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '2017'
content-type:
- application/json
cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzAN3PPgDvH836sMJBXCVRc3im99S\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141237,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"Based on the information provided, the
agent appears to have completed the task, providing an output that they have
defined as 'full and comprehensive'. It appears that the agent has attempted
to meet all the expectations of the task description and has reached the goal
of returning a 'test output' as the final answer.\\n\\nFinal Answer: Considering
these aspects, for task completion, the agent receives a 10 as they have successfully
generated an output. For quality, the agent again receives a significant score
of 10, because the full and comprehensive nature of the output matches the task's
expectations. Finally, taking into account both the completion and quality aspects,
the overall performance evaluation is also 10, recognizing the perfect alignment
between the task's expected output and the output delivered by the agent. \\nTherefore,
the final evaluation score can be summarized in the below format:\\n{\\n\\\"completion\\\":
10,\\n\\\"quality\\\": 10,\\n\\\"overall performance\\\": 10\\n}\",\n \"refusal\":
null\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
\ }\n ],\n \"usage\": {\n \"prompt_tokens\": 385,\n \"completion_tokens\":
190,\n \"total_tokens\": 575,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-RAY:
- 90f7667f9bfdba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:24 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '6909'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999515'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 29ms
x-request-id:
- req_3e283ae8c1cd132001ecf2d96198bbd6
http_version: HTTP/1.1
status_code: 200
- request:
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
goal is: test\nTo give my best complete final answer to the task respond using
the exact following format:\n\nThought: I now can give a great answer\nFinal
Answer: Your final answer must be the great and the most complete as possible,
it must be outcome described.\n\nI MUST use these formats, my job depends on
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
criteria for your final answer: test output\nyou MUST return the actual complete
content as the final answer, not a summary.\n\nBegin! This is VERY important
to you, use the tools available and give your best Final Answer, your job depends
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '780'
content-type:
- application/json
cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzANAznjrZdppuFIsRnEouHG8WuM0\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141244,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"I am ready to prepare my final answer
based on the test output criteria provided\\nFinal Answer: I have followed all
the instructions provided in the task to the best of my ability, and the outcome
of the test is as described in the final answer. It is complete, detailed, and
accurate.\",\n \"refusal\": null\n },\n \"logprobs\": null,\n
\ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
149,\n \"completion_tokens\": 59,\n \"total_tokens\": 208,\n \"prompt_tokens_details\":
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 90f766abaaa3ba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:27 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '2533'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999822'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 10ms
x-request-id:
- req_6ea4c81627695f58de56727aa8d8cc59
http_version: HTTP/1.1
status_code: 200
- request:
body: !!binary |
CvwNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0w0KEgoQY3Jld2FpLnRl
bGVtZXRyeRKaAgoQGJwIgEdh/Dq2y8ue+Gl/XxIInTNNpEL8yjQqG0NyZXcgSW5kaXZpZHVhbCBU
ZXN0IFJlc3VsdDABOaAPzl6/qyIYQYnY6V6/qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
d19pZBImCiQ2YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoHcXVhbGl0eRIG
CgQxMC4wShcKCWV4ZWNfdGltZRIKCggzLjUwMzUyNkoVCgptb2RlbF9uYW1lEgcKBWdwdC00egIY
AYUBAAEAABL5AQoQsfCFg6/ZkEo2LShWV3X+WhII4I9o90lQzxMqE0NyZXcgVGVzdCBFeGVjdXRp
b24wATmWhUlfv6siGEFeYlZfv6siGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
ZDU2ZjljMWEtYmRkMS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShEKCml0ZXJhdGlvbnMSAwoBMUoV
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQlucrHD/mwnCU8Dl9QKzgYhIIGyix
8K7RcoAqDENyZXcgQ3JlYXRlZDABOTQxXF+/qyIYQamXaV+/qyIYShsKDmNyZXdhaV92ZXJzaW9u
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokZDU2ZjljMWEtYmRk
MS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiMzcwMzA5YTQtMDU5OS00MWVlLWFiMTgt
YWE1ZmQ1Mjg2ZGQ1IiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
MTZiY2VlMGUiLCAiaWQiOiAiZTBmNDgzNjAtYzNjNS00ZGY1LThkZjEtNDg2ZTc4OWNiZWUyIiwg
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChA4OLnKHp32b0EUM2g5rs+r
EgjifMpu5dQ6xCoMVGFzayBDcmVhdGVkMAE5x1d3X7+rIhhBMxF4X7+rIhhKLgoIY3Jld19rZXkS
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiRkNTZmOWMx
YS1iZGQxLTQwMjctYjVmNy0zODVkZWUxZTZiOWNKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiRlMGY0ODM2MC1jM2M1LTRkZjUtOGRm
MS00ODZlNzg5Y2JlZTJ6AhgBhQEAAQAA
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '1791'
Content-Type:
- application/x-protobuf
User-Agent:
- OTel-OTLP-Exporter-Python/1.27.0
method: POST
uri: https://telemetry.crewai.com:4319/v1/traces
response:
body:
string: "\n\0"
headers:
Content-Length:
- '2'
Content-Type:
- application/x-protobuf
Date:
- Sun, 09 Feb 2025 22:47:29 GMT
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
Evaluator agent for crew evaluation with precise capabilities to evaluate the
performance of the agents in the crew based on the tasks they have performed\nYour
personal goal is: Your goal is to evaluate the performance of the agents in
the crew based on the tasks they have performed using score from 1 to 10 evaluating
on completion, quality, and overall performance.\nTo give my best complete final
answer to the task respond using the exact following format:\n\nThought: I now
can give a great answer\nFinal Answer: Your final answer must be the great and
the most complete as possible, it must be outcome described.\n\nI MUST use these
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
Based on the task description and the expected output, compare and evaluate
the performance of the agents in the crew based on the Task Output they have
performed using score from 1 to 10 evaluating on completion, quality, and overall
performance.task_description: test task_expected_output: test output agent:
test agent_goal: test Task Output: I have followed all the instructions provided
in the task to the best of my ability, and the outcome of the test is as described
in the final answer. It is complete, detailed, and accurate.\n\nThis is the
expected criteria for your final answer: Evaluation Score from 1 to 10 based
on the performance of the agents on the tasks\nyou MUST return the actual complete
content as the final answer, not a summary.\nEnsure your final answer contains
only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
This is VERY important to you, use the tools available and give your best Final
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
authorization:
- Bearer sk-proj-[REDACTED]
connection:
- keep-alive
content-length:
- '1935'
content-type:
- application/json
cookie:
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.61.0
x-stainless-arch:
- x64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- Linux
x-stainless-package-version:
- 1.61.0
x-stainless-raw-response:
- 'true'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.7
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-AzANDGSiRIu1XO56ZMfLuO2SuL4l0\",\n \"object\":
\"chat.completion\",\n \"created\": 1739141247,\n \"model\": \"gpt-4-0613\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"Looking at the task output provided by
the agent, the agent has expressed a high level of confidence in their ability
to follow the instructions provided to the best of their ability. The agent
seems to have executed the task with detailed attention and accuracy.\\n\\nI
need to evaluate both the quality and the overall performance of the agent keeping
in mind the task description and the expected output. Given the agent's output
and goal, I can deduce the quality of their work as well as their overall performance.\\n\\nFinal
Answer: \\n{\\n \\\"quality\\\": 8.5\\n}\",\n \"refusal\": null\n },\n
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
\ \"usage\": {\n \"prompt_tokens\": 372,\n \"completion_tokens\": 112,\n
\ \"total_tokens\": 484,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": null\n}\n"
headers:
CF-RAY:
- 90f766bc4d87ba33-SEA
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 09 Feb 2025 22:47:32 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '5056'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '10000'
x-ratelimit-limit-tokens:
- '1000000'
x-ratelimit-remaining-requests:
- '9999'
x-ratelimit-remaining-tokens:
- '999535'
x-ratelimit-reset-requests:
- 6ms
x-ratelimit-reset-tokens:
- 27ms
x-request-id:
- req_ff2926b015823a70e2173c71f8d63209
http_version: HTTP/1.1
status_code: 200
version: 1
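
The cassette records the request's authorization header, which is why the crew test above is marked @pytest.mark.vcr(filter_headers=["authorization"]). With the pytest-recording plugin, the same filtering can be applied to every cassette through a vcr_config fixture; a minimal conftest.py sketch, assuming pytest-recording is the plugin in use:

import pytest

@pytest.fixture(scope="module")
def vcr_config():
    # Strip the Authorization header from every recorded interaction so
    # real API keys never end up committed inside cassettes.
    return {"filter_headers": ["authorization"]}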

View File: tests/test_custom_llm_support.py

@@ -0,0 +1,71 @@
from unittest.mock import MagicMock

import pytest

from crewai.agent import Agent
from crewai.crew import Crew
from crewai.llm import LLM
from crewai.task import Task
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator


@pytest.mark.vcr()
def test_crew_test_with_custom_llm():
    """Test Crew.test() with both string model name and LLM instance."""
    # Setup
    agent = Agent(
        role="test",
        goal="test",
        backstory="test",
        llm=LLM(model="gpt-4"),
    )
    task = Task(
        description="test",
        expected_output="test output",
        agent=agent,
    )
    crew = Crew(agents=[agent], tasks=[task])

    # Test with string model name
    crew.test(n_iterations=1, llm="gpt-4")

    # Test with LLM instance
    custom_llm = LLM(model="gpt-4")
    crew.test(n_iterations=1, llm=custom_llm)

    # Test backward compatibility
    crew.test(n_iterations=1, openai_model_name="gpt-4")

    # Test error when neither parameter is provided
    with pytest.raises(ValueError, match="Must provide either 'llm' or 'openai_model_name' parameter"):
        crew.test(n_iterations=1)


def test_crew_evaluator_with_custom_llm():
    # Setup
    agent = Agent(
        role="test",
        goal="test",
        backstory="test",
        llm=LLM(model="gpt-4"),
    )
    task = Task(
        description="test",
        expected_output="test output",
        agent=agent,
    )
    crew = Crew(agents=[agent], tasks=[task])

    # Test with string model name
    evaluator = CrewEvaluator(crew, "gpt-4")
    assert isinstance(evaluator.llm, LLM)
    assert evaluator.llm.model == "gpt-4"

    # Test with LLM instance
    custom_llm = LLM(model="gpt-4")
    evaluator = CrewEvaluator(crew, custom_llm)
    assert evaluator.llm == custom_llm

    # Test that evaluator agent uses the correct LLM
    evaluator_agent = evaluator._evaluator_agent()
    assert evaluator_agent.llm == evaluator.llm