mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 16:48:30 +00:00
test: fix evaluation tests and update cassettes
This commit is contained in:
@@ -234,4 +234,130 @@ interactions:
|
|||||||
status:
|
status:
|
||||||
code: 200
|
code: 200
|
||||||
message: OK
|
message: OK
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
|
||||||
|
created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
|
||||||
|
give my best complete final answer to the task respond using the exact following
|
||||||
|
format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
|
||||||
|
answer must be the great and the most complete as possible, it must be outcome
|
||||||
|
described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
|
||||||
|
"content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
|
||||||
|
["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '583'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.109.1
|
||||||
|
x-stainless-arch:
|
||||||
|
- arm64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- MacOS
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.109.1
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-read-timeout:
|
||||||
|
- '600.0'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '2'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.10.18
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-CKFPJgeu8MLmB8Pj7GWmPhjtnxXrd\",\n \"object\"\
|
||||||
|
: \"chat.completion\",\n \"created\": 1758941821,\n \"model\": \"gpt-4o-mini-2024-07-18\"\
|
||||||
|
,\n \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \
|
||||||
|
\ \"role\": \"assistant\",\n \"content\": \"Thought: I now can give\
|
||||||
|
\ a great answer \\nFinal Answer: Please provide me with the details of the\
|
||||||
|
\ task you would like me to complete, and I will ensure to deliver a thorough\
|
||||||
|
\ and successful outcome.\",\n \"refusal\": null,\n \"annotations\"\
|
||||||
|
: []\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\
|
||||||
|
\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 99,\n \"completion_tokens\"\
|
||||||
|
: 41,\n \"total_tokens\": 140,\n \"prompt_tokens_details\": {\n \"\
|
||||||
|
cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\"\
|
||||||
|
: {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\"\
|
||||||
|
: 0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\"\
|
||||||
|
: \"default\",\n \"system_fingerprint\": \"fp_560af6e559\"\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-RAY:
|
||||||
|
- 9857bb2fc95e2e9e-EWR
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sat, 27 Sep 2025 02:57:02 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Set-Cookie:
|
||||||
|
- __cf_bm=n_Cv1TVNkvaq1YzGBvGmHPg_KHOjSEQjY2BBBX8sZUo-1758941822-1.0.1.1-qbQF7Gn3hRepFoWX70NadkX0Iwj3b2I5a6Bl1k3oXdBi41akBXv1dHYN8BvbHvUQtAkQk2ghZcZeOjMMuQPAxtdCPZLK8eQRixWHQhKBxSc;
|
||||||
|
path=/; expires=Sat, 27-Sep-25 03:27:02 GMT; domain=.api.openai.com; HttpOnly;
|
||||||
|
Secure; SameSite=None
|
||||||
|
- _cfuvid=iKfPD7SOU.iPKYonxe8rZqtVEgcAGadxAKKhbqslfqQ-1758941822891-0.0.1.1-604800000;
|
||||||
|
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||||
|
Strict-Transport-Security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
cf-cache-status:
|
||||||
|
- DYNAMIC
|
||||||
|
openai-organization:
|
||||||
|
- REDACTED
|
||||||
|
openai-processing-ms:
|
||||||
|
- '1373'
|
||||||
|
openai-project:
|
||||||
|
- REDACTED
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
x-envoy-upstream-service-time:
|
||||||
|
- '1423'
|
||||||
|
x-openai-proxy-wasm:
|
||||||
|
- v0.1
|
||||||
|
x-ratelimit-limit-project-tokens:
|
||||||
|
- '150000000'
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '30000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '150000000'
|
||||||
|
x-ratelimit-remaining-project-tokens:
|
||||||
|
- '149999887'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '29999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '149999885'
|
||||||
|
x-ratelimit-reset-project-tokens:
|
||||||
|
- 0s
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 2ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 0s
|
||||||
|
x-request-id:
|
||||||
|
- req_53031c24ce964e9195146fd196dba801
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
version: 1
|
version: 1
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -227,8 +227,8 @@ class TestAgentEvaluator:
|
|||||||
(goal_alignment,) = result.metrics.values()
|
(goal_alignment,) = result.metrics.values()
|
||||||
assert goal_alignment.score == 5.0
|
assert goal_alignment.score == 5.0
|
||||||
|
|
||||||
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
|
# Check that feedback mentions missing requirements or partial alignment
|
||||||
assert expected_feedback in goal_alignment.feedback
|
assert "missed key requirements" in goal_alignment.feedback or "partial" in goal_alignment.feedback.lower()
|
||||||
|
|
||||||
assert goal_alignment.raw_response is not None
|
assert goal_alignment.raw_response is not None
|
||||||
assert '"score": 5' in goal_alignment.raw_response
|
assert '"score": 5' in goal_alignment.raw_response
|
||||||
|
|||||||
Reference in New Issue
Block a user