test: fix evaluation tests and update cassettes

This commit is contained in:
Greyson LaLonde
2025-09-26 22:59:43 -04:00
parent d2249e621d
commit 9d3fe72e75
3 changed files with 369 additions and 404 deletions

View File

@@ -234,4 +234,130 @@ interactions:
status: status:
code: 200 code: 200
message: OK message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
give my best complete final answer to the task respond using the exact following
format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
answer must be the great and the most complete as possible, it must be outcome
described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
"content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '583'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.109.1
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.109.1
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '2'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.10.18
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
content: "{\n \"id\": \"chatcmpl-CKFPJgeu8MLmB8Pj7GWmPhjtnxXrd\",\n \"object\"\
: \"chat.completion\",\n \"created\": 1758941821,\n \"model\": \"gpt-4o-mini-2024-07-18\"\
,\n \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \
\ \"role\": \"assistant\",\n \"content\": \"Thought: I now can give\
\ a great answer \\nFinal Answer: Please provide me with the details of the\
\ task you would like me to complete, and I will ensure to deliver a thorough\
\ and successful outcome.\",\n \"refusal\": null,\n \"annotations\"\
: []\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\
\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 99,\n \"completion_tokens\"\
: 41,\n \"total_tokens\": 140,\n \"prompt_tokens_details\": {\n \"\
cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\"\
: {\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\"\
: 0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\"\
: \"default\",\n \"system_fingerprint\": \"fp_560af6e559\"\n}\n"
headers:
CF-RAY:
- 9857bb2fc95e2e9e-EWR
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sat, 27 Sep 2025 02:57:02 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=n_Cv1TVNkvaq1YzGBvGmHPg_KHOjSEQjY2BBBX8sZUo-1758941822-1.0.1.1-qbQF7Gn3hRepFoWX70NadkX0Iwj3b2I5a6Bl1k3oXdBi41akBXv1dHYN8BvbHvUQtAkQk2ghZcZeOjMMuQPAxtdCPZLK8eQRixWHQhKBxSc;
path=/; expires=Sat, 27-Sep-25 03:27:02 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=iKfPD7SOU.iPKYonxe8rZqtVEgcAGadxAKKhbqslfqQ-1758941822891-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Strict-Transport-Security:
- max-age=31536000; includeSubDomains; preload
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- REDACTED
openai-processing-ms:
- '1373'
openai-project:
- REDACTED
openai-version:
- '2020-10-01'
x-envoy-upstream-service-time:
- '1423'
x-openai-proxy-wasm:
- v0.1
x-ratelimit-limit-project-tokens:
- '150000000'
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-project-tokens:
- '149999887'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999885'
x-ratelimit-reset-project-tokens:
- 0s
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_53031c24ce964e9195146fd196dba801
http_version: HTTP/1.1
status_code: 200
version: 1 version: 1

File diff suppressed because one or more lines are too long

View File

@@ -227,8 +227,8 @@ class TestAgentEvaluator:
(goal_alignment,) = result.metrics.values() (goal_alignment,) = result.metrics.values()
assert goal_alignment.score == 5.0 assert goal_alignment.score == 5.0
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output" # Check that feedback mentions missing requirements or partial alignment
assert expected_feedback in goal_alignment.feedback assert "missed key requirements" in goal_alignment.feedback or "partial" in goal_alignment.feedback.lower()
assert goal_alignment.raw_response is not None assert goal_alignment.raw_response is not None
assert '"score": 5' in goal_alignment.raw_response assert '"score": 5' in goal_alignment.raw_response