test: fix evaluation tests and update cassettes

2026-01-10 16:48:30 +00:00 · 2025-09-26 22:59:43 -04:00
parent d2249e621d
commit 9d3fe72e75
3 changed files with 369 additions and 404 deletions
--- a/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
@@ -234,4 +234,130 @@ interactions:
    status:
      code: 200
      message: OK
 - request:
    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
      created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
      give my best complete final answer to the task respond using the exact following
      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
      answer must be the great and the most complete as possible, it must be outcome
      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
      "content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
      ["\nObservation:"]}'
    headers:
      accept:
      - application/json
      accept-encoding:
      - gzip, deflate
      connection:
      - keep-alive
      content-length:
      - '583'
      content-type:
      - application/json
      host:
      - api.openai.com
      user-agent:
      - OpenAI/Python 1.109.1
      x-stainless-arch:
      - arm64
      x-stainless-async:
      - 'false'
      x-stainless-lang:
      - python
      x-stainless-os:
      - MacOS
      x-stainless-package-version:
      - 1.109.1
      x-stainless-raw-response:
      - 'true'
      x-stainless-read-timeout:
      - '600.0'
      x-stainless-retry-count:
      - '2'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.10.18
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    content: "{\n  \"id\": \"chatcmpl-CKFPJgeu8MLmB8Pj7GWmPhjtnxXrd\",\n  \"object\"\
      : \"chat.completion\",\n  \"created\": 1758941821,\n  \"model\": \"gpt-4o-mini-2024-07-18\"\
      ,\n  \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n    \
      \    \"role\": \"assistant\",\n        \"content\": \"Thought: I now can give\
      \ a great answer  \\nFinal Answer: Please provide me with the details of the\
      \ task you would like me to complete, and I will ensure to deliver a thorough\
      \ and successful outcome.\",\n        \"refusal\": null,\n        \"annotations\"\
      : []\n      },\n      \"logprobs\": null,\n      \"finish_reason\": \"stop\"\
      \n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 99,\n    \"completion_tokens\"\
      : 41,\n    \"total_tokens\": 140,\n    \"prompt_tokens_details\": {\n      \"\
      cached_tokens\": 0,\n      \"audio_tokens\": 0\n    },\n    \"completion_tokens_details\"\
      : {\n      \"reasoning_tokens\": 0,\n      \"audio_tokens\": 0,\n      \"accepted_prediction_tokens\"\
      : 0,\n      \"rejected_prediction_tokens\": 0\n    }\n  },\n  \"service_tier\"\
      : \"default\",\n  \"system_fingerprint\": \"fp_560af6e559\"\n}\n"
    headers:
      CF-RAY:
      - 9857bb2fc95e2e9e-EWR
      Connection:
      - keep-alive
      Content-Encoding:
      - gzip
      Content-Type:
      - application/json
      Date:
      - Sat, 27 Sep 2025 02:57:02 GMT
      Server:
      - cloudflare
      Set-Cookie:
      - __cf_bm=n_Cv1TVNkvaq1YzGBvGmHPg_KHOjSEQjY2BBBX8sZUo-1758941822-1.0.1.1-qbQF7Gn3hRepFoWX70NadkX0Iwj3b2I5a6Bl1k3oXdBi41akBXv1dHYN8BvbHvUQtAkQk2ghZcZeOjMMuQPAxtdCPZLK8eQRixWHQhKBxSc;
        path=/; expires=Sat, 27-Sep-25 03:27:02 GMT; domain=.api.openai.com; HttpOnly;
        Secure; SameSite=None
      - _cfuvid=iKfPD7SOU.iPKYonxe8rZqtVEgcAGadxAKKhbqslfqQ-1758941822891-0.0.1.1-604800000;
        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
      Strict-Transport-Security:
      - max-age=31536000; includeSubDomains; preload
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - nosniff
      access-control-expose-headers:
      - X-Request-ID
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - REDACTED
      openai-processing-ms:
      - '1373'
      openai-project:
      - REDACTED
      openai-version:
      - '2020-10-01'
      x-envoy-upstream-service-time:
      - '1423'
      x-openai-proxy-wasm:
      - v0.1
      x-ratelimit-limit-project-tokens:
      - '150000000'
      x-ratelimit-limit-requests:
      - '30000'
      x-ratelimit-limit-tokens:
      - '150000000'
      x-ratelimit-remaining-project-tokens:
      - '149999887'
      x-ratelimit-remaining-requests:
      - '29999'
      x-ratelimit-remaining-tokens:
      - '149999885'
      x-ratelimit-reset-project-tokens:
      - 0s
      x-ratelimit-reset-requests:
      - 2ms
      x-ratelimit-reset-tokens:
      - 0s
      x-request-id:
      - req_53031c24ce964e9195146fd196dba801
    http_version: HTTP/1.1
    status_code: 200
 version: 1
--- a/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
+++ b/tests/cassettes/TestAgentEvaluator.test_eval_specific_agents_from_crew.yaml
--- a/tests/experimental/evaluation/test_agent_evaluator.py
+++ b/tests/experimental/evaluation/test_agent_evaluator.py
@@ -227,8 +227,8 @@ class TestAgentEvaluator:
            (goal_alignment,) = result.metrics.values()
            assert goal_alignment.score == 5.0
-            expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
+            # Check that feedback mentions missing requirements or partial alignment
-            assert expected_feedback in goal_alignment.feedback
+            assert "missed key requirements" in goal_alignment.feedback or "partial" in goal_alignment.feedback.lower()
            assert goal_alignment.raw_response is not None
            assert '"score": 5' in goal_alignment.raw_response