Mirror of https://github.com/crewAIInc/crewAI.git
Synced 2026-01-08 15:48:29 +00:00

Compare commits: lg-agent-r ... devin/1739 (15 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 10af8e35fd | |
| | e343017414 | |
| | dad121a692 | |
| | 8dc07febc7 | |
| | 09240a7b62 | |
| | 0423dd8134 | |
| | f4efdc55e2 | |
| | 0ab66041da | |
| | b8f2603bf3 | |
| | ff620f0ad6 | |
| | 1caf45ad9b | |
| | 4a216d1f15 | |
| | 5f5a1b3687 | |
| | f3704a44b3 | |
| | 9bd39464cc | |
@@ -1147,20 +1147,32 @@ class Crew(BaseModel):
     def test(
         self,
-        n_iterations: int,
+        n_iterations: int = 1,
         openai_model_name: Optional[str] = None,
+        llm: Optional[Union[str, LLM]] = None,
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
-        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
+        """Test and evaluate the Crew with the given inputs for n iterations.
+
+        Args:
+            n_iterations: Number of iterations to run the test
+            openai_model_name: OpenAI model name to use for evaluation (deprecated)
+            llm: LLM instance or model name to use for evaluation
+            inputs: Optional dictionary of inputs to pass to the crew
+        """
+        if not llm and not openai_model_name:
+            raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
+
+        model_to_use = self._get_llm_instance(llm, openai_model_name)
         test_crew = self.copy()
 
         self._test_execution_span = test_crew._telemetry.test_execution_span(
             test_crew,
             n_iterations,
             inputs,
-            openai_model_name,  # type: ignore[arg-type]
-        )  # type: ignore[arg-type]
-        evaluator = CrewEvaluator(test_crew, openai_model_name)  # type: ignore[arg-type]
+            str(model_to_use.model),
+        )
+        evaluator = CrewEvaluator(test_crew, model_to_use)
 
         for i in range(1, n_iterations + 1):
             evaluator.set_iteration(i)
@@ -1168,6 +1180,28 @@ class Crew(BaseModel):
         evaluator.print_crew_evaluation_result()
 
+    def _get_llm_instance(self, llm: Optional[Union[str, LLM]], openai_model_name: Optional[str]) -> LLM:
+        """Get an LLM instance from either llm or openai_model_name parameter.
+
+        Args:
+            llm: LLM instance or model name
+            openai_model_name: OpenAI model name (deprecated)
+
+        Returns:
+            LLM instance
+
+        Raises:
+            ValueError: If neither llm nor openai_model_name is provided
+        """
+        model = llm if llm is not None else openai_model_name
+        if model is None:
+            raise ValueError("Must provide either 'llm' or 'openai_model_name' parameter")
+        if isinstance(model, str):
+            return LLM(model=model)
+        if not isinstance(model, LLM):
+            raise ValueError("Model must be either a string or an LLM instance")
+        return model
+
     def __repr__(self):
         return f"Crew(id={self.id}, process={self.process}, number_of_agents={len(self.agents)}, number_of_tasks={len(self.tasks)})"
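For context on the new call surface above, here is a minimal usage sketch mirroring the test file added at the bottom of this diff. The model names are illustrative, and the `crew.test(...)` calls contact the configured LLM provider, so they need credentials (the new test records them with VCR).

```python
from crewai import Agent, Crew, Task
from crewai.llm import LLM

# Placeholder agent/task/crew, matching the setup used in the new test file.
agent = Agent(role="test", goal="test", backstory="test", llm=LLM(model="gpt-4"))
task = Task(description="test", expected_output="test output", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# New preferred parameter: a model name string or an LLM instance.
crew.test(n_iterations=1, llm="gpt-4")
crew.test(n_iterations=1, llm=LLM(model="gpt-4"))

# Deprecated but still supported for backward compatibility.
crew.test(n_iterations=1, openai_model_name="gpt-4")

# With neither parameter supplied, test() raises ValueError.
# crew.test(n_iterations=1)
```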
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from typing import Union
 
 from pydantic import BaseModel, Field
 from rich.box import HEAVY_EDGE
@@ -6,6 +7,7 @@ from rich.console import Console
 from rich.table import Table
 
 from crewai.agent import Agent
+from crewai.llm import LLM
 from crewai.task import Task
 from crewai.tasks.task_output import TaskOutput
 from crewai.telemetry import Telemetry
@@ -32,9 +34,9 @@ class CrewEvaluator:
     run_execution_times: defaultdict = defaultdict(list)
     iteration: int = 0
 
-    def __init__(self, crew, openai_model_name: str):
+    def __init__(self, crew, llm: Union[str, LLM]):
         self.crew = crew
-        self.openai_model_name = openai_model_name
+        self.llm = LLM(model=llm) if isinstance(llm, str) else llm
         self._telemetry = Telemetry()
         self._setup_for_evaluating()
 
@@ -51,7 +53,7 @@ class CrewEvaluator:
             ),
             backstory="Evaluator agent for crew evaluation with precise capabilities to evaluate the performance of the agents in the crew based on the tasks they have performed",
             verbose=False,
-            llm=self.openai_model_name,
+            llm=self.llm,
         )
 
     def _evaluation_task(
@@ -181,7 +183,7 @@ class CrewEvaluator:
             self.crew,
             evaluation_result.pydantic.quality,
             current_task.execution_duration,
-            self.openai_model_name,
+            self.llm.model,
         )
         self.tasks_scores[self.iteration].append(evaluation_result.pydantic.quality)
         self.run_execution_times[self.iteration].append(
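The constructor change above means callers can hand `CrewEvaluator` either a model name or an `LLM` instance. A short sketch of that normalization, echoing the assertions in the new test file at the end of this diff (agent, task, and model values are placeholders):

```python
from crewai import Agent, Crew, Task
from crewai.llm import LLM
from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator

agent = Agent(role="test", goal="test", backstory="test", llm=LLM(model="gpt-4"))
task = Task(description="test", expected_output="test output", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# A string is wrapped into an LLM instance...
evaluator = CrewEvaluator(crew, "gpt-4")
assert isinstance(evaluator.llm, LLM) and evaluator.llm.model == "gpt-4"

# ...while an LLM instance is stored as-is and reused for the evaluator agent.
custom_llm = LLM(model="gpt-4")
evaluator = CrewEvaluator(crew, custom_llm)
assert evaluator.llm == custom_llm
```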
@@ -3306,8 +3306,7 @@ def test_conditional_should_execute():
 
 @mock.patch("crewai.crew.CrewEvaluator")
 @mock.patch("crewai.crew.Crew.copy")
-@mock.patch("crewai.crew.Crew.kickoff")
-def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
+def test_crew_testing_function(copy_mock, crew_evaluator_mock):
     task = Task(
         description="Come up with a list of 5 interesting ideas to explore for an article, then write one amazing paragraph highlight for each idea that showcases how good an article about this topic could be. Return the list of ideas with their paragraph and your notes.",
         expected_output="5 bullet points with a paragraph for each idea.",
@@ -3319,25 +3318,28 @@ def test_crew_testing_function(kickoff_mock, copy_mock, crew_evaluator):
         tasks=[task],
     )
 
-    # Create a mock for the copied crew
-    copy_mock.return_value = crew
+    # Create a mock for the copied crew with a mock kickoff method
+    copied_crew = MagicMock()
+    copy_mock.return_value = copied_crew
+
+    # Create a mock for the CrewEvaluator instance
+    evaluator_instance = MagicMock()
+    crew_evaluator_mock.return_value = evaluator_instance
 
     n_iterations = 2
    crew.test(n_iterations, openai_model_name="gpt-4o-mini", inputs={"topic": "AI"})
 
     # Ensure kickoff is called on the copied crew
-    kickoff_mock.assert_has_calls(
+    copied_crew.kickoff.assert_has_calls(
         [mock.call(inputs={"topic": "AI"}), mock.call(inputs={"topic": "AI"})]
     )
 
-    crew_evaluator.assert_has_calls(
-        [
-            mock.call(crew, "gpt-4o-mini"),
-            mock.call().set_iteration(1),
-            mock.call().set_iteration(2),
-            mock.call().print_crew_evaluation_result(),
-        ]
-    )
+    # Verify CrewEvaluator interactions
+    # We don't check the exact LLM object since it's created internally
+    assert len(crew_evaluator_mock.mock_calls) == 4
+    assert crew_evaluator_mock.mock_calls[1] == mock.call().set_iteration(1)
+    assert crew_evaluator_mock.mock_calls[2] == mock.call().set_iteration(2)
+    assert crew_evaluator_mock.mock_calls[3] == mock.call().print_crew_evaluation_result()
 
 
 @pytest.mark.vcr(filter_headers=["authorization"])
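A note on the rewritten assertions above: because `crewai.crew.CrewEvaluator` is patched at the class level, the mock records the constructor call at `mock_calls[0]` and the subsequent instance-method calls at indices 1 to 3, which is why the test indexes into `mock_calls` instead of asserting on the constructor arguments. A standalone sketch of that `unittest.mock` behaviour, with placeholder arguments:

```python
from unittest.mock import MagicMock, call

mock_cls = MagicMock()                        # stands in for the patched CrewEvaluator class
instance = mock_cls("some_crew", "some_llm")  # recorded as mock_calls[0]
instance.set_iteration(1)                     # recorded as call().set_iteration(1)
instance.set_iteration(2)
instance.print_crew_evaluation_result()

assert len(mock_cls.mock_calls) == 4
assert mock_cls.mock_calls[1] == call().set_iteration(1)
assert mock_cls.mock_calls[2] == call().set_iteration(2)
assert mock_cls.mock_calls[3] == call().print_crew_evaluation_result()
```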
@@ -0,0 +1,942 @@
|
|||||||
|
interactions:
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||||
|
goal is: test\nTo give my best complete final answer to the task respond using
|
||||||
|
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||||
|
Answer: Your final answer must be the great and the most complete as possible,
|
||||||
|
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||||
|
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||||
|
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||||
|
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||||
|
to you, use the tools available and give your best Final Answer, your job depends
|
||||||
|
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '780'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzAMqMWFX8hC1szIKxWNSyXm0SPFi\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141224,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"I am prepared to conduct the test efficiently.\\n\\nFinal
|
||||||
|
Answer: The test output that aligns with the given criteria is a detailed description
|
||||||
|
of the testing process, providing a thorough understanding for anyone reviewing
|
||||||
|
it. The output not only contains the raw data or results but also includes step-by-step
|
||||||
|
documentation of the process employed, thoughts and reasoning behind each step,
|
||||||
|
deviations if any from the original plan, and how these deviations impacted
|
||||||
|
the results. In addition, it captures any errors or unexpected occurrences during
|
||||||
|
the course of the test, and proposes possible explanations or solutions for
|
||||||
|
these. It is detailed yet comprehensible, catering to both technical and non-technical
|
||||||
|
audiences. It is a result of meticulous planning, diligent execution, and robust
|
||||||
|
post-test analysis, making it a complete content.\",\n \"refusal\": null\n
|
||||||
|
\ },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n
|
||||||
|
\ ],\n \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\":
|
||||||
|
151,\n \"total_tokens\": 300,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||||
|
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||||
|
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-RAY:
|
||||||
|
- 90f7662cab11ba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:09 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Set-Cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
path=/; expires=Sun, 09-Feb-25 23:17:09 GMT; domain=.api.openai.com; HttpOnly;
|
||||||
|
Secure; SameSite=None
|
||||||
|
- _cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000;
|
||||||
|
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
cf-cache-status:
|
||||||
|
- DYNAMIC
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '4585'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999822'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 10ms
|
||||||
|
x-request-id:
|
||||||
|
- req_1ba81a80018602119b871a7a42d7becf
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
- request:
|
||||||
|
body: !!binary |
|
||||||
|
Ct8LCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkStgsKEgoQY3Jld2FpLnRl
|
||||||
|
bGVtZXRyeRL5AQoQT8uAOJ+suOhFs22RW56o6BII0Ob64+TP3XQqE0NyZXcgVGVzdCBFeGVjdXRp
|
||||||
|
b24wATk1IlKguqsiGEGGgWOguqsiGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
|
||||||
|
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
|
||||||
|
M2Q1MGJkYWItZDI1NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShEKCml0ZXJhdGlvbnMSAwoBMUoV
|
||||||
|
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQgzrB2KxaHe9FwPNktJHbFRIIT8gM
|
||||||
|
r7rSvJUqDENyZXcgQ3JlYXRlZDABOW4kaqC6qyIYQe+yeaC6qyIYShsKDmNyZXdhaV92ZXJzaW9u
|
||||||
|
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
|
||||||
|
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokM2Q1MGJkYWItZDI1
|
||||||
|
NS00MjFiLThkMzMtZjZmOTAzMThhOWQwShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
|
||||||
|
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
|
||||||
|
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
|
||||||
|
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiN2E3NmZjNmYtZTI5YS00MDBlLWI0NGEt
|
||||||
|
NzAyMDNlMzg1Y2RmIiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
|
||||||
|
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
|
||||||
|
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
|
||||||
|
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
|
||||||
|
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
|
||||||
|
MTZiY2VlMGUiLCAiaWQiOiAiNTcwYmJlYjQtYzkzNi00NTNkLTg2MjktYzhjMDM0ODA5NDhjIiwg
|
||||||
|
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
|
||||||
|
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
|
||||||
|
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChAus8hZAJcezzXdP2XqhVyF
|
||||||
|
Egi1wnliqIdQdSoMVGFzayBDcmVhdGVkMAE5nneIoLqrIhhBKDyJoLqrIhhKLgoIY3Jld19rZXkS
|
||||||
|
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQzZDUwYmRh
|
||||||
|
Yi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
|
||||||
|
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiQ1NzBiYmViNC1jOTM2LTQ1M2QtODYy
|
||||||
|
OS1jOGMwMzQ4MDk0OGN6AhgBhQEAAQAA
|
||||||
|
headers:
|
||||||
|
Accept:
|
||||||
|
- '*/*'
|
||||||
|
Accept-Encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Length:
|
||||||
|
- '1506'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
User-Agent:
|
||||||
|
- OTel-OTLP-Exporter-Python/1.27.0
|
||||||
|
method: POST
|
||||||
|
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||||
|
response:
|
||||||
|
body:
|
||||||
|
string: "\n\0"
|
||||||
|
headers:
|
||||||
|
Content-Length:
|
||||||
|
- '2'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:09 GMT
|
||||||
|
status:
|
||||||
|
code: 200
|
||||||
|
message: OK
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||||
|
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||||
|
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||||
|
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||||
|
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||||
|
on completion, quality, and overall performance.\nTo give my best complete final
|
||||||
|
answer to the task respond using the exact following format:\n\nThought: I now
|
||||||
|
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||||
|
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||||
|
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||||
|
Based on the task description and the expected output, compare and evaluate
|
||||||
|
the performance of the agents in the crew based on the Task Output they have
|
||||||
|
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||||
|
performance.task_description: test task_expected_output: test output agent:
|
||||||
|
test agent_goal: test Task Output: The test output that aligns with the given
|
||||||
|
criteria is a detailed description of the testing process, providing a thorough
|
||||||
|
understanding for anyone reviewing it. The output not only contains the raw
|
||||||
|
data or results but also includes step-by-step documentation of the process
|
||||||
|
employed, thoughts and reasoning behind each step, deviations if any from the
|
||||||
|
original plan, and how these deviations impacted the results. In addition, it
|
||||||
|
captures any errors or unexpected occurrences during the course of the test,
|
||||||
|
and proposes possible explanations or solutions for these. It is detailed yet
|
||||||
|
comprehensible, catering to both technical and non-technical audiences. It is
|
||||||
|
a result of meticulous planning, diligent execution, and robust post-test analysis,
|
||||||
|
making it a complete content.\n\nThis is the expected criteria for your final
|
||||||
|
answer: Evaluation Score from 1 to 10 based on the performance of the agents
|
||||||
|
on the tasks\nyou MUST return the actual complete content as the final answer,
|
||||||
|
not a summary.\nEnsure your final answer contains only the content in the following
|
||||||
|
format: {\n \"quality\": float\n}\n\nEnsure the final output does not include
|
||||||
|
any code block markers like ```json or ```python.\n\nBegin! This is VERY important
|
||||||
|
to you, use the tools available and give your best Final Answer, your job depends
|
||||||
|
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '2523'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzAMvAOSw5847reo2vh61focjnyK2\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141229,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"Based on the given task output, I can
|
||||||
|
determine that the test agent has performed impressively well. Their work is
|
||||||
|
comprehensive, catering to both non-technical and technical audiences and includes
|
||||||
|
complete and detailed process documentation. Further, the way they detect and
|
||||||
|
elaborate deviances and errors shows their meticulousness and efficiency. Their
|
||||||
|
planning, execution, and analysis are sound.\\n\\nFinal Answer: The quality
|
||||||
|
of this task is admirable, paying attention to details and meticulously planning
|
||||||
|
and reasoning behind each step. Considering all these, on a scale from 1 to
|
||||||
|
10, I would rate the task performed by the test agent as follows:\\n\\n{\\n
|
||||||
|
\ \\\"quality\\\": 9.5\\n}\",\n \"refusal\": null\n },\n \"logprobs\":
|
||||||
|
null,\n \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
|
||||||
|
471,\n \"completion_tokens\": 133,\n \"total_tokens\": 604,\n \"prompt_tokens_details\":
|
||||||
|
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||||
|
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-Cache-Status:
|
||||||
|
- DYNAMIC
|
||||||
|
CF-RAY:
|
||||||
|
- 90f7664a581eba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:14 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '4884'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999388'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 36ms
|
||||||
|
x-request-id:
|
||||||
|
- req_0335ff13c1777c1bcdbee89879bc132c
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
- request:
|
||||||
|
body: !!binary |
|
||||||
|
CvsNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0g0KEgoQY3Jld2FpLnRl
|
||||||
|
bGVtZXRyeRKZAgoQJQy0LAglHxA7Ok+n0Gmi9hIIybabc5KDQlkqG0NyZXcgSW5kaXZpZHVhbCBU
|
||||||
|
ZXN0IFJlc3VsdDABOQK5tem8qyIYQVTw0+m8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
|
||||||
|
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
|
||||||
|
d19pZBImCiQzZDUwYmRhYi1kMjU1LTQyMWItOGQzMy1mNmY5MDMxOGE5ZDBKEAoHcXVhbGl0eRIF
|
||||||
|
CgM5LjVKFwoJZXhlY190aW1lEgoKCDQuODA5MTc4ShUKCm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgB
|
||||||
|
hQEAAQAAEvkBChCnePSvJJg/cFeYF3HlEvyVEgh3YAewpHkssyoTQ3JldyBUZXN0IEV4ZWN1dGlv
|
||||||
|
bjABOeiBSOq8qyIYQQWTV+q8qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAwLjFKLgoIY3Jl
|
||||||
|
d19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2
|
||||||
|
YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoKaXRlcmF0aW9ucxIDCgExShUK
|
||||||
|
Cm1vZGVsX25hbWUSBwoFZ3B0LTR6AhgBhQEAAQAAEpIHChB0Gz3vlppAjams1hbMI/RQEggPCufR
|
||||||
|
e9thfSoMQ3JldyBDcmVhdGVkMAE5X3td6ryrIhhBWNhr6ryrIhhKGwoOY3Jld2FpX3ZlcnNpb24S
|
||||||
|
CQoHMC4xMDAuMUoaCg5weXRob25fdmVyc2lvbhIICgYzLjEyLjdKLgoIY3Jld19rZXkSIgogZmVi
|
||||||
|
MWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiQ2YzYyNmEyZi05OGRl
|
||||||
|
LTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKHAoMY3Jld19wcm9jZXNzEgwKCnNlcXVlbnRpYWxKEQoL
|
||||||
|
Y3Jld19tZW1vcnkSAhAAShoKFGNyZXdfbnVtYmVyX29mX3Rhc2tzEgIYAUobChVjcmV3X251bWJl
|
||||||
|
cl9vZl9hZ2VudHMSAhgBSsUCCgtjcmV3X2FnZW50cxK1AgqyAlt7ImtleSI6ICI5NzZmOGY1MGFj
|
||||||
|
Y2ZlYmEyMjNlNDljNDJiMTZlOTllNiIsICJpZCI6ICJkYTA4M2Q5ZS0xOWU5LTQyMzAtYjZmNC0y
|
||||||
|
NjlhNzM1NzViOWQiLCAicm9sZSI6ICJ0ZXN0IiwgInZlcmJvc2U/IjogZmFsc2UsICJtYXhfaXRl
|
||||||
|
ciI6IDI1LCAibWF4X3JwbSI6IG51bGwsICJmdW5jdGlvbl9jYWxsaW5nX2xsbSI6ICIiLCAibGxt
|
||||||
|
IjogImdwdC00IiwgImRlbGVnYXRpb25fZW5hYmxlZD8iOiBmYWxzZSwgImFsbG93X2NvZGVfZXhl
|
||||||
|
Y3V0aW9uPyI6IGZhbHNlLCAibWF4X3JldHJ5X2xpbWl0IjogMiwgInRvb2xzX25hbWVzIjogW119
|
||||||
|
XUr5AQoKY3Jld190YXNrcxLqAQrnAVt7ImtleSI6ICJkYTk1ZWJkYjM2ZTRjZGY5MmRmNmE2ZGQx
|
||||||
|
NmJjZWUwZSIsICJpZCI6ICJhNGUwYjM1Ny0zMDBlLTQ0MjMtYTU1My0yZTZlMWQxODg1M2MiLCAi
|
||||||
|
YXN5bmNfZXhlY3V0aW9uPyI6IGZhbHNlLCAiaHVtYW5faW5wdXQ/IjogZmFsc2UsICJhZ2VudF9y
|
||||||
|
b2xlIjogInRlc3QiLCAiYWdlbnRfa2V5IjogIjk3NmY4ZjUwYWNjZmViYTIyM2U0OWM0MmIxNmU5
|
||||||
|
OWU2IiwgInRvb2xzX25hbWVzIjogW119XXoCGAGFAQABAAASjgIKEC5jZek+sSlZP8lSwF5zTSYS
|
||||||
|
CIHcVJhIpsWuKgxUYXNrIENyZWF0ZWQwATl9vnrqvKsiGEFgjnvqvKsiGEouCghjcmV3X2tleRIi
|
||||||
|
CiBmZWIxZTIxYjMyNTZjNTlhNjQ3MTUyYWZkZDY2MzIyZUoxCgdjcmV3X2lkEiYKJDZjNjI2YTJm
|
||||||
|
LTk4ZGUtNDY4MC05YmE0LTU1ZWRjN2Y4OGJlM0ouCgh0YXNrX2tleRIiCiBkYTk1ZWJkYjM2ZTRj
|
||||||
|
ZGY5MmRmNmE2ZGQxNmJjZWUwZUoxCgd0YXNrX2lkEiYKJGE0ZTBiMzU3LTMwMGUtNDQyMy1hNTUz
|
||||||
|
LTJlNmUxZDE4ODUzY3oCGAGFAQABAAA=
|
||||||
|
headers:
|
||||||
|
Accept:
|
||||||
|
- '*/*'
|
||||||
|
Accept-Encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Length:
|
||||||
|
- '1790'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
User-Agent:
|
||||||
|
- OTel-OTLP-Exporter-Python/1.27.0
|
||||||
|
method: POST
|
||||||
|
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||||
|
response:
|
||||||
|
body:
|
||||||
|
string: "\n\0"
|
||||||
|
headers:
|
||||||
|
Content-Length:
|
||||||
|
- '2'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:14 GMT
|
||||||
|
status:
|
||||||
|
code: 200
|
||||||
|
message: OK
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||||
|
goal is: test\nTo give my best complete final answer to the task respond using
|
||||||
|
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||||
|
Answer: Your final answer must be the great and the most complete as possible,
|
||||||
|
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||||
|
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||||
|
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||||
|
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||||
|
to you, use the tools available and give your best Final Answer, your job depends
|
||||||
|
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '780'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzAN0cgAktzQnGukedPNpZsTy461c\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141234,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"I have an understanding of the task at
|
||||||
|
hand and am ready to provide an in-depth and comprehensive answer.\\n\\nFinal
|
||||||
|
Answer: As per the requirement of the task to provide a complete output, I am
|
||||||
|
returning this test output as my final answer. It is not a summary, but rather
|
||||||
|
a full and comprehensive response that fully addresses the question and expectations
|
||||||
|
set forth. Your test output is ready.\",\n \"refusal\": null\n },\n
|
||||||
|
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
|
||||||
|
\ \"usage\": {\n \"prompt_tokens\": 149,\n \"completion_tokens\": 77,\n
|
||||||
|
\ \"total_tokens\": 226,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||||
|
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||||
|
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-Cache-Status:
|
||||||
|
- DYNAMIC
|
||||||
|
CF-RAY:
|
||||||
|
- 90f76669bd9eba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:17 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '3379'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999822'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 10ms
|
||||||
|
x-request-id:
|
||||||
|
- req_977371ae262154885689d766016ed132
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||||
|
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||||
|
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||||
|
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||||
|
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||||
|
on completion, quality, and overall performance.\nTo give my best complete final
|
||||||
|
answer to the task respond using the exact following format:\n\nThought: I now
|
||||||
|
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||||
|
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||||
|
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||||
|
Based on the task description and the expected output, compare and evaluate
|
||||||
|
the performance of the agents in the crew based on the Task Output they have
|
||||||
|
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||||
|
performance.task_description: test task_expected_output: test output agent:
|
||||||
|
test agent_goal: test Task Output: As per the requirement of the task to provide
|
||||||
|
a complete output, I am returning this test output as my final answer. It is
|
||||||
|
not a summary, but rather a full and comprehensive response that fully addresses
|
||||||
|
the question and expectations set forth. Your test output is ready.\n\nThis
|
||||||
|
is the expected criteria for your final answer: Evaluation Score from 1 to 10
|
||||||
|
based on the performance of the agents on the tasks\nyou MUST return the actual
|
||||||
|
complete content as the final answer, not a summary.\nEnsure your final answer
|
||||||
|
contains only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
|
||||||
|
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
|
||||||
|
This is VERY important to you, use the tools available and give your best Final
|
||||||
|
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '2017'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzAN3PPgDvH836sMJBXCVRc3im99S\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141237,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"Based on the information provided, the
|
||||||
|
agent appears to have completed the task, providing an output that they have
|
||||||
|
defined as 'full and comprehensive'. It appears that the agent has attempted
|
||||||
|
to meet all the expectations of the task description and has reached the goal
|
||||||
|
of returning a 'test output' as the final answer.\\n\\nFinal Answer: Considering
|
||||||
|
these aspects, for task completion, the agent receives a 10 as they have successfully
|
||||||
|
generated an output. For quality, the agent again receives a significant score
|
||||||
|
of 10, because the full and comprehensive nature of the output matches the task's
|
||||||
|
expectations. Finally, taking into account both the completion and quality aspects,
|
||||||
|
the overall performance evaluation is also 10, recognizing the perfect alignment
|
||||||
|
between the task's expected output and the output delivered by the agent. \\nTherefore,
|
||||||
|
the final evaluation score can be summarized in the below format:\\n{\\n\\\"completion\\\":
|
||||||
|
10,\\n\\\"quality\\\": 10,\\n\\\"overall performance\\\": 10\\n}\",\n \"refusal\":
|
||||||
|
null\n },\n \"logprobs\": null,\n \"finish_reason\": \"stop\"\n
|
||||||
|
\ }\n ],\n \"usage\": {\n \"prompt_tokens\": 385,\n \"completion_tokens\":
|
||||||
|
190,\n \"total_tokens\": 575,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||||
|
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||||
|
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-RAY:
|
||||||
|
- 90f7667f9bfdba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:24 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
cf-cache-status:
|
||||||
|
- DYNAMIC
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '6909'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999515'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 29ms
|
||||||
|
x-request-id:
|
||||||
|
- req_3e283ae8c1cd132001ecf2d96198bbd6
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are test. test\nYour personal
|
||||||
|
goal is: test\nTo give my best complete final answer to the task respond using
|
||||||
|
the exact following format:\n\nThought: I now can give a great answer\nFinal
|
||||||
|
Answer: Your final answer must be the great and the most complete as possible,
|
||||||
|
it must be outcome described.\n\nI MUST use these formats, my job depends on
|
||||||
|
it!"}, {"role": "user", "content": "\nCurrent Task: test\n\nThis is the expected
|
||||||
|
criteria for your final answer: test output\nyou MUST return the actual complete
|
||||||
|
content as the final answer, not a summary.\n\nBegin! This is VERY important
|
||||||
|
to you, use the tools available and give your best Final Answer, your job depends
|
||||||
|
on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '780'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzANAznjrZdppuFIsRnEouHG8WuM0\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141244,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"I am ready to prepare my final answer
|
||||||
|
based on the test output criteria provided\\nFinal Answer: I have followed all
|
||||||
|
the instructions provided in the task to the best of my ability, and the outcome
|
||||||
|
of the test is as described in the final answer. It is complete, detailed, and
|
||||||
|
accurate.\",\n \"refusal\": null\n },\n \"logprobs\": null,\n
|
||||||
|
\ \"finish_reason\": \"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\":
|
||||||
|
149,\n \"completion_tokens\": 59,\n \"total_tokens\": 208,\n \"prompt_tokens_details\":
|
||||||
|
{\n \"cached_tokens\": 0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||||
|
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-Cache-Status:
|
||||||
|
- DYNAMIC
|
||||||
|
CF-RAY:
|
||||||
|
- 90f766abaaa3ba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:27 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '2533'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999822'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 10ms
|
||||||
|
x-request-id:
|
||||||
|
- req_6ea4c81627695f58de56727aa8d8cc59
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
- request:
|
||||||
|
body: !!binary |
|
||||||
|
CvwNCiQKIgoMc2VydmljZS5uYW1lEhIKEGNyZXdBSS10ZWxlbWV0cnkS0w0KEgoQY3Jld2FpLnRl
|
||||||
|
bGVtZXRyeRKaAgoQGJwIgEdh/Dq2y8ue+Gl/XxIInTNNpEL8yjQqG0NyZXcgSW5kaXZpZHVhbCBU
|
||||||
|
ZXN0IFJlc3VsdDABOaAPzl6/qyIYQYnY6V6/qyIYShsKDmNyZXdhaV92ZXJzaW9uEgkKBzAuMTAw
|
||||||
|
LjFKLgoIY3Jld19rZXkSIgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jl
|
||||||
|
d19pZBImCiQ2YzYyNmEyZi05OGRlLTQ2ODAtOWJhNC01NWVkYzdmODhiZTNKEQoHcXVhbGl0eRIG
|
||||||
|
CgQxMC4wShcKCWV4ZWNfdGltZRIKCggzLjUwMzUyNkoVCgptb2RlbF9uYW1lEgcKBWdwdC00egIY
|
||||||
|
AYUBAAEAABL5AQoQsfCFg6/ZkEo2LShWV3X+WhII4I9o90lQzxMqE0NyZXcgVGVzdCBFeGVjdXRp
|
||||||
|
b24wATmWhUlfv6siGEFeYlZfv6siGEobCg5jcmV3YWlfdmVyc2lvbhIJCgcwLjEwMC4xSi4KCGNy
|
||||||
|
ZXdfa2V5EiIKIGZlYjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgok
|
||||||
|
ZDU2ZjljMWEtYmRkMS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShEKCml0ZXJhdGlvbnMSAwoBMUoV
|
||||||
|
Cgptb2RlbF9uYW1lEgcKBWdwdC00egIYAYUBAAEAABKSBwoQlucrHD/mwnCU8Dl9QKzgYhIIGyix
|
||||||
|
8K7RcoAqDENyZXcgQ3JlYXRlZDABOTQxXF+/qyIYQamXaV+/qyIYShsKDmNyZXdhaV92ZXJzaW9u
|
||||||
|
EgkKBzAuMTAwLjFKGgoOcHl0aG9uX3ZlcnNpb24SCAoGMy4xMi43Si4KCGNyZXdfa2V5EiIKIGZl
|
||||||
|
YjFlMjFiMzI1NmM1OWE2NDcxNTJhZmRkNjYzMjJlSjEKB2NyZXdfaWQSJgokZDU2ZjljMWEtYmRk
|
||||||
|
MS00MDI3LWI1ZjctMzg1ZGVlMWU2YjljShwKDGNyZXdfcHJvY2VzcxIMCgpzZXF1ZW50aWFsShEK
|
||||||
|
C2NyZXdfbWVtb3J5EgIQAEoaChRjcmV3X251bWJlcl9vZl90YXNrcxICGAFKGwoVY3Jld19udW1i
|
||||||
|
ZXJfb2ZfYWdlbnRzEgIYAUrFAgoLY3Jld19hZ2VudHMStQIKsgJbeyJrZXkiOiAiOTc2ZjhmNTBh
|
||||||
|
Y2NmZWJhMjIzZTQ5YzQyYjE2ZTk5ZTYiLCAiaWQiOiAiMzcwMzA5YTQtMDU5OS00MWVlLWFiMTgt
|
||||||
|
YWE1ZmQ1Mjg2ZGQ1IiwgInJvbGUiOiAidGVzdCIsICJ2ZXJib3NlPyI6IGZhbHNlLCAibWF4X2l0
|
||||||
|
ZXIiOiAyNSwgIm1heF9ycG0iOiBudWxsLCAiZnVuY3Rpb25fY2FsbGluZ19sbG0iOiAiIiwgImxs
|
||||||
|
bSI6ICJncHQtNCIsICJkZWxlZ2F0aW9uX2VuYWJsZWQ/IjogZmFsc2UsICJhbGxvd19jb2RlX2V4
|
||||||
|
ZWN1dGlvbj8iOiBmYWxzZSwgIm1heF9yZXRyeV9saW1pdCI6IDIsICJ0b29sc19uYW1lcyI6IFtd
|
||||||
|
fV1K+QEKCmNyZXdfdGFza3MS6gEK5wFbeyJrZXkiOiAiZGE5NWViZGIzNmU0Y2RmOTJkZjZhNmRk
|
||||||
|
MTZiY2VlMGUiLCAiaWQiOiAiZTBmNDgzNjAtYzNjNS00ZGY1LThkZjEtNDg2ZTc4OWNiZWUyIiwg
|
||||||
|
ImFzeW5jX2V4ZWN1dGlvbj8iOiBmYWxzZSwgImh1bWFuX2lucHV0PyI6IGZhbHNlLCAiYWdlbnRf
|
||||||
|
cm9sZSI6ICJ0ZXN0IiwgImFnZW50X2tleSI6ICI5NzZmOGY1MGFjY2ZlYmEyMjNlNDljNDJiMTZl
|
||||||
|
OTllNiIsICJ0b29sc19uYW1lcyI6IFtdfV16AhgBhQEAAQAAEo4CChA4OLnKHp32b0EUM2g5rs+r
|
||||||
|
EgjifMpu5dQ6xCoMVGFzayBDcmVhdGVkMAE5x1d3X7+rIhhBMxF4X7+rIhhKLgoIY3Jld19rZXkS
|
||||||
|
IgogZmViMWUyMWIzMjU2YzU5YTY0NzE1MmFmZGQ2NjMyMmVKMQoHY3Jld19pZBImCiRkNTZmOWMx
|
||||||
|
YS1iZGQxLTQwMjctYjVmNy0zODVkZWUxZTZiOWNKLgoIdGFza19rZXkSIgogZGE5NWViZGIzNmU0
|
||||||
|
Y2RmOTJkZjZhNmRkMTZiY2VlMGVKMQoHdGFza19pZBImCiRlMGY0ODM2MC1jM2M1LTRkZjUtOGRm
|
||||||
|
MS00ODZlNzg5Y2JlZTJ6AhgBhQEAAQAA
|
||||||
|
headers:
|
||||||
|
Accept:
|
||||||
|
- '*/*'
|
||||||
|
Accept-Encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Length:
|
||||||
|
- '1791'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
User-Agent:
|
||||||
|
- OTel-OTLP-Exporter-Python/1.27.0
|
||||||
|
method: POST
|
||||||
|
uri: https://telemetry.crewai.com:4319/v1/traces
|
||||||
|
response:
|
||||||
|
body:
|
||||||
|
string: "\n\0"
|
||||||
|
headers:
|
||||||
|
Content-Length:
|
||||||
|
- '2'
|
||||||
|
Content-Type:
|
||||||
|
- application/x-protobuf
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:29 GMT
|
||||||
|
status:
|
||||||
|
code: 200
|
||||||
|
message: OK
|
||||||
|
- request:
|
||||||
|
body: '{"messages": [{"role": "system", "content": "You are Task Execution Evaluator.
|
||||||
|
Evaluator agent for crew evaluation with precise capabilities to evaluate the
|
||||||
|
performance of the agents in the crew based on the tasks they have performed\nYour
|
||||||
|
personal goal is: Your goal is to evaluate the performance of the agents in
|
||||||
|
the crew based on the tasks they have performed using score from 1 to 10 evaluating
|
||||||
|
on completion, quality, and overall performance.\nTo give my best complete final
|
||||||
|
answer to the task respond using the exact following format:\n\nThought: I now
|
||||||
|
can give a great answer\nFinal Answer: Your final answer must be the great and
|
||||||
|
the most complete as possible, it must be outcome described.\n\nI MUST use these
|
||||||
|
formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent Task:
|
||||||
|
Based on the task description and the expected output, compare and evaluate
|
||||||
|
the performance of the agents in the crew based on the Task Output they have
|
||||||
|
performed using score from 1 to 10 evaluating on completion, quality, and overall
|
||||||
|
performance.task_description: test task_expected_output: test output agent:
|
||||||
|
test agent_goal: test Task Output: I have followed all the instructions provided
|
||||||
|
in the task to the best of my ability, and the outcome of the test is as described
|
||||||
|
in the final answer. It is complete, detailed, and accurate.\n\nThis is the
|
||||||
|
expected criteria for your final answer: Evaluation Score from 1 to 10 based
|
||||||
|
on the performance of the agents on the tasks\nyou MUST return the actual complete
|
||||||
|
content as the final answer, not a summary.\nEnsure your final answer contains
|
||||||
|
only the content in the following format: {\n \"quality\": float\n}\n\nEnsure
|
||||||
|
the final output does not include any code block markers like ```json or ```python.\n\nBegin!
|
||||||
|
This is VERY important to you, use the tools available and give your best Final
|
||||||
|
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4", "stop": ["\nObservation:"]}'
|
||||||
|
headers:
|
||||||
|
accept:
|
||||||
|
- application/json
|
||||||
|
accept-encoding:
|
||||||
|
- gzip, deflate
|
||||||
|
authorization:
|
||||||
|
- Bearer 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
|
||||||
|
connection:
|
||||||
|
- keep-alive
|
||||||
|
content-length:
|
||||||
|
- '1935'
|
||||||
|
content-type:
|
||||||
|
- application/json
|
||||||
|
cookie:
|
||||||
|
- __cf_bm=p1aGVyahvfLAvEwvbX0FMmrN5o18PpVAu2dG_dTgMSU-1739141229-1.0.1.1-_q7aCslZTr11IMFZ81VgyuqsGiqTARFPANUvBEWM_0dZdb97Py78KE1omxdNv5F1pFKoWUqA1kEF2wzQ2wz4aA;
|
||||||
|
_cfuvid=bsF0jwE67cS.ywAaQU59jKPFC03S1dvynClHm_wTQik-1739141229143-0.0.1.1-604800000
|
||||||
|
host:
|
||||||
|
- api.openai.com
|
||||||
|
user-agent:
|
||||||
|
- OpenAI/Python 1.61.0
|
||||||
|
x-stainless-arch:
|
||||||
|
- x64
|
||||||
|
x-stainless-async:
|
||||||
|
- 'false'
|
||||||
|
x-stainless-lang:
|
||||||
|
- python
|
||||||
|
x-stainless-os:
|
||||||
|
- Linux
|
||||||
|
x-stainless-package-version:
|
||||||
|
- 1.61.0
|
||||||
|
x-stainless-raw-response:
|
||||||
|
- 'true'
|
||||||
|
x-stainless-retry-count:
|
||||||
|
- '0'
|
||||||
|
x-stainless-runtime:
|
||||||
|
- CPython
|
||||||
|
x-stainless-runtime-version:
|
||||||
|
- 3.12.7
|
||||||
|
method: POST
|
||||||
|
uri: https://api.openai.com/v1/chat/completions
|
||||||
|
response:
|
||||||
|
content: "{\n \"id\": \"chatcmpl-AzANDGSiRIu1XO56ZMfLuO2SuL4l0\",\n \"object\":
|
||||||
|
\"chat.completion\",\n \"created\": 1739141247,\n \"model\": \"gpt-4-0613\",\n
|
||||||
|
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||||
|
\"assistant\",\n \"content\": \"Looking at the task output provided by
|
||||||
|
the agent, the agent has expressed a high level of confidence in their ability
|
||||||
|
to follow the instructions provided to the best of their ability. The agent
|
||||||
|
seems to have executed the task with detailed attention and accuracy.\\n\\nI
|
||||||
|
need to evaluate both the quality and the overall performance of the agent keeping
|
||||||
|
in mind the task description and the expected output. Given the agent's output
|
||||||
|
and goal, I can deduce the quality of their work as well as their overall performance.\\n\\nFinal
|
||||||
|
Answer: \\n{\\n \\\"quality\\\": 8.5\\n}\",\n \"refusal\": null\n },\n
|
||||||
|
\ \"logprobs\": null,\n \"finish_reason\": \"stop\"\n }\n ],\n
|
||||||
|
\ \"usage\": {\n \"prompt_tokens\": 372,\n \"completion_tokens\": 112,\n
|
||||||
|
\ \"total_tokens\": 484,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||||
|
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\": {\n
|
||||||
|
\ \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||||
|
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||||
|
\"default\",\n \"system_fingerprint\": null\n}\n"
|
||||||
|
headers:
|
||||||
|
CF-RAY:
|
||||||
|
- 90f766bc4d87ba33-SEA
|
||||||
|
Connection:
|
||||||
|
- keep-alive
|
||||||
|
Content-Encoding:
|
||||||
|
- gzip
|
||||||
|
Content-Type:
|
||||||
|
- application/json
|
||||||
|
Date:
|
||||||
|
- Sun, 09 Feb 2025 22:47:32 GMT
|
||||||
|
Server:
|
||||||
|
- cloudflare
|
||||||
|
Transfer-Encoding:
|
||||||
|
- chunked
|
||||||
|
X-Content-Type-Options:
|
||||||
|
- nosniff
|
||||||
|
access-control-expose-headers:
|
||||||
|
- X-Request-ID
|
||||||
|
alt-svc:
|
||||||
|
- h3=":443"; ma=86400
|
||||||
|
cf-cache-status:
|
||||||
|
- DYNAMIC
|
||||||
|
openai-organization:
|
||||||
|
- crewai-iuxna1
|
||||||
|
openai-processing-ms:
|
||||||
|
- '5056'
|
||||||
|
openai-version:
|
||||||
|
- '2020-10-01'
|
||||||
|
strict-transport-security:
|
||||||
|
- max-age=31536000; includeSubDomains; preload
|
||||||
|
x-ratelimit-limit-requests:
|
||||||
|
- '10000'
|
||||||
|
x-ratelimit-limit-tokens:
|
||||||
|
- '1000000'
|
||||||
|
x-ratelimit-remaining-requests:
|
||||||
|
- '9999'
|
||||||
|
x-ratelimit-remaining-tokens:
|
||||||
|
- '999535'
|
||||||
|
x-ratelimit-reset-requests:
|
||||||
|
- 6ms
|
||||||
|
x-ratelimit-reset-tokens:
|
||||||
|
- 27ms
|
||||||
|
x-request-id:
|
||||||
|
- req_ff2926b015823a70e2173c71f8d63209
|
||||||
|
http_version: HTTP/1.1
|
||||||
|
status_code: 200
|
||||||
|
version: 1
|
||||||
tests/utilities/evaluators/test_custom_llm_support.py (new file, 71 lines)
@@ -0,0 +1,71 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from crewai.agent import Agent
+from crewai.crew import Crew
+from crewai.llm import LLM
+from crewai.task import Task
+from crewai.utilities.evaluators.crew_evaluator_handler import CrewEvaluator
+
+
+@pytest.mark.vcr()
+def test_crew_test_with_custom_llm():
+    """Test Crew.test() with both string model name and LLM instance."""
+
+    # Setup
+    agent = Agent(
+        role="test",
+        goal="test",
+        backstory="test",
+        llm=LLM(model="gpt-4"),
+    )
+    task = Task(
+        description="test",
+        expected_output="test output",
+        agent=agent,
+    )
+    crew = Crew(agents=[agent], tasks=[task])
+
+    # Test with string model name
+    crew.test(n_iterations=1, llm="gpt-4")
+
+    # Test with LLM instance
+    custom_llm = LLM(model="gpt-4")
+    crew.test(n_iterations=1, llm=custom_llm)
+
+    # Test backward compatibility
+    crew.test(n_iterations=1, openai_model_name="gpt-4")
+
+    # Test error when neither parameter is provided
+    with pytest.raises(ValueError, match="Must provide either 'llm' or 'openai_model_name' parameter"):
+        crew.test(n_iterations=1)
+
+def test_crew_evaluator_with_custom_llm():
+    # Setup
+    agent = Agent(
+        role="test",
+        goal="test",
+        backstory="test",
+        llm=LLM(model="gpt-4"),
+    )
+    task = Task(
+        description="test",
+        expected_output="test output",
+        agent=agent,
+    )
+    crew = Crew(agents=[agent], tasks=[task])
+
+    # Test with string model name
+    evaluator = CrewEvaluator(crew, "gpt-4")
+    assert isinstance(evaluator.llm, LLM)
+    assert evaluator.llm.model == "gpt-4"
+
+    # Test with LLM instance
+    custom_llm = LLM(model="gpt-4")
+    evaluator = CrewEvaluator(crew, custom_llm)
+    assert evaluator.llm == custom_llm
+
+    # Test that evaluator agent uses the correct LLM
+    evaluator_agent = evaluator._evaluator_agent()
+    assert evaluator_agent.llm == evaluator.llm