refactor: simplify TaskGuardrail to use LLM for validation, no code generation

Lucas Gomide
2025-04-29 09:50:03 -03:00
parent e3ab80f517
commit e940ff3cbd
14 changed files with 3883 additions and 4217 deletions

View File

@@ -324,11 +324,7 @@ blog_task = Task(
### TaskGuardrail
The `TaskGuardrail` class provides a sophisticated way to generate and execute validation code for task outputs. Here's how it works:
#### Code Execution
The generated code can be executed in two ways: Docker container (Default, Recommended) or current environment (unsafe mode)
The `TaskGuardrail` class offers a robust mechanism for validating task outputs
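A minimal usage sketch, based on the tests added in this commit (the import paths and the `gpt-4o-mini` model name are taken from those tests and may differ in your setup):

```python
from crewai.llm import LLM
from crewai.task import Task
from crewai.tasks.task_guardrail import TaskGuardrail

# A guardrail can be declared as a plain string on the Task...
task = Task(
    description="Gather information about available books on the First World War",
    expected_output="A list of available books on the First World War",
    guardrail="Ensure the authors are from Italy",
)

# ...or assigned as an explicit TaskGuardrail carrying its own LLM.
task.guardrail = TaskGuardrail(
    description="Ensure the authors are from Italy",
    llm=LLM(model="gpt-4o-mini"),
)
```

When the task runs, the guardrail is called with the task's `TaskOutput` and returns a `(bool, Any)` tuple: the raw output on success, or the LLM's feedback on failure (failed attempts are retried, as reflected by `retry_count` in the guardrail events).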
### Error Handling Best Practices
@@ -791,7 +787,7 @@ task = Task(
)
```
##### Using YAML
#### Using YAML
```yaml
research_task:

View File

@@ -1,39 +1,40 @@
from typing import Any, Tuple
from typing import Any, Optional, Tuple
from pydantic import BaseModel, Field
from crewai.agent import Agent, LiteAgentOutput
from crewai.llm import LLM
from crewai.task import Task
from crewai.tasks.task_output import TaskOutput
from crewai.utilities.printer import Printer
class TaskGuardrailResult(BaseModel):
valid: bool = Field(
description="Whether the task output complies with the guardrail"
)
feedback: str | None = Field(
description="A feedback about the task output if it is not valid",
default=None,
)
class TaskGuardrail:
"""A task that validates the output of another task using generated Python code.
This class generates and executes Python code to validate task outputs based on
specified criteria. It uses an LLM to generate the validation code and provides
safety guardrails for code execution.
The code is executed in a Docker container if available, otherwise it is executed in a sandboxed environment.
If unsafe mode is enabled, the code is executed in the current environment.
This class is used to validate the output of a Task based on specified criteria.
It uses an LLM to validate the output and provides feedback if the output is not valid.
Args:
description (str): The description of the validation criteria.
task (Task, optional): The task whose output needs validation.
llm (LLM, optional): The language model to use for code generation.
additional_instructions (str, optional): Additional instructions for the guardrail task.
unsafe_mode (bool, optional): Whether to run the code in unsafe mode.
Raises:
ValueError: If no valid LLM is provided.
"""
generated_code: str = ""
def __init__(
self,
description: str,
task: Task | None = None,
llm: LLM | None = None,
additional_instructions: str = "",
unsafe_mode: bool = False,
):
self.description = description
@@ -47,84 +48,36 @@ class TaskGuardrail:
)
self.llm: LLM | None = llm or fallback_llm
self.additional_instructions = additional_instructions
self.unsafe_mode = unsafe_mode
def _validate_output(self, task_output: TaskOutput) -> LiteAgentOutput:
agent = Agent(
role="Guardrail Agent",
goal="Validate the output of the task",
backstory="You are a expert at validating the output of a task. By providing effective feedback if the output is not valid.",
llm=self.llm,
)
@property
def system_instructions(self) -> str:
"""System instructions for the LLM code generation.
query = f"""
Ensure the following task result complies with the given guardrail.
Returns:
str: Complete system instructions including security constraints.
Task result:
{task_output.raw}
Guardrail:
{self.description}
Your task:
- Confirm if the Task result complies with the guardrail.
- If not, provide clear feedback explaining what is wrong (e.g., by how much it violates the rule, or what specific part fails).
- Focus only on identifying issues — do not propose corrections.
- If the Task result complies with the guardrail, saying that is valid
"""
security_instructions = (
"- DO NOT wrap the output in markdown or use triple backticks. Return only raw Python code."
"- DO NOT use `exec`, `eval`, `compile`, `open`, `os`, `subprocess`, `socket`, `shutil`, or any other system-level modules.\n"
"- Your code must not perform any file I/O, shell access, or dynamic code execution."
)
return (
"You are a expert Python developer"
"You **must strictly** follow the task description, use the provided raw output as the input in your code. "
"Your code must:\n"
"- Return results with: print((True, data)) on success, or print((False, 'very detailed error message')) on failure. Make sure the final output is being assined to 'result' variable.\n"
"- Use the literal string of the task output (already included in your input) if needed.\n"
"- Generate the code **following strictly** the task description.\n"
"- Be valid Python 3 — executable as-is.\n"
f"{security_instructions}\n"
"Additional instructions (do not override the previous instructions):\n"
f"{self.additional_instructions}"
)
def user_instructions(self, task_output: TaskOutput) -> str:
"""Generates user instructions for the LLM code generation.
result = agent.kickoff(query, response_format=TaskGuardrailResult)
Args:
task_output (TaskOutput): The output to be validated.
Returns:
str: Instructions for generating validation code.
"""
return (
"Based on the task description below, generate Python 3 code that validates the task output. \n"
"Task description:\n"
f"{self.description}\n"
"Here is the raw output from the task: \n"
f"'{task_output.raw}' \n"
"Use this exact string literal inside your generated code (do not reference variables like task_output.raw)."
"Now generate Python code that follows the instructions above."
)
def generate_code(self, task_output: TaskOutput) -> str:
"""Generates Python code for validating the task output.
Args:
task_output (TaskOutput): The output to be validated.
"""
if self.llm is None:
raise ValueError("Provide a valid LLM to the TaskGuardrail")
response = self.llm.call(
messages=[
{
"role": "system",
"content": self.system_instructions,
},
{
"role": "user",
"content": self.user_instructions(task_output=task_output),
},
]
)
printer = Printer()
printer.print(
content=f"The following code was generated for the guardrail task:\n{response}\n",
color="cyan",
)
return response
return result
def __call__(self, task_output: TaskOutput) -> Tuple[bool, Any]:
"""Executes the validation code on the task output.
"""Validates the output of a task based on specified criteria.
Args:
task_output (TaskOutput): The output to be validated.
@@ -134,28 +87,16 @@ class TaskGuardrail:
- bool: True if validation passed, False otherwise
- Any: The validation result or error message
"""
import ast
from crewai_tools import CodeInterpreterTool
try:
result = self._validate_output(task_output)
assert isinstance(
result.pydantic, TaskGuardrailResult
), "The guardrail result is not a valid pydantic model"
self.generated_code = self.generate_code(task_output)
result = CodeInterpreterTool(
code=self.generated_code, unsafe_mode=self.unsafe_mode
).run()
error_messages = [
"Something went wrong while running the code",
"No result variable found", # when running in unsafe mode, the final output should be stored in the result variable
]
if any(msg in result for msg in error_messages):
return False, result
if isinstance(result, str):
try:
result = ast.literal_eval(result)
except Exception as e:
return False, f"Error parsing result: {str(e)}"
return result
if result.pydantic.valid:
return True, task_output.raw
else:
return False, result.pydantic.feedback
except Exception as e:
return False, f"Error while validating the task output: {str(e)}"

View File

@@ -7,8 +7,8 @@ class TaskGuardrailStartedEvent(BaseEvent):
"""Event emitted when a guardrail task starts
Attributes:
messages: Content can be either a string or a list of dictionaries that support
multimodal content (text, images, etc.)
guardrail: The guardrail callable or TaskGuardrail instance
retry_count: The number of times the guardrail has been retried
"""
type: str = "task_guardrail_started"
@@ -23,8 +23,7 @@ class TaskGuardrailStartedEvent(BaseEvent):
super().__init__(**data)
if isinstance(self.guardrail, TaskGuardrail):
assert self.guardrail.generated_code is not None
self.guardrail = self.guardrail.generated_code.strip()
self.guardrail = self.guardrail.description.strip()
elif isinstance(self.guardrail, Callable):
self.guardrail = getsource(self.guardrail).strip()

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,522 +0,0 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. Test Backstory\nYour
personal goal is: Test Goal\nTo give my best complete final answer to the task
respond using the exact following format:\n\nThought: I now can give a great
answer\nFinal Answer: Your final answer must be the great and the most complete
as possible, it must be outcome described.\n\nI MUST use these formats, my job
depends on it!"}, {"role": "user", "content": "\nCurrent Task: Test task\n\nThis
is the expected criteria for your final answer: Output\nyou MUST return the
actual complete content as the final answer, not a summary.\n\nBegin! This is
VERY important to you, use the tools available and give your best Final Answer,
your job depends on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, zstd
connection:
- keep-alive
content-length:
- '807'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.12
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFTBbuQ2DL3PVxA6e4KJm+zszq0ttkBORYuiPbSLASPRNndlSRHpmcwu
8u+FZCcz2ebQi2H78ZGPj6K+rQAMO7MDYwdUOya//ulX/a3tf/+Z3f3jX/xxdGGDfz60p7uvx+mj
aQoj3n8mq8+sKxvH5Ek5hhm2mVCpZL3e3ty22/bdpq3AGB35QuuTrm/ieuTA63bT3qw32/X1+4U9
RLYkZgd/rwAAvtVn0RkcPZodbJrnPyOJYE9m9xIEYHL05Y9BERbFoKY5gzYGpVCl30GIR7AYoOcD
AUJfZAMGOVIG+Cf8wgE9/Fi/d/DHQKAoXwAVBgwOMj1MnEkAoRiQaaAgNVNw4EiRPZUoSTEIgQ6o
MBKpgA4E9JjIKjmwmZUyI3QxQ5w0TXoFdwG6yXfsPYcedGB5LjdS0AZYgQVIhIIyetAINh4oA3oP
mTwdMCiglBpSBaUcD+wIllmVl+pEAxRkynMZ1KqNQxfziGWgpYz1hLkpBMtCTU2H1k4ZlfwJ0Llc
lMx9BSInELv6Ufy6Ks5l6mKmBu7gyN6XNj0HWgi2TDGfqrQYKKg0gK6QSl8YTtBP7Kgw5LkPN8uo
2umsvKsjm10s0nWIOU79UIOP5P06ZSqmlQRH1iFOCnFk1WIAjynmcmKW8QnEDDKNI2b+Ojv0ypzS
GgtgSjmiHebe+gkzBqVyogbuh/XDhJ719N1JEFSWjhfXFkPccixqdrm6PLiZukmwLE+YvL8AMIS4
EMrKfFqQp5cl8bFPOd7Ld1TTcWAZ9plQYigLIRqTqejTCuBTXcbp1X6ZlOOYdK/xC9Vy17c/zPnM
+Q64QK8/LKhGRX8G2m3bvJFwv9h+sc/Goh3Inann5cfJcbwAVhdt/1fOW7nn1jn0/yf9GbCWkpLb
p0yO7euWz2GZPtcFfzvsxeYq2AjlA1vaK1Muo3DU4eTnm8vISZTGfcehp5wyz9dXl/bd1t2+w/fY
WrN6Wv0LAAD//wMAAfXtOswFAAA=
headers:
CF-RAY:
- 934022059c2c0110-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Mon, 21 Apr 2025 21:56:45 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=9.xrptc4Zx5NtXl.2MzDRi3N1u8YVt6tNHmSwFyx94A-1745272605-1.0.1.1-v3SFlNedUJ2GFxpW0cts207UyNYzhzfJLBW4o_D8D1t15aRi1Bfh8TEkoVN8JQQdIgDqze4xz4.o3yDgegWJrUGzKroLzXP0VeCDkmLibTc;
path=/; expires=Mon, 21-Apr-25 22:26:45 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=e_MIZNumotQmvbprZ3okpLcxs_RLI2Yb_jiAh0fYHT8-1745272605039-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '2377'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999832'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_f39581c88a83855cf77c06098b787948
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. Test Backstory\nYour
personal goal is: Test Goal\nTo give my best complete final answer to the task
respond using the exact following format:\n\nThought: I now can give a great
answer\nFinal Answer: Your final answer must be the great and the most complete
as possible, it must be outcome described.\n\nI MUST use these formats, my job
depends on it!"}, {"role": "user", "content": "\nCurrent Task: Test task\n\nThis
is the expected criteria for your final answer: Output\nyou MUST return the
actual complete content as the final answer, not a summary.\n\nThis is the context
you''re working with:\n### Previous attempt failed validation: bad result\n\n\n###
Previous result:\nThe task at hand requires a comprehensive and detailed response
that meets the expected criteria for output. In fulfilling this requirement,
it is essential to cover all relevant aspects and provide complete content,
ensuring that the information is clear, concise, and accurately addresses the
needs of the task. Therefore, I will outline the necessary components, adhere
to any guidelines provided, and ensure that the final output is thorough and
well-presented, without omitting important details or summarizing the information.
This approach will guarantee a high-quality response that satisfies the outlined
expectations.\n\n\nTry again, making sure to address the validation error.\n\nBegin!
This is VERY important to you, use the tools available and give your best Final
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop":
["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, zstd
connection:
- keep-alive
content-length:
- '1619'
content-type:
- application/json
cookie:
- __cf_bm=9.xrptc4Zx5NtXl.2MzDRi3N1u8YVt6tNHmSwFyx94A-1745272605-1.0.1.1-v3SFlNedUJ2GFxpW0cts207UyNYzhzfJLBW4o_D8D1t15aRi1Bfh8TEkoVN8JQQdIgDqze4xz4.o3yDgegWJrUGzKroLzXP0VeCDkmLibTc;
_cfuvid=e_MIZNumotQmvbprZ3okpLcxs_RLI2Yb_jiAh0fYHT8-1745272605039-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.12
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xWTW8cNwy9+1cQczTWi83GjhPf3CQFjKJI27hoizowuBJnhrZGUkVq15sg/72Q
ZvbDaQ69GOshRb1HPpL6cgLQsG2uoDE9qhmiO/vhg/56YR5uPy4uzfvFm5//ePfnb8vPf70y4Z+f
PjSzciKsHsjo7tTchCE6Ug5+NJtEqFSivrg8v1heLl8tLqphCJZcOdZFPTsPZwN7Plsuludni8uz
F6+n031gQ9Jcwd8nAABf6t+C01t6aq5gMdt9GUgEO2qu9k4ATQqufGlQhEXRazM7GE3wSr5CvwEf
NmDQQ8drAoSuwAb0sqEEcOd/ZI8Oruv/V3DbEyjKI6BCj96CJ0MirKgkgKB9SCF3PRSbJUV2ZCGR
xOCFQAOQl5wIarYYvSHYsPagPQE9RTJKFkxipcQIbUgQssas83p1TDxg2sKY+gKYpQSNKazZEvig
8JBFAUHyUF1XWWvwqTzlRyUPiVypTzmuE6sZoLWp8PEdoHPFh9boFVAKNKm00PaUiocGQL/dXW6h
y2zJsSeZw52/87cB0PRMawLtWWb1njY4FzbleEEUPHkV2LBzsCJgb1wuodhX513irmq8F3M4Pb3x
moLNpgjt9PQK3jrC5LZgqWVP9VTMKQYhCO2eWQUesrqdj3DnuWVTKxBaKMDXIy2syBL15KVqotZ+
fueh6GFZQLzbVfbao9sKS0Hy/gmHEp7Q9FPC9hBCZLMXh9vOCtOQYkio5coWjcoMLCrOKtR94g0K
gWi2TAIhAT1hqWOtuuQYQ9JaKSkKHEoy5/C+AvCdIxiKGFZVWi6ksdo7sSC02TnI3lIqLWILkgmw
5KowGFC1cr/zLwvxtw4T67Zi/CWRYZnK8LsQmFKKaorVRODQdxm7kvLKPExiPG4IE/yatsC+DWnA
Ulagth3l7bZzuF4HtoDDiru8u3pqIu1Rx3THwF5LM2BSNnlUNntA2OB29GMBQtmOV071tZXZeWUW
ivZ2dD7W7uHPo1geaQst1wSNLcBeuOtVwCbceGhTGKrjvuPHfI9sjjUwg0SVqakN1BPwUGq4k2H9
cpSJmEjIFzL1WpWdMgwdN26lcVFoXFvL5SQ6+I0k5GSoivOmBYzRscGVo9mu0SBRS4m8GcXl2D9W
ZbU5aU8JEmFVxYB1HjkZU1nG5V5FhwvrZHnSEmlKELCfUI5CErAsJovQmPnbURZ1PIBoykZzUSnG
mAKafgY342zoMib0SkW1B/EULAORSr1iam87zdGaQIFNz44mtGN3Jy6Bd6Ua+60PjkXZwJpp83xw
jIN+Djc6QimNXSeHc1vAKs6YaM0hC0TWFp0TkFxnAKyxy0dtMGrn21FcB2Oi1XbU9YhSUFnKXAip
ig/dbhMcb7JEbRYs29Rn544M6H2YUlB26KfJ8nW/NV3oYgor+eZo07Jn6e8ToQRfNqRoiE21fj0B
+FS3c362cJuYwhD1XsMj1euWb5ZjvObwKDhYXy5eT1YNiu5geLWYlvrzgPdjqeRowTcGTU/2cPTw
GsBsORwZTo5o/xfO92KP1Nl3/yf8wWAMRSV7HxNZNs8pH9wSPdQd/323fZor4EYordnQvTKlUgpL
LWY3PmUa2YrScN+y7yjFxON7po337aW9eIWvcWmak68n/wIAAP//AwA0/RJL3QkAAA==
headers:
CF-RAY:
- 93402216690b0110-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Mon, 21 Apr 2025 21:56:49 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '4451'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999631'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_8383a16d5f5b7f53d659bebf481ba936
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. Test Backstory\nYour
personal goal is: Test Goal\nTo give my best complete final answer to the task
respond using the exact following format:\n\nThought: I now can give a great
answer\nFinal Answer: Your final answer must be the great and the most complete
as possible, it must be outcome described.\n\nI MUST use these formats, my job
depends on it!"}, {"role": "user", "content": "\nCurrent Task: Test task\n\nThis
is the expected criteria for your final answer: Output\nyou MUST return the
actual complete content as the final answer, not a summary.\n\nBegin! This is
VERY important to you, use the tools available and give your best Final Answer,
your job depends on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, zstd
connection:
- keep-alive
content-length:
- '807'
content-type:
- application/json
cookie:
- __cf_bm=9.xrptc4Zx5NtXl.2MzDRi3N1u8YVt6tNHmSwFyx94A-1745272605-1.0.1.1-v3SFlNedUJ2GFxpW0cts207UyNYzhzfJLBW4o_D8D1t15aRi1Bfh8TEkoVN8JQQdIgDqze4xz4.o3yDgegWJrUGzKroLzXP0VeCDkmLibTc;
_cfuvid=e_MIZNumotQmvbprZ3okpLcxs_RLI2Yb_jiAh0fYHT8-1745272605039-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.12
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xU227bRhB911cM+CwJtnyN3xygBoI+xG1doGgTCKPdITnRcpbdGUoRAgP9jf5e
v6TYpWTJjR8KEATIs3PmdvZ8mwBU7Ks7qFyL5ro+zN5/tJ/urz7cPjziw68ff3y6vXz8vb1/fLyw
n29/q6Y5Iq6+kLND1NzFrg9kHGWEXSI0yqznN5dXi5vF9flZAbroKeSwprfZZZx1LDxbnC0uZ2c3
s/PbfXQb2ZFWd/DHBADgW3nnOsXT1+oOClf505EqNlTdvRwCqFIM+U+FqqyGYtX0CLooRlJK/wAS
t+BQoOENAUKTywYU3VIC+CQPLBjgvnzfwROpgaGuFTAReFJuhDxYBNpgGNAItCfHNTvQNYegEBOs
JW4D+YaABRBy/hRDIA8kG05ROhKbw1NL4BIbJUaoYzpwsjSgg3OkCiwuDJ7ABUxsO4g1sKilweXR
6xQwcFP4YMvWQiBMkgnGdfGG8hnxYC1BoA2FTEHSYEMlylraAQV2bFCn2EGPydhxj2I6h1/23WEI
uykg2GEioG0cgn8pEMHl3CWXi+JYCfoUuz6nQIM4WGAhhW3+YgX62pMz8mPWXN7rzPfec+5xzMx2
SNhiWdyWQph5qjnvQ11Muek0rBK7MSGqUn4K9Z8Dhv34EmkfJQMrVPIQBfpEnoxSV8gOK5nD+x2Q
6FCoC2emOk6AFRIF2qDY2HaLIZA0LE0peMshANX1uIewg45Qh0TfNfvPX3+Dwx5XHNiYtLD1KW7Y
ExRNrELWknLTWtaExczBCQbxlLLgfdl5Xah1KLuHDs0ozeGHQwm50R4T5rFOSyM5qiNro9/LpCby
K3Rr6Mi1KKzdKP08SjHGAGPuF4HGGlB2x6lMYdtyIMCgETpkMeQiSJRT9WeN+MHloiweVZtLaFLc
Wjs/vcGJ6kExu4gMIZwAKBKt9FO84/MeeX5xixCbPsWV/ie0qllY22Ui1CjZGdRiXxX0eQLwubjS
8MpoqlHOS4trKunOry5Gvupohqfo9R61aBiOwMXZu+kbhEtPhhz0xNgqh64lfww9uiAOnuMJMDlp
+/ty3uIeW2dp/g/9EXCOeiO/zHeG3euWj8cSfSmX++1jL2MuBVdKacOOlsaU8io81TiE0cIr3alR
t6xZGkp94tHH635Z3/ira7zFhasmz5N/AQAA//8DAAhvMU7VBgAA
headers:
CF-RAY:
- 93402233baf00110-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Mon, 21 Apr 2025 21:56:56 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '6058'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999832'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_f5273114a4a797fd0928674edb442194
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. Test Backstory\nYour
personal goal is: Test Goal\nTo give my best complete final answer to the task
respond using the exact following format:\n\nThought: I now can give a great
answer\nFinal Answer: Your final answer must be the great and the most complete
as possible, it must be outcome described.\n\nI MUST use these formats, my job
depends on it!"}, {"role": "user", "content": "\nCurrent Task: Test task\n\nThis
is the expected criteria for your final answer: Output\nyou MUST return the
actual complete content as the final answer, not a summary.\n\nThis is the context
you''re working with:\n### Previous attempt failed validation: bad result\n\n\n###
Previous result:\nTest tasks are designed to evaluate specific skills or knowledge
in a controlled environment. The criteria for evaluating success include clarity
of instructions, alignment with learning objectives, and the level of engagement
they elicit from participants. Specifically, a test task should include a clear
and concise prompt that outlines what is expected from the participants. Additionally,
it should have a well-defined scoring rubric that assesses the quality of responses
based on predetermined criteria. By ensuring that the test task is relevant
and challenging, it will effectively measure the participants\u2019 capabilities
and provide valuable insights into their understanding of the subject matter.
Effective preparation, testing methods, and feedback mechanisms are essential
to the success of any test task, while also maintaining an environment conducive
to learning and growth.\n\n\nTry again, making sure to address the validation
error.\n\nBegin! This is VERY important to you, use the tools available and
give your best Final Answer, your job depends on it!\n\nThought:"}], "model":
"gpt-4o-mini", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, zstd
connection:
- keep-alive
content-length:
- '1887'
content-type:
- application/json
cookie:
- __cf_bm=9.xrptc4Zx5NtXl.2MzDRi3N1u8YVt6tNHmSwFyx94A-1745272605-1.0.1.1-v3SFlNedUJ2GFxpW0cts207UyNYzhzfJLBW4o_D8D1t15aRi1Bfh8TEkoVN8JQQdIgDqze4xz4.o3yDgegWJrUGzKroLzXP0VeCDkmLibTc;
_cfuvid=e_MIZNumotQmvbprZ3okpLcxs_RLI2Yb_jiAh0fYHT8-1745272605039-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.12
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFfbbhtHEn3XVxT4kl2DJGTZlmy9yV4nCBCs5ayxXmAdCMXumpkOe7rG
fSFFBAbyG/t7+yWLqh7OUIoC7AsBTnfX5dSpU92/nQEsnF1cw8J0mE0/+NXbD/njD+fv/ee/X92/
+9fN839+/Pj57e35hx8ub9+8WSzlBG9+JZOPp9aG+8FTdhzqsomEmcTq86uXry6uLi6fX+pCz5a8
HGuHvHrJq94Ft7o4v3i5Or9aPX89nu7YGUqLa/j3GQDAb/orcQZL94trOF8ev/SUEra0uJ42ASwi
e/mywJRcyhjyYjkvGg6Zgob+IwTeg8EArdsRILQSNmBIe4oAX8L3LqCHG/1/DZ8oZciYtgkwEqQc
i8klkgVMiVLqKeQEaSDjGmfQ+wNYSq4NZCEz0A59wSx+BozZGTdgyN8lSFvnfVrCNvDek21pCRgs
CKaUKRhHCfYudy4AgoQf2Xuy0ETsac9xu4ZPHSUaY0sUdwR98dkNnmAoceBEaQkuGF+sCy3kjsBZ
ClkDlbIBN5IQhTZ3Sd3vCbeBkp6U/T1hKpEkSdnsCWMQW1yy4V52ySnZOUTeuTQaRSPmceMJGiK7
QbNdw5fwJXxiqCwBDEBNQyZLEfIR5CW4DC6BhBCyQy8YGg7JWYqQaEcRPWzpAOQ1qnQtZp+v4dmz
dxKdxvOOg3GJ4MdQy+U4pGfPrgUwcCffoC8pw0ariq7tcsNxj9GqEcJ0EO8lWIpCKLuG27mECVLH
xVstINA9muwPsO+wxn8/kMlkBYzcUb/UWkolJUjlSOOEIrWl3I60nIJiP+SjafVWvMAlEAtAoHzY
4wGyuCLvehcwk5TvANhvXFtcPiyBQiqxll327SgeOJDExkGtJewJBmxpLQheCII33rVBa63R/nSs
9odjkBOKU8GOoVoXSREwHCNpyGpDPM2smewApYwb71InlOZYKXSCrsDhEuAUkeZDqaYju+f2k6Qi
edphyFq6nlD8NcUvAb3nvTgXLxgAjSlRwntEbYVEexI4zl1Z/UnnO1EQW7t6Q1NjW4XvhcD3PrTY
qkGB6Qb25P1qFoPHkAXOwMEfxkxOfG5KBvSJgdTiU8CIfm0I0HSOdmK9i1zaTqOMWLvq2CJLiIR+
teconBoGP7b/2L07jI4sfC2UVBQajj3mBH9JxXSAaVKVVZXopSQQj4q5FLgGdWnQg6WepcOqg7+u
YQZFY6bQYTAEPWe3qyKkuuO8d6EV6RF8p4QzgZCHUvYHBfqlAP1ZgP3b2EH/MKw8/7lsojNKUJ6Z
5g+zBEuJvxb0Lh+k4pHSwEGlDiGNVqJaOZZoQ2BpR54HsiPwDzdwyd6FatpElyk6VKaNTh2HxxLs
xWCSAAaKirXgMQmGHkYFXr4cqSnEqRIxuZljzBFDGjDSyH/DfV+CVLnS9ZQ/KiB2p04zQ1uc1fCd
VJEGrJVTrF8J1j/XxjJUlbVD7ym09KdCsCEwGKkpMglNxCZPLTP1aOY/9Pt3ikB0FAzVUTQSVWZI
5lMGJ0MBo+O0hhtrnU4a7w86OsYghkipgiFmIg/RCQEUekHUHNMY1TEYLhFbSoquElkm71ZqJrEM
kTee+lViv5NvIm1cMvCO4r4j3x+L+6BTBcNLFYZp0N3OEKvh78fpKHDeRidCqPPOyoQabZ4MR7T0
Val8UqoT4HUGW/qTQdWhXHeMGXssUuISjdCf7qkfPMZREHgYOOYSXJZLiPCntjet4ZZTXtVYDB8n
6W6e8iLFJhYjk1uYfIIHtJH3MgODPZmpLrQnGi2jssbWeCV/OFLzpFXEgI4UaCL3Ovd6ufFtqUJ+
pZcBxVBCex92LnI4yvJPKFpSbzc0L0lb7DtnuoeQSz5o5b6aMsmlb8SyYfkvukEhkd6jsKF8GPGj
IEr2dBkaIi8d2nDMR3bTvTA2jckKt9ou1zYYR8A0THvU7vfLibVKUhg4OS3FNG3nhlJY3h5EiDgO
LLypYKfTIfEw8Xk6jQNzGlSUHtD8v7//BwwOuHG+EmaaXnprdZuSK4tqchxaFu9TmJKkDt6j1Eo4
VerGW6YWQTihev+AOyqjj4J5eKPFSJiUi66X/tBsl6p7eisocpmfoxnweLBOKvWht04PZEsdnOin
C/D69JUhspdQXjqheH+ygCFwriNR3je/jCvfpheN51Y0Jj06umiEed2d5MBBXi8p87DQ1W9nAL/o
y6k8eAwt6h3yLvOW1N2LizfV3mJ+sM2rr168GFczZ/TzwuvLi+UTBu8sZXQ+nTy+FgZNR3Y+Or/U
sFjHJwtnJ2n/MZynbNfUXWj/H/PzgjE0ZLJ3QyTrzMOU522RftWp+/S2CWYNeCFPLGfoLjuKUgpL
DRZfn5mLdEiZ+rvGhZbiEF19azbDXXNlX13ia7wwi7NvZ/8DAAD//wMA87a9+nkPAAA=
headers:
CF-RAY:
- 9340225b9bca0110-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Mon, 21 Apr 2025 21:57:05 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '9141'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999564'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_0fc29337116c1d19a0543dfe5b0db291
status:
code: 200
message: OK
version: 1

View File

@@ -0,0 +1,288 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. Test Backstory\nYour
personal goal is: Test Goal\nTo give my best complete final answer to the task
respond using the exact following format:\n\nThought: I now can give a great
answer\nFinal Answer: Your final answer must be the great and the most complete
as possible, it must be outcome described.\n\nI MUST use these formats, my job
depends on it!"}, {"role": "user", "content": "\nCurrent Task: Gather information
about available books on the First World War\n\nThis is the expected criteria
for your final answer: A list of available books on the First World War\nyou
MUST return the actual complete content as the final answer, not a summary.\n\nBegin!
This is VERY important to you, use the tools available and give your best Final
Answer, your job depends on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop":
["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '903'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.9
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFdhb9w2Ev3uXzHQlwCBbewmThzvN1+Q2G3PQM+X1mjPhTFLjqSpKY46
pHazV+S/H0hK2t00Be6LYZEi9d68x8fZP08AKrbVCirTYjRd787+cU/3P7+9tfTPq+1PTffDtr7+
tf759+HG9tdSnaYVsv6dTJxWnRvpekeRxZdpo4SR0q7Ly4s3V68Xi+VlnujEkkvLmj6eXchZx57P
Xi1eXZwtLs+W78bVrbChUK3gPycAAH/mvwmnt/S5WsHidBrpKARsqFrNLwFUKi6NVBgCh4g+Vqf7
SSM+ks/QvwMvWzDooeENAUKTYAP6sCUFePQf2aOD6/y8gltSAg6AkOgqteRDWuc4RJAacIPscO0I
1iLPAcRDbAk+soYID6LOwgPq6tE/+uU5vHz5WH1qCf7tiPotumfSsIJb2cKHQaUneCAfIUpaA+xh
ebW8eKxgvYP3rXKI0rek8N6hPr98mcACwKeWQ/44WHIbCsA+SgZRBPoMNZooGiC2GMGRhXFehrhW
wufEIz1vUU9B6pqUfQPsAzdtPNivF8eRDTpAbyGIYXRgdx47NiFtQqhuB68WsQVDPg66G3mdJ/6v
Cv/rsSw/eSueVpDrEUV3E4ybLMhDApP4J7TLq+W7Uoeb8+/P4Y52pHMB8tOLAPS5xyKOR1WM6T8j
G8rMCchH1kwSthxbQKjFDFmxtcQWOnYcUXcQomKkhilknmltO3To0wdImbyhAOTtoGQTpCDOcvpK
etvwhh2jD4COnwvx1yNx5+BfA1OcTPJAIZJ6+KjiY6H3Qdm0cIfKCPfUof4x0Ez0GnrhxqOP4GVD
rghqsI+DUiGpFNhliAW74tDhAfAM+IY00Zlx2yELPrsVvjuFNQayE1AcYiupxLI9qkJmd7G39Veu
L5S+l9bDD0QN+plJeYReZcM21RksReTkzVmGNvl974pszpowDsfuTPJhjI7C6YFyp7N0yQDoHHDX
o8mlb5ysk4frGlkLhzeFw42IXe8oOS6J9anFUZZ7WZNGuFHcUDg+eThEWbM0in27AyUjg4/huGyH
xsHEdiz9VPm/Hj6EnjSIH8+a0fHgpdGeTPb2KI4RXzs2ce/WXkI8Sz4nxUzvbaH3HiOGqClE8sla
TalzI8k+stfsDj/DLYbIvtnTnQbAtCqejRs9RxtKjB2hTdCHPm+1N9OMq8TDrFaR/ID/i5Q1HFOq
7NUqELNdDVPk0XWXe9eljySp4IO3ZOFHQjPGyr2gHePjYiKmDSpFuENzx84deHIeSXI5mU7UcegV
U+bHo1RNF4zyeojfTtcsLnV9i4H/OzG2ZDiw+AAdWkroUglzkBiVEA6z893fHrIVXMP7chMT3JYz
M3ON7OGGXTLvsWvNtGI6ZRGfKYCOAGKrMjTtQW7Grw6iSgccAxgcQjFPesI6pmyJ7SkMofh4y5ZA
0TeUNggy6BQcV4VTcWJi9guhlmDM5LKchcqPODi4xe7Ai12Jb5qv3J43EtHBLu0idbk7jq1XPGZp
gyFialxSTKDbZUkSfie+OXPF50B1TSbO+28TqAx8uSjIk/FuRQON9WbTIjm4E+0HbWQGO5a8ZWeV
/IswpTc5Vzw2Bx1Cm/abrHYUHF9nRctN61IGTqNr8RbWFLdEHtBzh65kQr69AmDHNsSSGS1KEeED
mnZUNsw9TEqhAIPnPwY6jJyyXb6iyuENgGsZ4rdanlPo8HmE1sEG3ZC7JKXRAlCLAvqdeEodBmm6
DG1qemJiPqmZ+EseHp16ftjXKdVDwNRb+sG5gwn0XorEuaP8bZz5MveQTppeZR2+WlrV7Dm0T0oY
xKd+MTVdVZ79cgLwW+5Vh6P2s+pVuj4+RXmm/Lnl5dirVvsWeT/75vXFOBsTw/3E5WKaONrwaSz1
QbtbGTQt2f3SfW+Mg2U5mDg5oP1XON/au1Bn3/w/2+8njKE+kn3qlSybY8r715TST4i/e20ucwZc
BdING3qKTJqksFTj4EpjX4VdiNQ91ewb0l65dPd1/2TXaPDtwtaL6uTLyf8AAAD//wMA10Lu/OsM
AAA=
headers:
CF-RAY:
- 937ed42dee2e621f-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 29 Apr 2025 12:33:48 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=mLRCnpdB3n_6medIZWHnUu8MNRGZsD6riaRhN47PK74-1745930028-1.0.1.1-M2lDM1_V9hNCK0MZrBnFalF3lndC3JkS8zhDOGww_LmOrgdpU9fZLpNZUmyinCQOnlCjDjDYJUECM82ffT1anqBiO1NoDeNp91EPKiK7s.8;
path=/; expires=Tue, 29-Apr-25 13:03:48 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=eTrj_ZhCx2XuylS5vYROwUlPrJBwOyrbS2Ki.msl45E-1745930028010-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '10856'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999807'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_bc2d62d8325b2bdd3e98544a66389132
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Guardrail Agent. You
are a expert at validating the output of a task. By providing effective feedback
if the output is not valid.\nYour personal goal is: Validate the output of the
task\n\nTo give my best complete final answer to the task respond using the
exact following format:\n\nThought: I now can give a great answer\nFinal Answer:
Your final answer must be the great and the most complete as possible, it must
be outcome described.\n\nI MUST use these formats, my job depends on it!\nIMPORTANT:
Your final answer MUST contain all the information requested in the following
format: {\n \"valid\": bool,\n \"feedback\": str | None\n}\n\nIMPORTANT: Ensure
the final output does not include any code block markers like ```json or ```python."},
{"role": "user", "content": "\n Ensure the following task result complies
with the given guardrail.\n\n Task result:\n Here is a comprehensive
list of available books on the First World War:\n\n1. **\"The Sleepwalkers:
How Europe Went to War in 1914\" by Christopher Clark** \n This book delves
into the complex factors that led to the outbreak of the war, offering insights
into the political and social dynamics of early 20th century Europe.\n\n2. **\"A
World Undone: The Story of the Great War, 1914 to 1918\" by G.J. Meyer** \n Meyer''s
expansive narrative covers the entire war with a focus on both military strategies
and the human experiences endured by soldiers and civilians alike.\n\n3. **\"All
Quiet on the Western Front\" by Erich Maria Remarque** \n A poignant novel
that captures the resilience and trauma experienced by German soldiers during
World War I, based on the author''s own experiences.\n\n4. **\"The First World
War\" by John Keegan** \n Keegan provides a detailed military history of
the war, featuring insights on battles, strategies, and the overall impact on
global affairs.\n\n5. **\"Goodbye to All That\" by Robert Graves** \n This
autobiography recounts the author''s experiences as a soldier during the war,
offering a personal and critical perspective on the conflicts and the post-war
era.\n\n6. **\"Catastrophe 1914: Europe Goes to War\" by Max Hastings** \n Hastings
chronicles the events leading up to World War I and the early battles, detailing
the war''s initial impact on European societies.\n\n7. **\"The War That Ended
Peace: The Road to 1914\" by Margaret MacMillan** \n MacMillan explores the
political and historical factors that contributed to the outbreak of war, emphasizing
the decisions made by leaders across Europe.\n\n8. **\"The First World War:
A Complete History\" by Martin Gilbert** \n This complete history takes readers
through the entirety of the war, from its causes to its aftermath, using a wide
range of sources.\n\n9. **\"1914: The Year the World Ended\" by Paul Ham** \n Ham
focuses on the pivotal year of 1914 and the early war''s devastation, analyzing
its long-lasting effects on the world.\n\n10. **\"War Horse\" by Michael Morpurgo** \n This
children''s novel tells the story of a horse and his experiences during the
war, highlighting the bond between animals and humans amidst the chaos.\n\nEach
of these books offers unique perspectives and rich details about the First World
War, making them valuable resources for anyone interested in this pivotal period
in history.\n\n Guardrail:\n Ensure the authors are from Italy\n \n Your
task:\n - Confirm if the Task result complies with the guardrail.\n -
If not, provide clear feedback explaining what is wrong (e.g., by how much it
violates the rule, or what specific part fails).\n - Focus only on identifying
issues \u2014 do not propose corrections.\n - If the Task result complies
with the guardrail, saying that is valid\n "}], "model": "gpt-4o-mini",
"stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '3917'
content-type:
- application/json
cookie:
- __cf_bm=mLRCnpdB3n_6medIZWHnUu8MNRGZsD6riaRhN47PK74-1745930028-1.0.1.1-M2lDM1_V9hNCK0MZrBnFalF3lndC3JkS8zhDOGww_LmOrgdpU9fZLpNZUmyinCQOnlCjDjDYJUECM82ffT1anqBiO1NoDeNp91EPKiK7s.8;
_cfuvid=eTrj_ZhCx2XuylS5vYROwUlPrJBwOyrbS2Ki.msl45E-1745930028010-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.9
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFPLbtswELz7KxY8y4HsJHWsW9wiQVq0hzx6aBUIa3IlMaFIlVw5NQL/
e0Epiew+gF4EamZnuC8+TwCEViIDIWtk2bRmurqm29X1+/PNzYfL06+fFV8svz2sttuF+XHzUSRR
4dYPJPlVdSRd0xpi7exAS0/IFF1ni5PT5XE6m896onGKTJRVLU9P3LTRVk/n6fxkmi6ms7MXde20
pCAy+D4BAHjuvzFPq+inyCBNXpGGQsCKRPYWBCC8MxERGIIOjJZFMpLSWSbbp35bu66qOYMrsO4J
JFqo9IYAoYr5A9rwRB4gtxfaooHz/j+D59wC5GKDRqtcZFCiCZQMYEmk1igfI56LL84SuBK4JsCO
a+cDGB2YFGjbo4zhETyFzjCgJyi9a+CK0WyP4NyYA2VDNraY1BjpuCYP0nWWvaaQQOhkDRjgknyD
dpv0BnefEkCrhvPNUS5yu9vviaeyCxjnYjtj9gi01jHGS/tp3L8wu7f+G1e13q3Db1JRaqtDXXjC
4GzsdWDXip7dTQDu+zl3B6MTrXdNywW7R+qvWywXg58Y12tk370sgWDHaEb87PRVdeBXKGLUJuxt
ipAoa1KjdFwr7JR2e8Rkr+o/s/mb91C5ttX/2I+ElNQyqaL1pLQ8rHgM8xRf37/C3rrcJywC+Y2W
VLAmHyehqMTODG9ChG1gaopS24p86/XwMMq2SI+X87P5PF2mYrKb/AIAAP//AwD77a3iJgQAAA==
headers:
CF-RAY:
- 937ed6bd68faa435-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 29 Apr 2025 12:35:23 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '1138'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999072'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_2ba1be014a5974ba354aff564e26516a
status:
code: 200
message: OK
version: 1

File diff suppressed because it is too large

View File

@@ -0,0 +1,250 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are Guardrail Agent. You
are a expert at validating the output of a task. By providing effective feedback
if the output is not valid.\nYour personal goal is: Validate the output of the
task\n\nTo give my best complete final answer to the task respond using the
exact following format:\n\nThought: I now can give a great answer\nFinal Answer:
Your final answer must be the great and the most complete as possible, it must
be outcome described.\n\nI MUST use these formats, my job depends on it!\nIMPORTANT:
Your final answer MUST contain all the information requested in the following
format: {\n \"valid\": bool,\n \"feedback\": str | None\n}\n\nIMPORTANT: Ensure
the final output does not include any code block markers like ```json or ```python."},
{"role": "user", "content": "\n Ensure the following task result complies
with the given guardrail.\n\n Task result:\n \n Lorem Ipsum
is simply dummy text of the printing and typesetting industry. Lorem Ipsum has
been the industry''s standard dummy text ever\n \n\n Guardrail:\n Ensure
the result has less than 10 words\n \n Your task:\n - Confirm
if the Task result complies with the guardrail.\n - If not, provide clear
feedback explaining what is wrong (e.g., by how much it violates the rule, or
what specific part fails).\n - Focus only on identifying issues \u2014
do not propose corrections.\n - If the Task result complies with the
guardrail, saying that is valid\n "}], "model": "gpt-4o-mini", "stop":
["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1629'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.9
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFPLbtswELz7KxY824GkxLGtW4KiQB+XBmkRtAqENbmSmFAkQVJ2UsP/
HlByLKdNgV4IcGdnOPvgbgLApGA5MN5g4K1Vs+ub718rm324+5z+/CLt1dXD5fU3s1jd3Wx//GbT
yDDrB+LhlXXGTWsVBWn0AHNHGCiqpouL+XKVZum8B1ojSEVabcPswsxaqeUsS7KLWbKYpcsDuzGS
k2c5/JoAAOz6M/rUgp5YDsn0NdKS91gTy49JAMwZFSMMvZc+oA5sOoLc6EC6t37bmK5uQg6fQJst
cNRQyw0BQh39A2q/JQdQ6I9So4Kr/p7DrtAABdugkqJgOVSoPE2HYEUk1sgfY7xgtw1BQP8Ijnyn
AsTHUWoP6SVsjRN+CvTEiYTUNYSGoO7QCYdSgZKtDGAqqCiaCA1qSJOBBetnOAicFazQ+9MCHVWd
x9hk3Sl1AqDWJmAcUt/a+wOyPzZTmdo6s/Z/UFkltfRN6Qi90bFxPhjLenQ/Abjvh9a9mQOzzrQ2
lME8Uv/ceTIf9Ni4KyM6Tw9gMAHVCWt+OX1HrxQUUCp/MnbGkTckRuq4I9gJaU6AyUnVf7t5T3uo
XOr6f+RHgHOygURpHQnJ31Y8pjmKX+lfaccu94aZJ7eRnMogycVJCKqwU8OCM//sA7VlJXVNzjo5
bHlly+R8lS2zLFklbLKfvAAAAP//AwCHe/Jh8wMAAA==
headers:
CF-RAY:
- 937b20ddf9607def-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 29 Apr 2025 01:46:56 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=nHa2kVJI_yO1RIsmZcEednJ1e9UVy1liv_sjBNtSj7Q-1745891216-1.0.1.1-jUH9kFawVBjnbq8sIL2.MQx.p7JvBZWUhqlkNKRlStWSgQxT0eZMPcgq9TCQoJAjuyNwhqfpK4HuX6x5n8UbQgAb6JrWJEG823e6GpGROEA;
path=/; expires=Tue, 29-Apr-25 02:16:56 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=gg2UeahMCOOR8YhitRtzDwENMOnTOuQdyTMVJVHG0Mg-1745891216085-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '896'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999631'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_859221ed1aedb26cc9d335004ccf183e
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are Guardrail Agent. You
are a expert at validating the output of a task. By providing effective feedback
if the output is not valid.\nYour personal goal is: Validate the output of the
task\n\nTo give my best complete final answer to the task respond using the
exact following format:\n\nThought: I now can give a great answer\nFinal Answer:
Your final answer must be the great and the most complete as possible, it must
be outcome described.\n\nI MUST use these formats, my job depends on it!\nIMPORTANT:
Your final answer MUST contain all the information requested in the following
format: {\n \"valid\": bool,\n \"feedback\": str | None\n}\n\nIMPORTANT: Ensure
the final output does not include any code block markers like ```json or ```python."},
{"role": "user", "content": "\n Ensure the following task result complies
with the given guardrail.\n\n Task result:\n \n Lorem Ipsum
is simply dummy text of the printing and typesetting industry. Lorem Ipsum has
been the industry''s standard dummy text ever\n \n\n Guardrail:\n Ensure
the result has less than 500 words\n \n Your task:\n -
Confirm if the Task result complies with the guardrail.\n - If not, provide
clear feedback explaining what is wrong (e.g., by how much it violates the rule,
or what specific part fails).\n - Focus only on identifying issues \u2014
do not propose corrections.\n - If the Task result complies with the
guardrail, saying that is valid\n "}], "model": "gpt-4o-mini", "stop":
["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1630'
content-type:
- application/json
cookie:
- __cf_bm=nHa2kVJI_yO1RIsmZcEednJ1e9UVy1liv_sjBNtSj7Q-1745891216-1.0.1.1-jUH9kFawVBjnbq8sIL2.MQx.p7JvBZWUhqlkNKRlStWSgQxT0eZMPcgq9TCQoJAjuyNwhqfpK4HuX6x5n8UbQgAb6JrWJEG823e6GpGROEA;
_cfuvid=gg2UeahMCOOR8YhitRtzDwENMOnTOuQdyTMVJVHG0Mg-1745891216085-0.0.1.1-604800000
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.68.2
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.68.2
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.12.9
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJJNb9swDIbv/hWEzvHgfHRpfesOG3opsGE7LYXBSLStRZY0iU43BPnv
g5wPu10H7GLAfPhSfEkeMgChlShByBZZdt7kH758e1wzbnfbO6o/f1osV3T/+BO7UNNDIWZJ4bY/
SPJF9U66zhti7ewJy0DIlKrO16ub27v5srgZQOcUmSRrPOcrl3fa6nxRLFZ5sc7nt2d167SkKEr4
ngEAHIZv6tMq+iVKKGaXSEcxYkOivCYBiOBMigiMUUdGy2I2Qukskx1a/9q6vmm5hAew7hkkWmj0
ngChSf0D2vhMAWBjP2qLBu6H/xIOGwuwEXs0Wm1ECRx6mp1iNZHaotylsO2N2djj9PFAdR/RnOEE
oLWOMQ1wsP10JserUeMaH9w2vpKKWlsd2yoQRmeTqcjOi4EeM4CnYaD9ixkJH1znuWK3o+G583KG
4Vz2ONLF7RmyYzQT1XI5e6NepYhRmzhZiZAoW1KjdNwf9kq7Ccgmrv/u5q3aJ+faNv9TfgRSkmdS
lQ+ktHzpeEwLlM78X2nXKQ8Ni0hhryVVrCmkTSiqsTen4xPxd2TqqlrbhoIP+nSBta/SueD7QtWF
yI7ZHwAAAP//AwAiLXhqjwMAAA==
headers:
CF-RAY:
- 937b2311ee091b1b-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 29 Apr 2025 01:48:26 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '610'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999631'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_c136835c16be6bc1e4d820f239c4b620
status:
code: 200
message: OK
version: 1

View File

@@ -6,7 +6,7 @@ research_task:
expected_output: >
A list with 10 bullet points of the most relevant information about {topic}
agent: researcher
guardrail: make sure each bullet contains a minimum of 100 words
guardrail: ensure each bullet contains its source
reporting_task:
description: >

View File

@@ -143,31 +143,17 @@ def test_agent_function_calling_llm():
), "agent's function_calling_llm is incorrect"
# VCR could not record the request to localhost from Docker to get it version, so we need to mock the tool.
# TODO: We can remove this mock after some issue such as https://github.com/kevin1024/vcrpy/issues/519 been addressed
@pytest.fixture
def code_interpreter_tool_mock():
with patch(
"crewai_tools.tools.code_interpreter_tool.code_interpreter_tool.CodeInterpreterTool._run",
return_value="(True, 'good result')",
):
yield
def test_task_guardrail(code_interpreter_tool_mock):
def test_task_guardrail():
crew = InternalCrew()
research_task = crew.research_task()
assert (
research_task.guardrail
== "make sure each bullet contains a minimum of 100 words"
)
assert research_task.guardrail == "ensure each bullet contains its source"
reporting_task = crew.reporting_task()
assert reporting_task.guardrail is None
@pytest.mark.vcr(filter_headers=["authorization"])
def test_before_kickoff_modification(code_interpreter_tool_mock):
def test_before_kickoff_modification():
crew = InternalCrew()
inputs = {"topic": "LLMs"}
result = crew.crew().kickoff(inputs=inputs)
@@ -175,7 +161,7 @@ def test_before_kickoff_modification(code_interpreter_tool_mock):
@pytest.mark.vcr(filter_headers=["authorization"])
def test_after_kickoff_modification(code_interpreter_tool_mock):
def test_after_kickoff_modification():
crew = InternalCrew()
# Assuming the crew execution returns a dict
result = crew.crew().kickoff({"topic": "LLMs"})
@@ -186,14 +172,14 @@ def test_after_kickoff_modification(code_interpreter_tool_mock):
@pytest.mark.vcr(filter_headers=["authorization"])
def test_before_kickoff_with_none_input(code_interpreter_tool_mock):
def test_before_kickoff_with_none_input():
crew = InternalCrew()
crew.crew().kickoff(None)
# Test should pass without raising exceptions
@pytest.mark.vcr(filter_headers=["authorization"])
def test_multiple_before_after_kickoff(code_interpreter_tool_mock):
def test_multiple_before_after_kickoff():
@CrewBase
class MultipleHooksCrew:
agents: List[BaseAgent]

View File

@@ -139,127 +139,32 @@ def sample_agent():
return Agent(role="Test Agent", goal="Test Goal", backstory="Test Backstory")
@pytest.mark.vcr(filter_headers=["authorization"])
def test_guardrail_using_llm(sample_agent):
task = Task(
description="Test task",
expected_output="Output",
guardrail="Ensure the output is equal to 'good result'",
)
with patch(
"crewai.tasks.task_guardrail.TaskGuardrail.__call__",
side_effect=[(False, "bad result"), (True, "good result")],
) as mock_guardrail:
task.execute_sync(agent=sample_agent)
assert mock_guardrail.call_count == 2
task.guardrail = TaskGuardrail(
description="Ensure the output is equal to 'good result'",
llm=LLM(model="gpt-4o-mini"),
)
with patch(
"crewai.tasks.task_guardrail.TaskGuardrail.__call__",
side_effect=[(False, "bad result"), (True, "good result")],
) as mock_guardrail:
task.execute_sync(agent=sample_agent)
assert mock_guardrail.call_count == 2
@pytest.fixture
def task_output():
return TaskOutput(
raw="Test output",
raw="""
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever
""",
description="Test task",
expected_output="Output",
agent="Test Agent",
)
def test_task_guardrail_initialization_no_llm(task_output):
"""Test TaskGuardrail initialization fails without LLM"""
with pytest.raises(ValueError, match="Provide a valid LLM to the TaskGuardrail"):
TaskGuardrail(description="Test")(task_output)
@pytest.fixture
def mock_llm():
llm = Mock(spec=LLM)
llm.call.return_value = """
output = 'Sample book data'
if isinstance(output, str):
result = (True, output)
else:
result = (False, 'Invalid output format')
print(result)
"""
return llm
@pytest.mark.parametrize(
"tool_run_output",
[
{
"output": "(True, 'Valid output')",
"expected_result": True,
"expected_output": "Valid output",
},
{
"output": "(False, 'Invalid output format')",
"expected_result": False,
"expected_output": "Invalid output format",
},
{
"output": "Something went wrong while running the code, Invalid output format",
"expected_result": False,
"expected_output": "Something went wrong while running the code, Invalid output format",
},
{
"output": "No result variable found",
"expected_result": False,
"expected_output": "No result variable found",
},
{
"output": (False, "Invalid output format"),
"expected_result": False,
"expected_output": "Invalid output format",
},
{
"output": "bla-bla-bla",
"expected_result": False,
"expected_output": "Error parsing result: malformed node or string on line 1",
},
],
)
@patch("crewai_tools.CodeInterpreterTool.run")
def test_task_guardrail_execute_code(mock_run, mock_llm, tool_run_output, task_output):
mock_run.return_value = tool_run_output["output"]
guardrail = TaskGuardrail(description="Test validation", llm=mock_llm)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_task_guardrail_process_output(task_output):
guardrail = TaskGuardrail(description="Ensure the result has less than 10 words")
result = guardrail(task_output)
assert result[0] == tool_run_output["expected_result"]
assert tool_run_output["expected_output"] in result[1]
assert result[0] is False
assert "exceeding the guardrail limit of fewer than" in result[1].lower()
@patch("crewai_tools.CodeInterpreterTool.run")
def test_guardrail_using_additional_instructions(mock_run, mock_llm, task_output):
mock_run.return_value = "(True, 'Valid output')"
additional_instructions = (
"This is an additional instruction created by the user follow it strictly"
)
guardrail = TaskGuardrail(
description="Test validation",
llm=mock_llm,
additional_instructions=additional_instructions,
)
guardrail = TaskGuardrail(description="Ensure the result has less than 500 words")
guardrail(task_output)
assert additional_instructions in str(mock_llm.call.call_args)
result = guardrail(task_output)
assert result[0] is True
assert result[1] == task_output.raw
@pytest.mark.vcr(filter_headers=["authorization"])
@@ -287,26 +192,13 @@ def test_guardrail_emits_events(sample_agent):
)
task = Task(
description="Test task",
expected_output="Output",
guardrail="Ensure the output is equal to 'good result'",
description="Gather information about available books on the First World War",
agent=sample_agent,
expected_output="A list of available books on the First World War",
guardrail="Ensure the authors are from Italy",
)
with (
patch(
"crewai_tools.CodeInterpreterTool.run",
side_effect=[
"Something went wrong while running the code",
(True, "good result"),
],
),
patch(
"crewai.tasks.task_guardrail.TaskGuardrail.generate_code",
return_value="""def guardrail(result: TaskOutput):
return (True, result.raw.upper())""",
),
):
task.execute_sync(agent=sample_agent)
result = task.execute_sync(agent=sample_agent)
def custom_guardrail(result: TaskOutput):
return (True, "good result from callable function")
@@ -320,35 +212,26 @@ def test_guardrail_emits_events(sample_agent):
task.execute_sync(agent=sample_agent)
expected_started_events = [
{
"guardrail": """def guardrail(result: TaskOutput):
return (True, result.raw.upper())""",
"retry_count": 0,
},
{
"guardrail": """def guardrail(result: TaskOutput):
return (True, result.raw.upper())""",
"retry_count": 1,
},
{"guardrail": "Ensure the authors are from Italy", "retry_count": 0},
{"guardrail": "Ensure the authors are from Italy", "retry_count": 1},
{
"guardrail": """def custom_guardrail(result: TaskOutput):
return (True, "good result from callable function")""",
"retry_count": 0,
},
]
expected_completed_events = [
{
"success": False,
"result": None,
"error": "Something went wrong while running the code",
"error": "The task result does not comply with the guardrail because none of "
"the listed authors are from Italy. All authors mentioned are from "
"different countries, including Germany, the UK, the USA, and others, "
"which violates the requirement that authors must be Italian.",
"retry_count": 0,
},
{
"success": True,
"result": "good result",
"error": None,
"retry_count": 1,
},
{"success": True, "result": result.raw, "error": None, "retry_count": 1},
{
"success": True,
"result": "good result from callable function",
@@ -360,20 +243,23 @@ def test_guardrail_emits_events(sample_agent):
assert completed_guardrail == expected_completed_events
@pytest.mark.parametrize("unsafe_mode", [True, False])
def test_task_guardrail_force_code_tool_unsafe_mode(mock_llm, task_output, unsafe_mode):
guardrail = TaskGuardrail(
description="Test validation", llm=mock_llm, unsafe_mode=unsafe_mode
)
@pytest.mark.vcr(filter_headers=["authorization"])
def test_guardrail_when_an_error_occurs(sample_agent, task_output):
with (
patch(
"crewai_tools.CodeInterpreterTool.__init__", return_value=None
) as mock_init,
patch(
"crewai_tools.CodeInterpreterTool.run", return_value=(True, "Valid output")
"crewai.Agent.kickoff",
side_effect=Exception("Unexpected error"),
),
pytest.raises(
Exception,
match="Error while validating the task output: Unexpected error",
),
):
result = guardrail(task_output)
mock_init.assert_called_once_with(code=ANY, unsafe_mode=unsafe_mode)
assert result == (True, "Valid output")
task = Task(
description="Gather information about available books on the First World War",
agent=sample_agent,
expected_output="A list of available books on the First World War",
guardrail="Ensure the authors are from Italy",
max_retries=0,
)
task.execute_sync(agent=sample_agent)