Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-11 00:58:30 +00:00)
Merge branch 'main' of github.com:crewAIInc/crewAI into devin/1751908431-fix-lite-agent-llm-isinstance-check
@@ -1896,6 +1896,80 @@ def test_agent_with_knowledge_sources_generate_search_query():
    assert "red" in result.raw.lower()


@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_with_knowledge_with_no_crewai_knowledge():
    mock_knowledge = MagicMock(spec=Knowledge)

    agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=LLM(model="openrouter/openai/gpt-4o-mini", api_key=os.getenv('OPENROUTER_API_KEY')),
        knowledge=mock_knowledge
    )

    # Create a task that requires the agent to use the knowledge
    task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=agent,
    )

    crew = Crew(agents=[agent], tasks=[task])
    crew.kickoff()
    mock_knowledge.query.assert_called_once()


@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_with_only_crewai_knowledge():
    mock_knowledge = MagicMock(spec=Knowledge)

    agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=LLM(model="openrouter/openai/gpt-4o-mini", api_key=os.getenv('OPENROUTER_API_KEY'))
    )

    # Create a task that requires the agent to use the knowledge
    task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=agent
    )

    crew = Crew(agents=[agent], tasks=[task], knowledge=mock_knowledge)
    crew.kickoff()
    mock_knowledge.query.assert_called_once()


@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_knowledege_with_crewai_knowledge():
    crew_knowledge = MagicMock(spec=Knowledge)
    agent_knowledge = MagicMock(spec=Knowledge)

    agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=LLM(model="openrouter/openai/gpt-4o-mini", api_key=os.getenv('OPENROUTER_API_KEY')),
        knowledge=agent_knowledge
    )

    # Create a task that requires the agent to use the knowledge
    task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=agent,
    )

    crew = Crew(agents=[agent], tasks=[task], knowledge=crew_knowledge)
    crew.kickoff()
    agent_knowledge.query.assert_called_once()
    crew_knowledge.query.assert_called_once()


@pytest.mark.vcr(filter_headers=["authorization"])
def test_litellm_auth_error_handling():
    """Test that LiteLLM authentication errors are handled correctly and not retried."""
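The three knowledge tests above only assert that query() was invoked on the mocked Knowledge object. Purely as an illustrative sketch (not part of this diff), a mock configured with a stubbed return value could also let a test check that retrieved text influences the final answer; the snippet text and the assumption that Knowledge.query returns a list of text snippets are mine, not something these tests rely on.

```python
# Illustrative sketch only, not part of the diff above.
# Assumes Knowledge.query returns a list of text snippets; that interface
# detail is an assumption, not something shown in this PR.
from unittest.mock import MagicMock

mock_knowledge = MagicMock(spec=Knowledge)
mock_knowledge.query.return_value = ["Vidit's favorite color is red."]

# agent is built with knowledge=mock_knowledge and task as in the first test above
crew = Crew(agents=[agent], tasks=[task])
result = crew.kickoff()

mock_knowledge.query.assert_called_once()
# With a stubbed snippet, the test could also assert on the answer itself,
# e.g. that "red" appears in result.raw.lower().
```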
File diff suppressed because one or more lines are too long
150  tests/cassettes/test_agent_knowledege_with_crewai_knowledge.yaml  Normal file
@@ -0,0 +1,150 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"Your goal is to rewrite the user query so that it is optimized for retrieval
|
||||
from a vector database. Consider how the query will be used to find relevant
|
||||
documents, and aim to make it more specific and context-aware. \n\n Do not include
|
||||
any other text than the rewritten query, especially any preamble or postamble
|
||||
and only add expected output format if its relevant to the rewritten query.
|
||||
\n\n Focus on the key words of the intended task and to retrieve the most relevant
|
||||
information. \n\n There will be some extra context provided that might need
|
||||
to be removed such as expected_output formats structured_outputs and other instructions."},
|
||||
{"role": "user", "content": "The original query is: What is Vidit''s favorite
|
||||
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
|
||||
color.\nyou MUST return the actual complete content as the final answer, not
|
||||
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1017'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TMrmgyadG8ceECAhhIrQarrj
|
||||
3bidHY/GTgSK9r+jpUpaJLja78djn8ARHgPlxXK72a6X6+12szhq7Id72d2V8b58/nbzQb98gkOp
|
||||
cuRIFR4fC+X3d3AYJVKChxTKgd8OxRYbWYycGQ7y8EidwaPbB7vuZCyJjCXDoasUjCL8S61Dtxfu
|
||||
SOG/n5BkKFUeFD4fUnLoObPu20pBJcNDTQoccjA+UvufLedIP+Ebh5FUw0DwJ1RJBI+gymoh20wj
|
||||
2SjPpF85sr3Rqz4cpbLRVSdJ6jUcKvUHDenM81zFeXgeTNMPB/2lRuMMM1Atlf8k9qVt1rer3WrV
|
||||
3DZwOJw5SpWxWGvyRFnnR7ybQc4/usxvHEwspBfhbun+NreRLHDSObUL3Z7iRdxM/wh9rb/c8coy
|
||||
Tb8BAAD//wMAqVt3JyMCAAA=
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402cb503aec46c0-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:56:14 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"You are Information Agent. You have access to specific knowledge sources.\nYour
|
||||
personal goal is: Provide information based on knowledge sources\nTo give my
|
||||
best complete final answer to the task respond using the exact following format:\n\nThought:
|
||||
I now can give a great answer\nFinal Answer: Your final answer must be the great
|
||||
and the most complete as possible, it must be outcome described.\n\nI MUST use
|
||||
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
|
||||
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
|
||||
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
|
||||
complete content as the final answer, not a summary.\n\nBegin! This is VERY
|
||||
important to you, use the tools available and give your best Final Answer, your
|
||||
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '951'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kE9rG0EMxb/K8C69jNON7WJ7boFS
|
||||
CD2ENm2g/1jGs/Ja7aw0zIydBuPvXjbBcQrtUU9P0u/pAO7g0JNMLhfzxexytli8mdy8r7c6/3Lb
|
||||
v13eff00088fPj7AImXdc0cZDjeJ5OoaFoN2FOGgicTz6z7VyVwnAwvDQtc/KVQ4hK2vF0GHFKmy
|
||||
CixCJl+pgzuftQhb5UAF7tsBUfuUdV3gZBejxYaFy7bN5IsKHErVBAvxlffU/qfL0tFvuMZioFJ8
|
||||
T3AHZI0EB18Kl+qljjQqlWQkvTai9yZ4MT3vyXjTj6DGS7mnbMx3ecfio7l6rJ25447rq2I2fq+Z
|
||||
K5mgUbPhYtZxRxewyLTZFR9PMZ4IWfon4Xj8YVEeSqVhzNBTTpkfQTapbWar6XI6bVYNLHYn/JR1
|
||||
SLWt+oukjP9rRv7Ta8/6yqJq9fGsLFf27+m2o+o5lnFt8GFL3bO5Of5j60v/c5AXI8fjHwAAAP//
|
||||
AwDEkP8dZgIAAA==
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402cb55c9fe46c0-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:56:15 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
@@ -0,0 +1,151 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"Your goal is to rewrite the user query so that it is optimized for retrieval
|
||||
from a vector database. Consider how the query will be used to find relevant
|
||||
documents, and aim to make it more specific and context-aware. \n\n Do not include
|
||||
any other text than the rewritten query, especially any preamble or postamble
|
||||
and only add expected output format if its relevant to the rewritten query.
|
||||
\n\n Focus on the key words of the intended task and to retrieve the most relevant
|
||||
information. \n\n There will be some extra context provided that might need
|
||||
to be removed such as expected_output formats structured_outputs and other instructions."},
|
||||
{"role": "user", "content": "The original query is: What is Vidit''s favorite
|
||||
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
|
||||
color.\nyou MUST return the actual complete content as the final answer, not
|
||||
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1017'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TGCbNGQ7N46gIg6IXhBaTWed
|
||||
Xbez49HYiaii/e9oqRKKBFf7/XjsE7iHx0B5db272W2uN++b3ep585k+jcmo/XqnYXvX5m/3cChV
|
||||
jtxThceXQvnDRzhM0lOChxTKgd8NxVY3spo4Mxzk4ZGiwSOOwd5GmUoiY8lwiJWCUQ9/qW0d4igc
|
||||
SeG/n5BkKFUeFD4fUnLYc2Ydu0pBJcNDTQoccjA+UvefLeeefsI3DhOphoHgT6iSCB5BldVCtoVG
|
||||
slFeSO+5Z3ujV/twlMpGV1GSVDhU2h80pDPOSxPn4WUwzz8c9FmNpoVloFoq/w7cl67Z3K7b9bq5
|
||||
beBwOGOUKlOxzuSJsi5/2C4c5xdd5lsHEwvpj7Bt3N/mricLnHRJjSGO1F/EzfyP0Nf6yx2vLPP8
|
||||
CwAA//8DAOHu/cIiAgAA
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402c73df9d8859c-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:53:27 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"You are Information Agent. You have access to specific knowledge sources.\nYour
|
||||
personal goal is: Provide information based on knowledge sources\nTo give my
|
||||
best complete final answer to the task respond using the exact following format:\n\nThought:
|
||||
I now can give a great answer\nFinal Answer: Your final answer must be the great
|
||||
and the most complete as possible, it must be outcome described.\n\nI MUST use
|
||||
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
|
||||
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
|
||||
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
|
||||
complete content as the final answer, not a summary.\n\nBegin! This is VERY
|
||||
important to you, use the tools available and give your best Final Answer, your
|
||||
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '951'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kUGPEzEMhf+K5QuXdJmlpbvkthIg
|
||||
emFXQoIDoMpNPFNDJo6STLul6n9H09KyIDjmxc9+/rxH8Wix4zi5vpndTK+n8+Z2wo9vXj28fHff
|
||||
vW4+PNT5j1l6/wkNpqwb8ZzR4n3ieLdAg716DmhRE0eS512qk5lOeomCBnX1jV1Fi25N9cppnwJX
|
||||
0YgGXWaq7NH+HmvQrVUcF7Sf9xi0S1lXBW0cQjDYSpSyXmamohEtlqoJDUaqsuHlf34len5E2xjs
|
||||
uRTqGO0eswZGi1SKlEqxjmk0Vo5j0gVE3YKjCJ1sGAi6MShQLFvOAF/iW4kU4O74tvBRvNRnBVra
|
||||
aJbK4DRoBikQtcJWPIcdeHVDz7GyB4mQhlUQF3ZAG5JAq8BQdMiOi4GisBiHj+ZftIHA87hePeY5
|
||||
5cjcUfYSO1hLgZLYSSvurxRXaDBzOxQKZ4gnPhK7k3A4fDVYdqVyPxLsOKcsRwxtWvoVOZo3vm3Q
|
||||
4HCGl7L2qS6rfudYxus1I73zYS/69NZg1UrhorwYD/yHe+m5koQytnXk1uwvxc3hH12f1l8WeWI5
|
||||
HH4CAAD//wMAhZKqO+QCAAA=
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402c7459f3f859c-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:53:28 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
150  tests/cassettes/test_agent_with_only_crewai_knowledge.yaml  Normal file
@@ -0,0 +1,150 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"Your goal is to rewrite the user query so that it is optimized for retrieval
|
||||
from a vector database. Consider how the query will be used to find relevant
|
||||
documents, and aim to make it more specific and context-aware. \n\n Do not include
|
||||
any other text than the rewritten query, especially any preamble or postamble
|
||||
and only add expected output format if its relevant to the rewritten query.
|
||||
\n\n Focus on the key words of the intended task and to retrieve the most relevant
|
||||
information. \n\n There will be some extra context provided that might need
|
||||
to be removed such as expected_output formats structured_outputs and other instructions."},
|
||||
{"role": "user", "content": "The original query is: What is Vidit''s favorite
|
||||
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
|
||||
color.\nyou MUST return the actual complete content as the final answer, not
|
||||
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1017'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1PIzEMhv8Kei97Sdnplwq5gTgAF8ShcFitRmnG
|
||||
nTFk4ihxq11V899Xs6gFJLja78djH8ANLFqKk+lqsZpP56vpYqJhublfP1eP65v1i79Lt9fdMwxS
|
||||
lj03lGHxkChe3cGgl4YCLCRRdPyzTTpZyKTnyDCQzQt5hYXvnJ576VMgZYkw8JmcUgP7XmvgO2FP
|
||||
BfbXAUHalGVTYOMuBIMtRy5dnckVibAoKgkG0Snvqf5my7GhP7CVQU+luJZgD8gSCBauFC7qoo40
|
||||
EpXiSPrEDeuPcrZ1e8msdOYlSIZBpu2uuHDEeWvi2L4NhuG3QflblPqRpaWcMv8P3Ka6ml/OLmaz
|
||||
6rKCwe6IkbL0SWuVV4pl/MNy5Di+6DRfGqioC+/Ci8p8NtcNqeNQxlTvfEfNSVwNX4R+1J/u+GAZ
|
||||
hn8AAAD//wMAIwJ79CICAAA=
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402c9db99ec4722-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:55:14 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
|
||||
"You are Information Agent. You have access to specific knowledge sources.\nYour
|
||||
personal goal is: Provide information based on knowledge sources\nTo give my
|
||||
best complete final answer to the task respond using the exact following format:\n\nThought:
|
||||
I now can give a great answer\nFinal Answer: Your final answer must be the great
|
||||
and the most complete as possible, it must be outcome described.\n\nI MUST use
|
||||
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
|
||||
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
|
||||
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
|
||||
complete content as the final answer, not a summary.\n\nBegin! This is VERY
|
||||
important to you, use the tools available and give your best Final Answer, your
|
||||
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- '*/*'
|
||||
accept-encoding:
|
||||
- gzip, deflate
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '951'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- openrouter.ai
|
||||
http-referer:
|
||||
- https://litellm.ai
|
||||
user-agent:
|
||||
- litellm/1.68.0
|
||||
x-title:
|
||||
- liteLLM
|
||||
method: POST
|
||||
uri: https://openrouter.ai/api/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kN1qGzEQRl9FfNdyul4nday73ARy
|
||||
VUpLE2jLIu+O15NoZ4QkOy1moa/R1+uTlE1wnEB7qU/zc84cwB0cepLZfHm+XMwXy/nF7II/3d7V
|
||||
H+tOPvsS3le3d+keFjHpnjtKcPgQSa5uYDFoRwEOGkk8v+tjmZ3rbGBhWOj6ntoCh3bry1mrQwxU
|
||||
WAUWbSJfqIM7rbVot8otZbivBwTtY9J1hpNdCBYbFs7bJpHPKnDIRSMsxBfeU/OfX5aOfsBVFgPl
|
||||
7HuCOyBpIDj4nDkXL2WiUSkkE+mNEX00rRfT856MN/0EarzkR0rGfJNrFh/M1dPbmS/ccfnz63c2
|
||||
G7/XxIVMq0GT4WzWYUdnsEi02WUfjiLPjCz9czCO3y3yz1xomCx6SjHxE8omNtViVV/WdbWqYLE7
|
||||
CsSkQyxN0QeSPF2wmgyOxz3lK4uixYdTcrmyb7ubjornkKexrW+31L0UV+M/pr6ufxF51TKOfwEA
|
||||
AP//AwBybekMaAIAAA==
|
||||
headers:
|
||||
Access-Control-Allow-Origin:
|
||||
- '*'
|
||||
CF-RAY:
|
||||
- 9402c9e1b94a4722-BOM
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Thu, 15 May 2025 12:55:15 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
Vary:
|
||||
- Accept-Encoding
|
||||
x-clerk-auth-message:
|
||||
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
|
||||
token-carrier=header)
|
||||
x-clerk-auth-reason:
|
||||
- token-invalid
|
||||
x-clerk-auth-status:
|
||||
- signed-out
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
@@ -27,7 +27,7 @@ class TestValidateToken(unittest.TestCase):
            audience="app_id_xxxx",
        )

-       mock_jwt.decode.assert_called_once_with(
+       mock_jwt.decode.assert_called_with(
            "aaaaa.bbbbbb.cccccc",
            "mock_signing_key",
            algorithms=["RS256"],
0  tests/evaluation/__init__.py  Normal file
0  tests/evaluation/metrics/__init__.py  Normal file
28  tests/evaluation/metrics/base_evaluation_metrics_test.py  Normal file
@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock
from crewai.agent import Agent
from crewai.task import Task

class BaseEvaluationMetricsTest:
    @pytest.fixture
    def mock_agent(self):
        agent = MagicMock(spec=Agent)
        agent.id = "test_agent_id"
        agent.role = "Test Agent"
        agent.goal = "Test goal"
        agent.tools = []
        return agent

    @pytest.fixture
    def mock_task(self):
        task = MagicMock(spec=Task)
        task.description = "Test task description"
        task.expected_output = "Test expected output"
        return task

    @pytest.fixture
    def execution_trace(self):
        return {
            "thinking": ["I need to analyze this data carefully"],
            "actions": ["Gathered information", "Analyzed data"]
        }
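Because the fixtures above are ordinary pytest fixtures declared on a mixin class, any test class that inherits BaseEvaluationMetricsTest receives them unchanged; that is how the metric test files below consume mock_agent, mock_task and execution_trace. The class below is a hypothetical example of that wiring only; "SomeMetric" does not exist in this PR.

```python
# Hypothetical example of reusing the shared fixtures above; TestSomeMetric
# stands in for any concrete metric test class and is not part of this PR.
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest


class TestSomeMetric(BaseEvaluationMetricsTest):
    def test_fixtures_are_inherited(self, mock_agent, mock_task, execution_trace):
        # The fixtures live on the mixin, so pytest injects them into any
        # subclass test exactly as the metric tests in this PR use them.
        assert mock_agent.role == "Test Agent"
        assert mock_task.description == "Test task description"
        assert "thinking" in execution_trace and "actions" in execution_trace
```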
59  tests/evaluation/metrics/test_goal_metrics.py  Normal file
@@ -0,0 +1,59 @@
from unittest.mock import patch, MagicMock
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM


class TestGoalAlignmentEvaluator(BaseEvaluationMetricsTest):
    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "score": 8.5,
            "feedback": "The agent correctly understood the task and produced relevant output."
        }
        """
        mock_create_llm.return_value = mock_llm

        evaluator = GoalAlignmentEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output"
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 8.5
        assert "correctly understood the task" in result.feedback

        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]
        assert mock_agent.role in prompt[1]["content"]
        assert mock_task.description in prompt[1]["content"]

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        evaluator = GoalAlignmentEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output"
        )

        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse" in result.feedback
166  tests/evaluation/metrics/test_reasoning_metrics.py  Normal file
@@ -0,0 +1,166 @@
import pytest
from unittest.mock import patch, MagicMock
from typing import List, Dict, Any

from crewai.tasks.task_output import TaskOutput
from crewai.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
)
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
from crewai.evaluation.base_evaluator import EvaluationScore

class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture
    def mock_output(self):
        output = MagicMock(spec=TaskOutput)
        output.raw = "This is the test output"
        return output

    @pytest.fixture
    def llm_calls(self) -> List[Dict[str, Any]]:
        return [
            {
                "prompt": "How should I approach this task?",
                "response": "I'll first research the topic, then compile findings.",
                "timestamp": 1626987654
            },
            {
                "prompt": "What resources should I use?",
                "response": "I'll use relevant academic papers and reliable websites.",
                "timestamp": 1626987754
            },
            {
                "prompt": "How should I structure the output?",
                "response": "I'll organize information clearly with headings and bullet points.",
                "timestamp": 1626987854
            }
        ]

    def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
        execution_trace = {"llm_calls": []}

        evaluator = ReasoningEfficiencyEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Insufficient LLM calls" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 8.0,
                "progression": 7.0,
                "decision_quality": 7.5,
                "conciseness": 8.0,
                "loop_avoidance": 9.0
            },
            "overall_score": 7.9,
            "feedback": "The agent demonstrated good reasoning efficiency.",
            "optimization_suggestions": "The agent could improve by being more concise."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with sufficient LLM calls
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method to return a simple result
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions
        assert isinstance(result, EvaluationScore)
        assert result.score == 7.9
        assert "The agent demonstrated good reasoning efficiency" in result.feedback
        assert "Reasoning Efficiency Evaluation:" in result.feedback
        assert "• Focus: 8.0/10" in result.feedback

        # Verify LLM was called
        mock_llm.call.assert_called_once()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_parse_error_handling(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        # Setup execution trace
        execution_trace = {"llm_calls": llm_calls}

        # Mock the _detect_loops method
        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)
        evaluator._detect_loops = MagicMock(return_value=(False, []))

        # Evaluate
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        # Assertions for error handling
        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse reasoning efficiency evaluation" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
        # Setup LLM calls with a repeating pattern
        repetitive_llm_calls = [
            {"prompt": "How to solve?", "response": "I'll try method A", "timestamp": 1000},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1100},
            {"prompt": "How to solve?", "response": "I'll try method A again", "timestamp": 1200},
            {"prompt": "Let me try method A", "response": "It didn't work", "timestamp": 1300},
            {"prompt": "How to solve?", "response": "I'll try method A one more time", "timestamp": 1400}
        ]

        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "scores": {
                "focus": 6.0,
                "progression": 3.0,
                "decision_quality": 4.0,
                "conciseness": 6.0,
                "loop_avoidance": 2.0
            },
            "overall_score": 4.2,
            "feedback": "The agent is stuck in a reasoning loop.",
            "optimization_suggestions": "The agent should try different approaches when one fails."
        }
        """
        mock_create_llm.return_value = mock_llm

        execution_trace = {"llm_calls": repetitive_llm_calls}

        evaluator = ReasoningEfficiencyEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=mock_output
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 4.2
        assert "• Loop Avoidance: 2.0/10" in result.feedback
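test_successful_evaluation and test_parse_error_handling above stub _detect_loops rather than exercising it, so the detector's real behavior is not shown in this diff. Purely to illustrate the (is_looping, loops) shape the MagicMock returns, a naive detector over the same llm_calls structure might look like the sketch below; the function name and logic are assumptions, not the PR's implementation.

```python
# Illustrative stand-in for the stubbed _detect_loops, assuming only the
# (bool, list) return shape used by the MagicMock in the tests above.
from collections import Counter
from typing import Any, Dict, List, Tuple


def detect_repeated_prompts(llm_calls: List[Dict[str, Any]], threshold: int = 2) -> Tuple[bool, List[str]]:
    # Count identical prompts; anything seen more than `threshold` times is
    # flagged as a likely reasoning loop.
    counts = Counter(call.get("prompt", "") for call in llm_calls)
    loops = [prompt for prompt, n in counts.items() if n > threshold]
    return bool(loops), loops

# With the repetitive_llm_calls fixture above, "How to solve?" appears three
# times, so this sketch would report (True, ["How to solve?"]).
```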
82  tests/evaluation/metrics/test_semantic_quality_metrics.py  Normal file
@@ -0,0 +1,82 @@
from unittest.mock import patch, MagicMock

from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM

class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "score": 8.5,
            "feedback": "The output is clear, coherent, and logically structured."
        }
        """
        mock_create_llm.return_value = mock_llm

        evaluator = SemanticQualityEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is a well-structured analysis of the data."
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 8.5
        assert "clear, coherent" in result.feedback

        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]
        assert mock_agent.role in prompt[1]["content"]
        assert mock_task.description in prompt[1]["content"]

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_with_empty_output(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "score": 2.0,
            "feedback": "The output is empty or minimal, lacking substance."
        }
        """
        mock_create_llm.return_value = mock_llm

        evaluator = SemanticQualityEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output=""
        )

        assert isinstance(result, EvaluationScore)
        assert result.score == 2.0
        assert "empty or minimal" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        evaluator = SemanticQualityEvaluator(llm=mock_llm)

        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the output."
        )

        assert isinstance(result, EvaluationScore)
        assert result.score is None
        assert "Failed to parse" in result.feedback
230  tests/evaluation/metrics/test_tools_metrics.py  Normal file
@@ -0,0 +1,230 @@
from unittest.mock import patch, MagicMock

from crewai.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
        # Create agent with no tools
        mock_agent.tools = []

        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tools available" in result.feedback.lower()

    def test_tools_available_but_none_used(self, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        execution_trace = {"tool_uses": []}

        evaluator = ToolSelectionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "had tools available but didn't use any" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.5,
            "feedback": "The agent made good tool selections."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolSelectionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.5
        assert "The agent made good tool selections" in result.feedback

        # Verify LLM was called with correct prompt
        mock_llm.call.assert_called_once()
        prompt = mock_llm.call.call_args[0][0]
        assert isinstance(prompt, list)
        assert len(prompt) >= 2
        assert "system" in prompt[0]["role"]
        assert "user" in prompt[1]["role"]


class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ParameterExtractionEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]

        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 9.0,
            "feedback": "The agent extracted parameters correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+2"},
                    "output": "4",
                    "error": None
                }
            ]
        }

        evaluator = ParameterExtractionEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 9.0
        assert "The agent extracted parameters correctly" in result.feedback


class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
    def test_no_tool_uses(self, mock_agent, mock_task):
        execution_trace = {"tool_uses": []}

        evaluator = ToolInvocationEvaluator()
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score is None
        assert "no tool usage" in result.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 8.0,
            "feedback": "The agent invoked tools correctly."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses
        execution_trace = {
            "tool_uses": [
                {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"},
                {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 8.0
        assert "The agent invoked tools correctly" in result.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
        mock_agent.tools = ["tool1", "tool2"]
        # Setup mock LLM response
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = """
        {
            "overall_score": 5.5,
            "feedback": "The agent had some errors in tool invocation."
        }
        """
        mock_create_llm.return_value = mock_llm

        # Setup execution trace with tool uses including errors
        execution_trace = {
            "tool_uses": [
                {
                    "tool": "search_tool",
                    "input": {"query": "test query"},
                    "output": "search results",
                    "error": None
                },
                {
                    "tool": "calculator",
                    "input": {"expression": "2+"},
                    "output": None,
                    "error": "Invalid expression"
                }
            ]
        }

        evaluator = ToolInvocationEvaluator(llm=mock_llm)
        result = evaluator.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="Final output"
        )

        assert result.score == 5.5
        assert "The agent had some errors in tool invocation" in result.feedback
95  tests/evaluation/test_agent_evaluator.py  Normal file
@@ -0,0 +1,95 @@
import pytest

from crewai.agent import Agent
from crewai.task import Task
from crewai.crew import Crew
from crewai.evaluation.agent_evaluator import AgentEvaluator
from crewai.evaluation.base_evaluator import AgentEvaluationResult
from crewai.evaluation import (
    GoalAlignmentEvaluator,
    SemanticQualityEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator,
    ReasoningEfficiencyEvaluator
)

from crewai.evaluation import create_default_evaluator
class TestAgentEvaluator:
    @pytest.fixture
    def mock_crew(self):
        agent = Agent(
            role="Test Agent",
            goal="Complete test tasks successfully",
            backstory="An agent created for testing purposes",
            allow_delegation=False,
            verbose=False
        )

        task = Task(
            description="Test task description",
            agent=agent,
            expected_output="Expected test output"
        )

        crew = Crew(
            agents=[agent],
            tasks=[task]
        )
        return crew

    def test_set_iteration(self):
        agent_evaluator = AgentEvaluator()

        agent_evaluator.set_iteration(3)
        assert agent_evaluator.iteration == 3

    @pytest.mark.vcr(filter_headers=["authorization"])
    def test_evaluate_current_iteration(self, mock_crew):
        agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])

        mock_crew.kickoff()

        results = agent_evaluator.evaluate_current_iteration()

        assert isinstance(results, dict)

        agent, = mock_crew.agents
        task, = mock_crew.tasks

        assert len(mock_crew.agents) == 1
        assert agent.role in results
        assert len(results[agent.role]) == 1

        result, = results[agent.role]
        assert isinstance(result, AgentEvaluationResult)

        assert result.agent_id == str(agent.id)
        assert result.task_id == str(task.id)

        goal_alignment, = result.metrics.values()
        assert goal_alignment.score == 5.0

        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
        assert expected_feedback in goal_alignment.feedback

        assert goal_alignment.raw_response is not None
        assert '"score": 5' in goal_alignment.raw_response

    def test_create_default_evaluator(self, mock_crew):
        agent_evaluator = create_default_evaluator(crew=mock_crew)
        assert isinstance(agent_evaluator, AgentEvaluator)
        assert agent_evaluator.crew == mock_crew

        expected_types = [
            GoalAlignmentEvaluator,
            SemanticQualityEvaluator,
            ToolSelectionEvaluator,
            ParameterExtractionEvaluator,
            ToolInvocationEvaluator,
            ReasoningEfficiencyEvaluator
        ]

        assert len(agent_evaluator.evaluators) == len(expected_types)
        for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
            assert isinstance(evaluator, expected_type)
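Read end to end, the tests above already spell out the evaluator lifecycle: build an evaluator around a crew, bump the iteration, run the crew, then collect per-role results. The driver below is a minimal sketch inferred from those test bodies only, not a documented API; the crew variable is assumed to be built the same way as mock_crew, and the iteration count and printing are illustrative choices.

```python
# Minimal usage sketch based only on the calls exercised in the tests above
# (create_default_evaluator, set_iteration, kickoff, evaluate_current_iteration).
from crewai.evaluation import create_default_evaluator

agent_evaluator = create_default_evaluator(crew=crew)  # `crew` built as in mock_crew above

for iteration in (1, 2):
    agent_evaluator.set_iteration(iteration)
    crew.kickoff()
    # Returns {agent role: [AgentEvaluationResult, ...]} per the assertions above.
    results = agent_evaluator.evaluate_current_iteration()
    for role, agent_results in results.items():
        for evaluation in agent_results:
            print(iteration, role, evaluation.metrics)
```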
@@ -601,7 +601,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
    def get_weather_error(location):
        raise Exception("Error")

    llm = LLM(model="openai/gpt-4o", stream=True)
    response = llm.call(
        messages=[
@@ -619,7 +619,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_em
        expected_stream_chunk=9,
        expected_completed_llm_call=1,
        expected_tool_usage_started=1,
-       expected_tool_usage_error=1,
+       expected_tool_usage_error=1,
        expected_final_chunk_result=expected_final_chunk_result,
    )