Merge branch 'main' of github.com:crewAIInc/crewAI into devin/1751908431-fix-lite-agent-llm-isinstance-check

This commit is contained in:
lorenzejay
2025-07-11 17:10:54 -07:00
48 changed files with 3994 additions and 218 deletions

View File

@@ -1896,6 +1896,80 @@ def test_agent_with_knowledge_sources_generate_search_query():
assert "red" in result.raw.lower()
@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_with_knowledge_with_no_crewai_knowledge():
    """Agent-level knowledge is queried exactly once when the crew has none.

    The ``Knowledge`` object is a spec'd mock attached to the agent only; the
    crew is built without a ``knowledge`` argument.
    """
    agent_kb = MagicMock(spec=Knowledge)
    openrouter_llm = LLM(
        model="openrouter/openai/gpt-4o-mini",
        api_key=os.getenv('OPENROUTER_API_KEY'),
    )
    info_agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=openrouter_llm,
        knowledge=agent_kb,
    )

    # A question the agent is expected to answer from its knowledge.
    # NOTE(review): the "favorclearite" spelling also appears in recorded
    # cassette request bodies, so it is intentionally left as-is.
    color_task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=info_agent,
    )

    Crew(agents=[info_agent], tasks=[color_task]).kickoff()

    agent_kb.query.assert_called_once()
@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_with_only_crewai_knowledge():
    """Crew-level knowledge is queried exactly once when the agent has none.

    The ``Knowledge`` object is a spec'd mock passed to the crew; the agent is
    built without a ``knowledge`` argument.
    """
    crew_kb = MagicMock(spec=Knowledge)
    openrouter_llm = LLM(
        model="openrouter/openai/gpt-4o-mini",
        api_key=os.getenv('OPENROUTER_API_KEY'),
    )
    info_agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=openrouter_llm,
    )

    # A question the agent is expected to answer from the crew's knowledge.
    # NOTE(review): the "favorclearite" spelling also appears in recorded
    # cassette request bodies, so it is intentionally left as-is.
    color_task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=info_agent,
    )

    Crew(agents=[info_agent], tasks=[color_task], knowledge=crew_kb).kickoff()

    crew_kb.query.assert_called_once()
@pytest.mark.vcr(record_mode='none', filter_headers=["authorization"])
def test_agent_knowledege_with_crewai_knowledge():
    """Agent-level and crew-level knowledge are each queried exactly once.

    Two independent spec'd ``Knowledge`` mocks are used: one attached to the
    agent and one attached to the crew.
    """
    crew_kb = MagicMock(spec=Knowledge)
    agent_kb = MagicMock(spec=Knowledge)
    openrouter_llm = LLM(
        model="openrouter/openai/gpt-4o-mini",
        api_key=os.getenv('OPENROUTER_API_KEY'),
    )
    info_agent = Agent(
        role="Information Agent",
        goal="Provide information based on knowledge sources",
        backstory="You have access to specific knowledge sources.",
        llm=openrouter_llm,
        knowledge=agent_kb,
    )

    # A question the agent is expected to answer from the knowledge sources.
    # NOTE(review): the "favorclearite" spelling also appears in recorded
    # cassette request bodies, so it is intentionally left as-is.
    color_task = Task(
        description="What is Vidit's favorite color?",
        expected_output="Vidit's favorclearite color.",
        agent=info_agent,
    )

    Crew(agents=[info_agent], tasks=[color_task], knowledge=crew_kb).kickoff()

    # Both knowledge bases must have been consulted.
    agent_kb.query.assert_called_once()
    crew_kb.query.assert_called_once()
@pytest.mark.vcr(filter_headers=["authorization"])
def test_litellm_auth_error_handling():
"""Test that LiteLLM authentication errors are handled correctly and not retried."""

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,150 @@
interactions:
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"Your goal is to rewrite the user query so that it is optimized for retrieval
from a vector database. Consider how the query will be used to find relevant
documents, and aim to make it more specific and context-aware. \n\n Do not include
any other text than the rewritten query, especially any preamble or postamble
and only add expected output format if its relevant to the rewritten query.
\n\n Focus on the key words of the intended task and to retrieve the most relevant
information. \n\n There will be some extra context provided that might need
to be removed such as expected_output formats structured_outputs and other instructions."},
{"role": "user", "content": "The original query is: What is Vidit''s favorite
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
color.\nyou MUST return the actual complete content as the final answer, not
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1017'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TMrmgyadG8ceECAhhIrQarrj
3bidHY/GTgSK9r+jpUpaJLja78djn8ARHgPlxXK72a6X6+12szhq7Id72d2V8b58/nbzQb98gkOp
cuRIFR4fC+X3d3AYJVKChxTKgd8OxRYbWYycGQ7y8EidwaPbB7vuZCyJjCXDoasUjCL8S61Dtxfu
SOG/n5BkKFUeFD4fUnLoObPu20pBJcNDTQoccjA+UvufLedIP+Ebh5FUw0DwJ1RJBI+gymoh20wj
2SjPpF85sr3Rqz4cpbLRVSdJ6jUcKvUHDenM81zFeXgeTNMPB/2lRuMMM1Atlf8k9qVt1rer3WrV
3DZwOJw5SpWxWGvyRFnnR7ybQc4/usxvHEwspBfhbun+NreRLHDSObUL3Z7iRdxM/wh9rb/c8coy
Tb8BAAD//wMAqVt3JyMCAAA=
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402cb503aec46c0-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:56:14 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"You are Information Agent. You have access to specific knowledge sources.\nYour
personal goal is: Provide information based on knowledge sources\nTo give my
best complete final answer to the task respond using the exact following format:\n\nThought:
I now can give a great answer\nFinal Answer: Your final answer must be the great
and the most complete as possible, it must be outcome described.\n\nI MUST use
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
complete content as the final answer, not a summary.\n\nBegin! This is VERY
important to you, use the tools available and give your best Final Answer, your
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '951'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kE9rG0EMxb/K8C69jNON7WJ7boFS
CD2ENm2g/1jGs/Ja7aw0zIydBuPvXjbBcQrtUU9P0u/pAO7g0JNMLhfzxexytli8mdy8r7c6/3Lb
v13eff00088fPj7AImXdc0cZDjeJ5OoaFoN2FOGgicTz6z7VyVwnAwvDQtc/KVQ4hK2vF0GHFKmy
CixCJl+pgzuftQhb5UAF7tsBUfuUdV3gZBejxYaFy7bN5IsKHErVBAvxlffU/qfL0tFvuMZioFJ8
T3AHZI0EB18Kl+qljjQqlWQkvTai9yZ4MT3vyXjTj6DGS7mnbMx3ecfio7l6rJ25447rq2I2fq+Z
K5mgUbPhYtZxRxewyLTZFR9PMZ4IWfon4Xj8YVEeSqVhzNBTTpkfQTapbWar6XI6bVYNLHYn/JR1
SLWt+oukjP9rRv7Ta8/6yqJq9fGsLFf27+m2o+o5lnFt8GFL3bO5Of5j60v/c5AXI8fjHwAAAP//
AwDEkP8dZgIAAA==
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402cb55c9fe46c0-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:56:15 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
version: 1

View File

@@ -0,0 +1,151 @@
interactions:
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"Your goal is to rewrite the user query so that it is optimized for retrieval
from a vector database. Consider how the query will be used to find relevant
documents, and aim to make it more specific and context-aware. \n\n Do not include
any other text than the rewritten query, especially any preamble or postamble
and only add expected output format if its relevant to the rewritten query.
\n\n Focus on the key words of the intended task and to retrieve the most relevant
information. \n\n There will be some extra context provided that might need
to be removed such as expected_output formats structured_outputs and other instructions."},
{"role": "user", "content": "The original query is: What is Vidit''s favorite
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
color.\nyou MUST return the actual complete content as the final answer, not
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1017'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1vE0EMhv9K9V64TGCbNGQ7N46gIg6IXhBaTWed
Xbez49HYiaii/e9oqRKKBFf7/XjsE7iHx0B5db272W2uN++b3ep585k+jcmo/XqnYXvX5m/3cChV
jtxThceXQvnDRzhM0lOChxTKgd8NxVY3spo4Mxzk4ZGiwSOOwd5GmUoiY8lwiJWCUQ9/qW0d4igc
SeG/n5BkKFUeFD4fUnLYc2Ydu0pBJcNDTQoccjA+UvefLeeefsI3DhOphoHgT6iSCB5BldVCtoVG
slFeSO+5Z3ujV/twlMpGV1GSVDhU2h80pDPOSxPn4WUwzz8c9FmNpoVloFoq/w7cl67Z3K7b9bq5
beBwOGOUKlOxzuSJsi5/2C4c5xdd5lsHEwvpj7Bt3N/mricLnHRJjSGO1F/EzfyP0Nf6yx2vLPP8
CwAA//8DAOHu/cIiAgAA
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402c73df9d8859c-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:53:27 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"You are Information Agent. You have access to specific knowledge sources.\nYour
personal goal is: Provide information based on knowledge sources\nTo give my
best complete final answer to the task respond using the exact following format:\n\nThought:
I now can give a great answer\nFinal Answer: Your final answer must be the great
and the most complete as possible, it must be outcome described.\n\nI MUST use
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
complete content as the final answer, not a summary.\n\nBegin! This is VERY
important to you, use the tools available and give your best Final Answer, your
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '951'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kUGPEzEMhf+K5QuXdJmlpbvkthIg
emFXQoIDoMpNPFNDJo6STLul6n9H09KyIDjmxc9+/rxH8Wix4zi5vpndTK+n8+Z2wo9vXj28fHff
vW4+PNT5j1l6/wkNpqwb8ZzR4n3ieLdAg716DmhRE0eS512qk5lOeomCBnX1jV1Fi25N9cppnwJX
0YgGXWaq7NH+HmvQrVUcF7Sf9xi0S1lXBW0cQjDYSpSyXmamohEtlqoJDUaqsuHlf34len5E2xjs
uRTqGO0eswZGi1SKlEqxjmk0Vo5j0gVE3YKjCJ1sGAi6MShQLFvOAF/iW4kU4O74tvBRvNRnBVra
aJbK4DRoBikQtcJWPIcdeHVDz7GyB4mQhlUQF3ZAG5JAq8BQdMiOi4GisBiHj+ZftIHA87hePeY5
5cjcUfYSO1hLgZLYSSvurxRXaDBzOxQKZ4gnPhK7k3A4fDVYdqVyPxLsOKcsRwxtWvoVOZo3vm3Q
4HCGl7L2qS6rfudYxus1I73zYS/69NZg1UrhorwYD/yHe+m5koQytnXk1uwvxc3hH12f1l8WeWI5
HH4CAAD//wMAhZKqO+QCAAA=
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402c7459f3f859c-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:53:28 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
version: 1

View File

@@ -0,0 +1,150 @@
interactions:
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"Your goal is to rewrite the user query so that it is optimized for retrieval
from a vector database. Consider how the query will be used to find relevant
documents, and aim to make it more specific and context-aware. \n\n Do not include
any other text than the rewritten query, especially any preamble or postamble
and only add expected output format if its relevant to the rewritten query.
\n\n Focus on the key words of the intended task and to retrieve the most relevant
information. \n\n There will be some extra context provided that might need
to be removed such as expected_output formats structured_outputs and other instructions."},
{"role": "user", "content": "The original query is: What is Vidit''s favorite
color?\n\nThis is the expected criteria for your final answer: Vidit''s favorclearite
color.\nyou MUST return the actual complete content as the final answer, not
a summary.."}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '1017'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA//90kE1PIzEMhv8Kei97Sdnplwq5gTgAF8ShcFitRmnG
nTFk4ihxq11V899Xs6gFJLja78djH8ANLFqKk+lqsZpP56vpYqJhublfP1eP65v1i79Lt9fdMwxS
lj03lGHxkChe3cGgl4YCLCRRdPyzTTpZyKTnyDCQzQt5hYXvnJ576VMgZYkw8JmcUgP7XmvgO2FP
BfbXAUHalGVTYOMuBIMtRy5dnckVibAoKgkG0Snvqf5my7GhP7CVQU+luJZgD8gSCBauFC7qoo40
EpXiSPrEDeuPcrZ1e8msdOYlSIZBpu2uuHDEeWvi2L4NhuG3QflblPqRpaWcMv8P3Ka6ml/OLmaz
6rKCwe6IkbL0SWuVV4pl/MNy5Di+6DRfGqioC+/Ci8p8NtcNqeNQxlTvfEfNSVwNX4R+1J/u+GAZ
hn8AAAD//wMAIwJ79CICAAA=
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402c9db99ec4722-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:55:14 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
- request:
body: '{"model": "openai/gpt-4o-mini", "messages": [{"role": "system", "content":
"You are Information Agent. You have access to specific knowledge sources.\nYour
personal goal is: Provide information based on knowledge sources\nTo give my
best complete final answer to the task respond using the exact following format:\n\nThought:
I now can give a great answer\nFinal Answer: Your final answer must be the great
and the most complete as possible, it must be outcome described.\n\nI MUST use
these formats, my job depends on it!"}, {"role": "user", "content": "\nCurrent
Task: What is Vidit''s favorite color?\n\nThis is the expected criteria for
your final answer: Vidit''s favorclearite color.\nyou MUST return the actual
complete content as the final answer, not a summary.\n\nBegin! This is VERY
important to you, use the tools available and give your best Final Answer, your
job depends on it!\n\nThought:"}], "stream": false, "stop": ["\nObservation:"]}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '951'
content-type:
- application/json
host:
- openrouter.ai
http-referer:
- https://litellm.ai
user-agent:
- litellm/1.68.0
x-title:
- liteLLM
method: POST
uri: https://openrouter.ai/api/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//4lKAAS4AAAAA///iQjABAAAA//90kN1qGzEQRl9FfNdyul4nday73ARy
VUpLE2jLIu+O15NoZ4QkOy1moa/R1+uTlE1wnEB7qU/zc84cwB0cepLZfHm+XMwXy/nF7II/3d7V
H+tOPvsS3le3d+keFjHpnjtKcPgQSa5uYDFoRwEOGkk8v+tjmZ3rbGBhWOj6ntoCh3bry1mrQwxU
WAUWbSJfqIM7rbVot8otZbivBwTtY9J1hpNdCBYbFs7bJpHPKnDIRSMsxBfeU/OfX5aOfsBVFgPl
7HuCOyBpIDj4nDkXL2WiUSkkE+mNEX00rRfT856MN/0EarzkR0rGfJNrFh/M1dPbmS/ccfnz63c2
G7/XxIVMq0GT4WzWYUdnsEi02WUfjiLPjCz9czCO3y3yz1xomCx6SjHxE8omNtViVV/WdbWqYLE7
CsSkQyxN0QeSPF2wmgyOxz3lK4uixYdTcrmyb7ubjornkKexrW+31L0UV+M/pr6ufxF51TKOfwEA
AP//AwBybekMaAIAAA==
headers:
Access-Control-Allow-Origin:
- '*'
CF-RAY:
- 9402c9e1b94a4722-BOM
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Thu, 15 May 2025 12:55:15 GMT
Server:
- cloudflare
Transfer-Encoding:
- chunked
Vary:
- Accept-Encoding
x-clerk-auth-message:
- Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid,
token-carrier=header)
x-clerk-auth-reason:
- token-invalid
x-clerk-auth-status:
- signed-out
status:
code: 200
message: OK
version: 1

View File

@@ -27,7 +27,7 @@ class TestValidateToken(unittest.TestCase):
audience="app_id_xxxx",
)
mock_jwt.decode.assert_called_once_with(
mock_jwt.decode.assert_called_with(
"aaaaa.bbbbbb.cccccc",
"mock_signing_key",
algorithms=["RS256"],

View File

View File

View File

@@ -0,0 +1,28 @@
import pytest
from unittest.mock import MagicMock
from crewai.agent import Agent
from crewai.task import Task
class BaseEvaluationMetricsTest:
    """Common pytest fixtures shared by the evaluation-metric test classes."""

    @pytest.fixture
    def mock_agent(self):
        """Provide an ``Agent``-spec'd mock with a fixed id, role, goal and no tools."""
        stub_agent = MagicMock(spec=Agent)
        stub_agent.configure_mock(
            id="test_agent_id",
            role="Test Agent",
            goal="Test goal",
            tools=[],
        )
        return stub_agent

    @pytest.fixture
    def mock_task(self):
        """Provide a ``Task``-spec'd mock with a fixed description and expected output."""
        stub_task = MagicMock(spec=Task)
        stub_task.configure_mock(
            description="Test task description",
            expected_output="Test expected output",
        )
        return stub_task

    @pytest.fixture
    def execution_trace(self):
        """Provide a minimal trace dict with one thinking step and two actions."""
        return dict(
            thinking=["I need to analyze this data carefully"],
            actions=["Gathered information", "Analyzed data"],
        )

View File

@@ -0,0 +1,59 @@
from unittest.mock import patch, MagicMock
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM
class TestGoalAlignmentEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``GoalAlignmentEvaluator`` driven by a mocked LLM."""

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        """A well-formed JSON reply yields the score, the feedback and a two-message prompt."""
        llm_reply = (
            "\n"
            "{\n"
            '"score": 8.5,\n'
            '"feedback": "The agent correctly understood the task and produced relevant output."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        outcome = GoalAlignmentEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output",
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score == 8.5
        assert "correctly understood the task" in outcome.feedback

        mock_llm.call.assert_called_once()
        # First positional argument of the single LLM call is the message list.
        messages = mock_llm.call.call_args.args[0]
        assert len(messages) >= 2
        system_msg = messages[0]
        user_msg = messages[1]
        assert "system" in system_msg["role"]
        assert "user" in user_msg["role"]
        assert mock_agent.role in user_msg["content"]
        assert mock_task.description in user_msg["content"]

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        """A non-JSON reply yields a ``None`` score and a parse-failure message."""
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        outcome = GoalAlignmentEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the final output",
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score is None
        assert "Failed to parse" in outcome.feedback

View File

@@ -0,0 +1,166 @@
import pytest
from unittest.mock import patch, MagicMock
from typing import List, Dict, Any
from crewai.tasks.task_output import TaskOutput
from crewai.evaluation.metrics.reasoning_metrics import (
ReasoningEfficiencyEvaluator,
)
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
from crewai.evaluation.base_evaluator import EvaluationScore
class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``ReasoningEfficiencyEvaluator`` driven by a mocked LLM."""

    @pytest.fixture
    def mock_output(self):
        """Provide a ``TaskOutput``-spec'd mock whose ``raw`` text is fixed."""
        task_output = MagicMock(spec=TaskOutput)
        task_output.raw = "This is the test output"
        return task_output

    @pytest.fixture
    def llm_calls(self) -> List[Dict[str, Any]]:
        """Provide three prompt/response records with increasing timestamps."""
        exchanges = (
            (
                "How should I approach this task?",
                "I'll first research the topic, then compile findings.",
                1626987654,
            ),
            (
                "What resources should I use?",
                "I'll use relevant academic papers and reliable websites.",
                1626987754,
            ),
            (
                "How should I structure the output?",
                "I'll organize information clearly with headings and bullet points.",
                1626987854,
            ),
        )
        return [
            {"prompt": prompt, "response": response, "timestamp": timestamp}
            for prompt, response, timestamp in exchanges
        ]

    def test_insufficient_llm_calls(self, mock_agent, mock_task, mock_output):
        """An empty ``llm_calls`` trace yields no score and an 'Insufficient LLM calls' message."""
        empty_trace = {"llm_calls": []}

        outcome = ReasoningEfficiencyEvaluator().evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=empty_trace,
            final_output=mock_output,
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score is None
        assert "Insufficient LLM calls" in outcome.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        """A well-formed reply produces the overall score and the formatted feedback report."""
        llm_reply = (
            "\n"
            "{\n"
            '"scores": {\n'
            '"focus": 8.0,\n'
            '"progression": 7.0,\n'
            '"decision_quality": 7.5,\n'
            '"conciseness": 8.0,\n'
            '"loop_avoidance": 9.0\n'
            "},\n"
            '"overall_score": 7.9,\n'
            '"feedback": "The agent demonstrated good reasoning efficiency.",\n'
            '"optimization_suggestions": "The agent could improve by being more concise."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        # Trace carrying the fixture's LLM call records.
        trace = {"llm_calls": llm_calls}

        # Replace loop detection with a stub reporting "no loop found".
        metric = ReasoningEfficiencyEvaluator(llm=mock_llm)
        metric._detect_loops = MagicMock(return_value=(False, []))

        outcome = metric.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output=mock_output,
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score == 7.9
        assert "The agent demonstrated good reasoning efficiency" in outcome.feedback
        assert "Reasoning Efficiency Evaluation:" in outcome.feedback
        assert "• Focus: 8.0/10" in outcome.feedback

        # The LLM must have been consulted exactly once.
        mock_llm.call.assert_called_once()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_parse_error_handling(self, mock_create_llm, mock_agent, mock_task, mock_output, llm_calls):
        """A non-JSON reply yields a ``None`` score and a parse-failure message."""
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        trace = {"llm_calls": llm_calls}

        # Replace loop detection with a stub reporting "no loop found".
        metric = ReasoningEfficiencyEvaluator(llm=mock_llm)
        metric._detect_loops = MagicMock(return_value=(False, []))

        outcome = metric.evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output=mock_output,
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score is None
        assert "Failed to parse reasoning efficiency evaluation" in outcome.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_loop_detection(self, mock_create_llm, mock_agent, mock_task, mock_output):
        """A repetitive call history surfaces the low loop-avoidance score in the feedback."""
        # The same two prompts alternate, imitating an agent that keeps retrying.
        exchanges = (
            ("How to solve?", "I'll try method A", 1000),
            ("Let me try method A", "It didn't work", 1100),
            ("How to solve?", "I'll try method A again", 1200),
            ("Let me try method A", "It didn't work", 1300),
            ("How to solve?", "I'll try method A one more time", 1400),
        )
        repetitive_llm_calls = [
            {"prompt": prompt, "response": response, "timestamp": timestamp}
            for prompt, response, timestamp in exchanges
        ]

        llm_reply = (
            "\n"
            "{\n"
            '"scores": {\n'
            '"focus": 6.0,\n'
            '"progression": 3.0,\n'
            '"decision_quality": 4.0,\n'
            '"conciseness": 6.0,\n'
            '"loop_avoidance": 2.0\n'
            "},\n"
            '"overall_score": 4.2,\n'
            '"feedback": "The agent is stuck in a reasoning loop.",\n'
            '"optimization_suggestions": "The agent should try different approaches when one fails."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        trace = {"llm_calls": repetitive_llm_calls}

        outcome = ReasoningEfficiencyEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output=mock_output,
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score == 4.2
        assert "• Loop Avoidance: 2.0/10" in outcome.feedback

View File

@@ -0,0 +1,82 @@
from unittest.mock import patch, MagicMock
from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``SemanticQualityEvaluator`` driven by a mocked LLM."""

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_success(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        """A well-formed JSON reply yields the score, the feedback and a two-message prompt."""
        llm_reply = (
            "\n"
            "{\n"
            '"score": 8.5,\n'
            '"feedback": "The output is clear, coherent, and logically structured."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        outcome = SemanticQualityEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is a well-structured analysis of the data.",
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score == 8.5
        assert "clear, coherent" in outcome.feedback

        mock_llm.call.assert_called_once()
        # First positional argument of the single LLM call is the message list.
        messages = mock_llm.call.call_args.args[0]
        assert len(messages) >= 2
        system_msg = messages[0]
        user_msg = messages[1]
        assert "system" in system_msg["role"]
        assert "user" in user_msg["role"]
        assert mock_agent.role in user_msg["content"]
        assert mock_task.description in user_msg["content"]

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_with_empty_output(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        """An empty final output is still evaluated and the LLM's low score is returned."""
        llm_reply = (
            "\n"
            "{\n"
            '"score": 2.0,\n'
            '"feedback": "The output is empty or minimal, lacking substance."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        outcome = SemanticQualityEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="",
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score == 2.0
        assert "empty or minimal" in outcome.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluate_error_handling(self, mock_create_llm, mock_agent, mock_task, execution_trace):
        """A non-JSON reply yields a ``None`` score and a parse-failure message."""
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = "Invalid JSON response"
        mock_create_llm.return_value = mock_llm

        outcome = SemanticQualityEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=execution_trace,
            final_output="This is the output.",
        )

        assert isinstance(outcome, EvaluationScore)
        assert outcome.score is None
        assert "Failed to parse" in outcome.feedback

View File

@@ -0,0 +1,230 @@
from unittest.mock import patch, MagicMock
from crewai.evaluation.metrics.tools_metrics import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``ToolSelectionEvaluator``."""

    def test_no_tools_available(self, mock_task, mock_agent):
        """With an empty tool list there is no score and the feedback says no tools were available."""
        mock_agent.tools = []
        empty_trace = {"tool_uses": []}

        outcome = ToolSelectionEvaluator().evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=empty_trace,
            final_output="Final output",
        )

        assert outcome.score is None
        assert "no tools available" in outcome.feedback.lower()

    def test_tools_available_but_none_used(self, mock_agent, mock_task):
        """With tools present but an empty usage trace there is no score and the feedback says so."""
        mock_agent.tools = ["tool1", "tool2"]
        empty_trace = {"tool_uses": []}

        outcome = ToolSelectionEvaluator().evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=empty_trace,
            final_output="Final output",
        )

        assert outcome.score is None
        assert "had tools available but didn't use any" in outcome.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        """A well-formed reply yields the overall score, the feedback and a two-message prompt."""
        llm_reply = (
            "\n"
            "{\n"
            '"overall_score": 8.5,\n'
            '"feedback": "The agent made good tool selections."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        # Two recorded tool invocations make up the trace.
        search_use = {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"}
        calculator_use = {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
        trace = {"tool_uses": [search_use, calculator_use]}

        outcome = ToolSelectionEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output="Final output",
        )

        assert outcome.score == 8.5
        assert "The agent made good tool selections" in outcome.feedback

        # Exactly one LLM call, whose first positional argument is the message list.
        mock_llm.call.assert_called_once()
        messages = mock_llm.call.call_args.args[0]
        assert isinstance(messages, list)
        assert len(messages) >= 2
        assert "system" in messages[0]["role"]
        assert "user" in messages[1]["role"]
class TestParameterExtractionEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``ParameterExtractionEvaluator``."""

    def test_no_tool_uses(self, mock_agent, mock_task):
        """An empty usage trace yields no score and a 'no tool usage' message."""
        empty_trace = {"tool_uses": []}

        outcome = ParameterExtractionEvaluator().evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=empty_trace,
            final_output="Final output",
        )

        assert outcome.score is None
        assert "no tool usage" in outcome.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        """A well-formed reply yields the overall score and the feedback text."""
        mock_agent.tools = ["tool1", "tool2"]

        llm_reply = (
            "\n"
            "{\n"
            '"overall_score": 9.0,\n'
            '"feedback": "The agent extracted parameters correctly."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        # Two error-free tool invocations make up the trace.
        search_use = {
            "tool": "search_tool",
            "input": {"query": "test query"},
            "output": "search results",
            "error": None,
        }
        calculator_use = {
            "tool": "calculator",
            "input": {"expression": "2+2"},
            "output": "4",
            "error": None,
        }
        trace = {"tool_uses": [search_use, calculator_use]}

        outcome = ParameterExtractionEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output="Final output",
        )

        assert outcome.score == 9.0
        assert "The agent extracted parameters correctly" in outcome.feedback
class TestToolInvocationEvaluator(BaseEvaluationMetricsTest):
    """Tests for ``ToolInvocationEvaluator``."""

    def test_no_tool_uses(self, mock_agent, mock_task):
        """An empty usage trace yields no score and a 'no tool usage' message."""
        empty_trace = {"tool_uses": []}

        outcome = ToolInvocationEvaluator().evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=empty_trace,
            final_output="Final output",
        )

        assert outcome.score is None
        assert "no tool usage" in outcome.feedback.lower()

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_successful_evaluation(self, mock_create_llm, mock_agent, mock_task):
        """A well-formed reply yields the overall score and the feedback text."""
        mock_agent.tools = ["tool1", "tool2"]

        llm_reply = (
            "\n"
            "{\n"
            '"overall_score": 8.0,\n'
            '"feedback": "The agent invoked tools correctly."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        # Two recorded tool invocations make up the trace.
        search_use = {"tool": "search_tool", "input": {"query": "test query"}, "output": "search results"}
        calculator_use = {"tool": "calculator", "input": {"expression": "2+2"}, "output": "4"}
        trace = {"tool_uses": [search_use, calculator_use]}

        outcome = ToolInvocationEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output="Final output",
        )

        assert outcome.score == 8.0
        assert "The agent invoked tools correctly" in outcome.feedback

    @patch("crewai.utilities.llm_utils.create_llm")
    def test_evaluation_with_errors(self, mock_create_llm, mock_agent, mock_task):
        """A trace containing a failed tool call still returns the LLM's score and feedback."""
        mock_agent.tools = ["tool1", "tool2"]

        llm_reply = (
            "\n"
            "{\n"
            '"overall_score": 5.5,\n'
            '"feedback": "The agent had some errors in tool invocation."\n'
            "}\n"
        )
        mock_llm = MagicMock(spec=LLM)
        mock_llm.call.return_value = llm_reply
        mock_create_llm.return_value = mock_llm

        # One successful invocation followed by one that recorded an error.
        successful_use = {
            "tool": "search_tool",
            "input": {"query": "test query"},
            "output": "search results",
            "error": None,
        }
        failed_use = {
            "tool": "calculator",
            "input": {"expression": "2+"},
            "output": None,
            "error": "Invalid expression",
        }
        trace = {"tool_uses": [successful_use, failed_use]}

        outcome = ToolInvocationEvaluator(llm=mock_llm).evaluate(
            agent=mock_agent,
            task=mock_task,
            execution_trace=trace,
            final_output="Final output",
        )

        assert outcome.score == 5.5
        assert "The agent had some errors in tool invocation" in outcome.feedback

View File

@@ -0,0 +1,95 @@
import pytest
from crewai.agent import Agent
from crewai.task import Task
from crewai.crew import Crew
from crewai.evaluation.agent_evaluator import AgentEvaluator
from crewai.evaluation.base_evaluator import AgentEvaluationResult
from crewai.evaluation import (
GoalAlignmentEvaluator,
SemanticQualityEvaluator,
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
ReasoningEfficiencyEvaluator
)
from crewai.evaluation import create_default_evaluator
class TestAgentEvaluator:
    """Tests for AgentEvaluator and the ``create_default_evaluator`` factory."""

    @pytest.fixture
    def mock_crew(self):
        """Return a Crew made of a single agent assigned to a single task."""
        solo_agent = Agent(
            role="Test Agent",
            goal="Complete test tasks successfully",
            backstory="An agent created for testing purposes",
            allow_delegation=False,
            verbose=False,
        )
        solo_task = Task(
            description="Test task description",
            agent=solo_agent,
            expected_output="Expected test output",
        )
        return Crew(agents=[solo_agent], tasks=[solo_task])

    def test_set_iteration(self):
        """``set_iteration`` stores the given number on ``iteration``."""
        evaluator = AgentEvaluator()
        evaluator.set_iteration(3)

        assert evaluator.iteration == 3

    @pytest.mark.vcr(filter_headers=["authorization"])
    def test_evaluate_current_iteration(self, mock_crew):
        """Run the crew once and inspect the single goal-alignment evaluation."""
        # The evaluator is created before the crew runs; keep that ordering.
        evaluator = AgentEvaluator(
            crew=mock_crew,
            evaluators=[GoalAlignmentEvaluator()],
        )
        mock_crew.kickoff()

        per_agent = evaluator.evaluate_current_iteration()
        assert isinstance(per_agent, dict)

        # Single-element unpacking also fails if the crew has more than one entry.
        (agent,) = mock_crew.agents
        (task,) = mock_crew.tasks
        assert len(mock_crew.agents) == 1
        assert agent.role in per_agent

        agent_results = per_agent[agent.role]
        assert len(agent_results) == 1

        (evaluation,) = agent_results
        assert isinstance(evaluation, AgentEvaluationResult)
        assert evaluation.agent_id == str(agent.id)
        assert evaluation.task_id == str(task.id)

        # Exactly one metric is expected because one evaluator was supplied.
        # NOTE(review): the score and feedback below presumably mirror the
        # recorded VCR response - confirm against the cassette.
        (goal_alignment,) = evaluation.metrics.values()
        assert goal_alignment.score == 5.0

        expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
        assert expected_feedback in goal_alignment.feedback

        assert goal_alignment.raw_response is not None
        assert '"score": 5' in goal_alignment.raw_response

    def test_create_default_evaluator(self, mock_crew):
        """The factory returns an AgentEvaluator holding the six evaluators in order."""
        default_evaluator = create_default_evaluator(crew=mock_crew)

        assert isinstance(default_evaluator, AgentEvaluator)
        assert default_evaluator.crew == mock_crew

        # Order matters: each position is checked against the matching type.
        expected_types = (
            GoalAlignmentEvaluator,
            SemanticQualityEvaluator,
            ToolSelectionEvaluator,
            ParameterExtractionEvaluator,
            ToolInvocationEvaluator,
            ReasoningEfficiencyEvaluator,
        )
        assert len(default_evaluator.evaluators) == len(expected_types)

        for candidate, wanted_type in zip(default_evaluator.evaluators, expected_types):
            assert isinstance(candidate, wanted_type)

View File

@@ -601,7 +601,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
def get_weather_error(location):
raise Exception("Error")
llm = LLM(model="openai/gpt-4o", stream=True)
response = llm.call(
messages=[
@@ -619,7 +619,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_em
expected_stream_chunk=9,
expected_completed_llm_call=1,
expected_tool_usage_started=1,
expected_tool_usage_error=1,
expected_tool_usage_error=1,
expected_final_chunk_result=expected_final_chunk_result,
)