Introducing Agent evaluation (#3130)

* feat: add exchanged messages to LLMCallCompletedEvent

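For reference, a minimal sketch of what carrying the exchanged messages on the event could look like. Only the name LLMCallCompletedEvent comes from this PR; the base structure, field names, and types are assumptions:

    # Illustrative sketch only; everything besides the class name is assumed.
    from dataclasses import dataclass, field

    @dataclass
    class LLMCallCompletedEvent:
        response: str
        # New in this PR: the messages exchanged with the LLM, kept on the
        # event so evaluation metrics can inspect the full conversation.
        # A later fix in this PR allows this to be empty.
        messages: list[dict[str, str]] = field(default_factory=list)
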
* feat: add GoalAlignment metric for Agent evaluation

* feat: add SemanticQuality metric for Agent evaluation

* feat: add Tool Metrics for Agent evaluation

* feat: add Reasoning Metrics for Agent evaluation (still in progress)

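The four metric families above (goal alignment, semantic quality, tool use, reasoning) could plausibly share one interface. A hedged sketch under that assumption; apart from the GoalAlignment name, every identifier here is illustrative, not the PR's actual API:

    from dataclasses import dataclass
    from typing import Protocol

    @dataclass
    class MetricResult:
        score: float   # assumed 0-10 scale, for illustration only
        feedback: str  # human-readable explanation surfaced to the user

    class EvaluationMetric(Protocol):
        # Assumed signature; the real one in the PR may differ.
        def evaluate(self, goal: str, output: str,
                     messages: list[dict[str, str]]) -> MetricResult: ...

    class GoalAlignmentMetric:
        # Scores how well the agent's final output satisfies its goal.
        # A real implementation would likely ask an LLM judge; this stub
        # only checks for an empty output so the sketch stays runnable.
        def evaluate(self, goal: str, output: str,
                     messages: list[dict[str, str]]) -> MetricResult:
            if not output.strip():
                return MetricResult(0.0, "Empty output cannot satisfy the goal.")
            return MetricResult(10.0, f"Output plausibly addresses goal: {goal!r}")
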
* feat: add AgentEvaluator class

This class will evaluate an Agent's results and report them to the user

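Building on the metric sketch above, the evaluator itself could be as simple as a loop over configured metrics. Only the class name AgentEvaluator comes from the PR; the constructor and method shapes are assumptions:

    class AgentEvaluator:
        # Assumed shape: run every configured metric and collect results.
        def __init__(self, metrics: list[EvaluationMetric]):
            self.metrics = metrics

        def evaluate(self, goal: str, output: str,
                     messages: list[dict[str, str]]) -> list[MetricResult]:
            return [m.evaluate(goal, output, messages) for m in self.metrics]
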
* fix: do not evaluate Agent by default

This is an experimental feature; we still need to refine it further

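Since evaluation is off by default, using it is presumably an explicit opt-in. A hypothetical usage example continuing the sketches above; the stand-in variables and wiring are made up, not the PR's actual integration:

    # Hypothetical opt-in usage; the real wiring inside the framework may differ.
    agent_final_answer = "Revenue grew 12% quarter over quarter."  # stand-in output
    exchanged_llm_messages: list[dict[str, str]] = []              # from the event

    evaluator = AgentEvaluator(metrics=[GoalAlignmentMetric()])
    for result in evaluator.evaluate(
        goal="Summarize the quarterly report",
        output=agent_final_answer,
        messages=exchanged_llm_messages,
    ):
        print(f"{result.score:.1f} - {result.feedback}")
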
* test: add Agent eval tests

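An illustrative pytest in the spirit of the tests added here; it exercises the sketches above, not the PR's actual test suite:

    def test_goal_alignment_rejects_empty_output():
        result = GoalAlignmentMetric().evaluate(
            goal="write a summary", output="", messages=[]
        )
        assert result.score == 0.0
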
* fix: render all feedback per iteration

* style: resolve linter issues

* style: fix mypy issues

* fix: allow messages to be empty on LLMCallCompletedEvent
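
Given the event sketch above, this last fix means constructing the event with no exchanged messages is valid:

    # Valid after the fix: messages defaults to an empty list.
    event = LLMCallCompletedEvent(response="done")
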
Author: Lucas Gomide
Date: 2025-07-11 14:18:03 -03:00
Committed by: GitHub
Parent: bf8fa3232b
Commit: 08fa3797ca
26 changed files with 2930 additions and 14 deletions


@@ -601,7 +601,7 @@ def test_handle_streaming_tool_calls(get_weather_tool_schema, mock_emit):
 def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
     def get_weather_error(location):
         raise Exception("Error")
     llm = LLM(model="openai/gpt-4o", stream=True)
     response = llm.call(
         messages=[
@@ -619,7 +619,7 @@ def test_handle_streaming_tool_calls_with_error(get_weather_tool_schema, mock_emit):
         expected_stream_chunk=9,
         expected_completed_llm_call=1,
         expected_tool_usage_started=1,
-        expected_tool_usage_error=1,
+        expected_tool_usage_error=1,
         expected_final_chunk_result=expected_final_chunk_result,
     )