Mirror of https://github.com/crewAIInc/crewAI.git, synced 2025-12-16 04:18:35 +00:00
Fix TaskEvaluation validation errors for missing quality and dict suggestions
Fixes #3915

This commit addresses Pydantic validation errors that occur when the LLM output doesn't match the expected TaskEvaluation schema:

1. Missing 'quality' field - LLM sometimes omits this field
2. 'suggestions' as list of dicts - LLM returns [{'point': '...', 'priority': 'high'}] instead of list[str]
3. 'score' field instead of 'quality' - LLM uses 'score' as alternate field name

Changes:
- Make 'quality' field optional (float | None) with default None
- Make 'suggestions' field optional with default empty list
- Make 'entities' field optional with default empty list
- Add ConfigDict(extra='ignore') to ignore unexpected fields
- Add model_validator to map 'score' to 'quality' when quality is missing
- Add field_validator for 'suggestions' to normalize dict format to list[str]
  - Extracts 'point' value from dicts with 'point' key
  - Handles single dict, single string, list of mixed types, and None
- Add field_validator for 'quality' to coerce int/str to float

The fix is backward compatible: LongTermMemoryItem already accepts quality=None, and the strict crew evaluation path uses a separate TaskEvaluationPydanticOutput model that remains unchanged.

Tests:
- Added 16 comprehensive unit tests covering all edge cases
- All existing tests continue to pass
- Tests replicate exact error scenarios from issue #3915

Co-Authored-By: João <joao@crewai.com>
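For illustration, a minimal sketch of the repaired behavior against the exact payload shape from issue #3915. It mirrors the unit tests added in this commit; the import path is taken from those tests.

    from crewai.utilities.evaluators.task_evaluator import TaskEvaluation

    # Malformed LLM output: 'score' instead of 'quality', suggestions as dicts,
    # and an unexpected 'relationships' key.
    payload = {
        "score": 3,
        "suggestions": [{"point": "Use proper formatting", "priority": "medium"}],
        "relationships": ["suggested"],
    }

    result = TaskEvaluation.model_validate(payload)
    assert result.quality == 3.0  # 'score' mapped to 'quality' and coerced to float
    assert result.suggestions == ["Use proper formatting"]  # dict flattened to its 'point'
    assert result.entities == []  # missing field falls back to the empty-list default

Before this change, the same payload raised a ValidationError on the missing 'quality' field and the dict-valued suggestions.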
@@ -1,8 +1,8 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

 from crewai.events.event_bus import crewai_event_bus
 from crewai.events.types.task_events import TaskEvaluationEvent
@@ -25,16 +25,90 @@ class Entity(BaseModel):


 class TaskEvaluation(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     suggestions: list[str] = Field(
+        default_factory=list,
         description="Suggestions to improve future similar tasks."
     )
-    quality: float = Field(
+    quality: float | None = Field(
+        default=None,
         description="A score from 0 to 10 evaluating on completion, quality, and overall performance, all taking into account the task description, expected output, and the result of the task."
     )
     entities: list[Entity] = Field(
+        default_factory=list,
         description="Entities extracted from the task output."
     )
+
+    @model_validator(mode="before")
+    @classmethod
+    def map_score_to_quality(cls, data: Any) -> Any:
+        """Map 'score' field to 'quality' if quality is missing."""
+        if isinstance(data, dict):
+            if "quality" not in data and "score" in data:
+                data["quality"] = data["score"]
+        return data
+
+    @field_validator("suggestions", mode="before")
+    @classmethod
+    def normalize_suggestions(cls, v: Any) -> list[str]:
+        """Normalize suggestions from various formats to list[str].
+
+        Handles:
+        - None → []
+        - str → [str]
+        - dict with "point" → [point]
+        - dict without "point" → [str(dict)]
+        - list → flatten using same rules per item
+        """
+        if v is None:
+            return []
+
+        if isinstance(v, str):
+            return [v]
+
+        if isinstance(v, dict):
+            if "point" in v:
+                return [str(v["point"])]
+            return [str(v)]
+
+        if isinstance(v, list):
+            result = []
+            for item in v:
+                if isinstance(item, str):
+                    result.append(item)
+                elif isinstance(item, dict):
+                    if "point" in item:
+                        result.append(str(item["point"]))
+                    else:
+                        result.append(str(item))
+                else:
+                    result.append(str(item))
+            return result
+
+        return [str(v)]
+
+    @field_validator("quality", mode="before")
+    @classmethod
+    def coerce_quality(cls, v: Any) -> float | None:
+        """Coerce quality to float, accepting int and numeric strings.
+
+        Returns None if value is None, empty string, or cannot be parsed.
+        """
+        if v is None or v == "":
+            return None
+
+        if isinstance(v, (int, float)):
+            return float(v)
+
+        if isinstance(v, str):
+            try:
+                return float(v)
+            except ValueError:
+                return None
+
+        return None


 class TrainingTaskEvaluation(BaseModel):
     suggestions: list[str] = Field(
@@ -0,0 +1,267 @@
+"""Tests for TaskEvaluation model validation and normalization.
+
+These tests verify that TaskEvaluation correctly handles malformed LLM output
+as reported in issue #3915, including:
+- Missing quality field
+- Suggestions as list of dicts with 'point' and 'priority' keys
+- Score field instead of quality field
+- Extra fields that should be ignored
+"""
+
+import pytest
+from pydantic import ValidationError
+
+from crewai.utilities.evaluators.task_evaluator import Entity, TaskEvaluation
+
+
+def test_missing_quality_and_dict_suggestions_normalize():
+    """Test that missing quality and dict suggestions are normalized correctly.
+
+    This replicates the exact error from issue #3915 where:
+    - quality field is missing
+    - suggestions is a list of dicts with 'point' and 'priority' keys
+    """
+    payload = {
+        "suggestions": [
+            {"point": "Proceed immediately with the task", "priority": "high"},
+            {"point": "When asking for information, be specific", "priority": "medium"},
+            {"point": "Use markdown formatting", "priority": "medium"},
+        ],
+        "entities": [],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+    assert result.suggestions == [
+        "Proceed immediately with the task",
+        "When asking for information, be specific",
+        "Use markdown formatting",
+    ]
+    assert result.entities == []
+
+
+def test_single_dict_suggestion():
+    """Test that a single dict suggestion is normalized to a list with extracted point."""
+    payload = {
+        "suggestions": {"point": "Improve response quality", "priority": "high"},
+        "quality": 8.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["Improve response quality"]
+    assert result.quality == 8.0
+
+
+def test_single_string_suggestion():
+    """Test that a single string suggestion is normalized to a list."""
+    payload = {
+        "suggestions": "This is a single suggestion",
+        "quality": 7.5,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["This is a single suggestion"]
+    assert result.quality == 7.5
+
+
+def test_mixed_suggestions():
+    """Test that mixed suggestion types are normalized correctly."""
+    payload = {
+        "suggestions": [
+            "String suggestion",
+            {"point": "Dict with point", "priority": "high"},
+            {"other": "Dict without point"},
+            123,
+        ],
+        "quality": 6.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == [
+        "String suggestion",
+        "Dict with point",
+        "{'other': 'Dict without point'}",
+        "123",
+    ]
+
+
+def test_quality_from_score():
+    """Test that 'score' field is mapped to 'quality' when quality is missing."""
+    payload = {
+        "score": 3,
+        "suggestions": ["Improve performance"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 3.0
+    assert result.suggestions == ["Improve performance"]
+
+
+def test_quality_str_number():
+    """Test that quality as a string number is coerced to float."""
+    payload = {
+        "quality": "7.5",
+        "suggestions": ["Good work"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 7.5
+
+
+def test_quality_int():
+    """Test that quality as an int is coerced to float."""
+    payload = {
+        "quality": 8,
+        "suggestions": ["Excellent"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 8.0
+
+
+def test_quality_invalid_string():
+    """Test that invalid quality string returns None."""
+    payload = {
+        "quality": "not a number",
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+
+
+def test_quality_empty_string():
+    """Test that empty quality string returns None."""
+    payload = {
+        "quality": "",
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+
+
+def test_extra_fields_ignored():
+    """Test that extra fields in the payload are ignored."""
+    payload = {
+        "suggestions": ["Test suggestion"],
+        "quality": 5.0,
+        "relationships": [],
+        "extra_field": "should be ignored",
+        "another_extra": 123,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["Test suggestion"]
+    assert result.quality == 5.0
+    assert result.entities == []
+
+
+def test_none_suggestions():
+    """Test that None suggestions are normalized to empty list."""
+    payload = {
+        "suggestions": None,
+        "quality": 5.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+
+
+def test_missing_all_optional_fields():
+    """Test that all optional fields can be missing."""
+    payload = {}
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+    assert result.quality is None
+    assert result.entities == []
+
+
+def test_entities_with_valid_data():
+    """Test that entities are parsed correctly when provided."""
+    payload = {
+        "suggestions": ["Test"],
+        "quality": 8.0,
+        "entities": [
+            {
+                "name": "John Doe",
+                "type": "Person",
+                "description": "A test person",
+                "relationships": ["knows Jane"],
+            }
+        ],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert len(result.entities) == 1
+    assert result.entities[0].name == "John Doe"
+    assert result.entities[0].type == "Person"
+
+
+def test_score_and_quality_both_present():
+    """Test that quality takes precedence when both score and quality are present."""
+    payload = {
+        "score": 3,
+        "quality": 9.0,
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 9.0
+
+
+def test_issue_3915_exact_payload():
+    """Test the exact payload structure from issue #3915.
+
+    This is the actual error case reported in the issue where:
+    - quality field is missing
+    - suggestions contains dicts with 'point' and 'priority'
+    - relationships field is present (should be ignored)
+    """
+    payload = {
+        "score": 3,
+        "suggestions": [
+            {"point": "Complete the task immediately", "priority": "high"},
+            {"point": "Provide detailed explanations", "priority": "medium"},
+            {"point": "Use proper formatting", "priority": "medium"},
+        ],
+        "relationships": ["suggested"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 3.0
+    assert result.suggestions == [
+        "Complete the task immediately",
+        "Provide detailed explanations",
+        "Use proper formatting",
+    ]
+    assert result.entities == []
+
+
+def test_empty_suggestions_list():
+    """Test that empty suggestions list is handled correctly."""
+    payload = {
+        "suggestions": [],
+        "quality": 5.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+    assert result.quality == 5.0