Fix TaskEvaluation validation errors for missing quality and dict suggestions

Fixes #3915

This commit addresses Pydantic validation errors that occur when the LLM
output doesn't match the expected TaskEvaluation schema:

1. Missing 'quality' field - LLM sometimes omits this field
2. 'suggestions' as list of dicts - LLM returns [{'point': '...', 'priority': 'high'}]
   instead of list[str]
3. 'score' field instead of 'quality' - LLM uses 'score' as alternate field name
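
For example, a payload shaped like the one reported in the issue
previously failed validation (illustrative sketch, based on the test
cases added below; assumes TaskEvaluation is imported):

    # Before this change: raises pydantic.ValidationError, because
    # 'quality' is missing and each suggestion is a dict, not a str
    TaskEvaluation.model_validate({
        "score": 3,
        "suggestions": [
            {"point": "Use proper formatting", "priority": "medium"},
        ],
    })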

Changes:
- Make 'quality' field optional (float | None) with default None
- Make 'suggestions' field optional with default empty list
- Make 'entities' field optional with default empty list
- Add ConfigDict(extra='ignore') to ignore unexpected fields
- Add model_validator to map 'score' to 'quality' when quality is missing
- Add field_validator for 'suggestions' to normalize dict format to list[str]
  - Extracts 'point' value from dicts with 'point' key
  - Handles single dict, single string, list of mixed types, and None
- Add field_validator for 'quality' to coerce int/str to float

The fix is backward compatible - LongTermMemoryItem already accepts
quality=None, and the strict crew evaluation path uses a separate
TaskEvaluationPydanticOutput model that remains unchanged.
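
After this change, the same payload validates and is normalized
(illustrative sketch; the expected values mirror the new tests):

    result = TaskEvaluation.model_validate({
        "score": 3,
        "suggestions": [{"point": "Use proper formatting", "priority": "medium"}],
    })
    assert result.quality == 3.0
    assert result.suggestions == ["Use proper formatting"]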

Tests:
- Added 16 comprehensive unit tests covering all edge cases
- All existing tests continue to pass
- Tests replicate exact error scenarios from issue #3915

Co-Authored-By: João <joao@crewai.com>
Author: Devin AI
Date: 2025-11-14 12:42:01 +00:00
Parent: d7bdac12a2
Commit: 60dd25aba8
3 changed files with 4570 additions and 4037 deletions

crewai/utilities/evaluators/task_evaluator.py

@@ -1,8 +1,8 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

 from crewai.events.event_bus import crewai_event_bus
 from crewai.events.types.task_events import TaskEvaluationEvent
@@ -25,16 +25,90 @@ class Entity(BaseModel):
 class TaskEvaluation(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     suggestions: list[str] = Field(
+        default_factory=list,
         description="Suggestions to improve future similar tasks."
     )
-    quality: float = Field(
+    quality: float | None = Field(
+        default=None,
         description="A score from 0 to 10 evaluating on completion, quality, and overall performance, all taking into account the task description, expected output, and the result of the task."
     )
     entities: list[Entity] = Field(
+        default_factory=list,
         description="Entities extracted from the task output."
     )
+
+    @model_validator(mode="before")
+    @classmethod
+    def map_score_to_quality(cls, data: Any) -> Any:
+        """Map 'score' field to 'quality' if quality is missing."""
+        if isinstance(data, dict):
+            if "quality" not in data and "score" in data:
+                data["quality"] = data["score"]
+        return data
+
+    @field_validator("suggestions", mode="before")
+    @classmethod
+    def normalize_suggestions(cls, v: Any) -> list[str]:
+        """Normalize suggestions from various formats to list[str].
+
+        Handles:
+        - None → []
+        - str → [str]
+        - dict with "point" → [point]
+        - dict without "point" → [str(dict)]
+        - list → flatten using same rules per item
+        """
+        if v is None:
+            return []
+        if isinstance(v, str):
+            return [v]
+        if isinstance(v, dict):
+            if "point" in v:
+                return [str(v["point"])]
+            return [str(v)]
+        if isinstance(v, list):
+            result = []
+            for item in v:
+                if isinstance(item, str):
+                    result.append(item)
+                elif isinstance(item, dict):
+                    if "point" in item:
+                        result.append(str(item["point"]))
+                    else:
+                        result.append(str(item))
+                else:
+                    result.append(str(item))
+            return result
+        return [str(v)]
+
+    @field_validator("quality", mode="before")
+    @classmethod
+    def coerce_quality(cls, v: Any) -> float | None:
+        """Coerce quality to float, accepting int and numeric strings.
+
+        Returns None if value is None, empty string, or cannot be parsed.
+        """
+        if v is None or v == "":
+            return None
+        if isinstance(v, (int, float)):
+            return float(v)
+        if isinstance(v, str):
+            try:
+                return float(v)
+            except ValueError:
+                return None
+        return None
+
+
 class TrainingTaskEvaluation(BaseModel):
     suggestions: list[str] = Field(

New file: TaskEvaluation validation tests

@@ -0,0 +1,267 @@
+"""Tests for TaskEvaluation model validation and normalization.
+
+These tests verify that TaskEvaluation correctly handles malformed LLM output
+as reported in issue #3915, including:
+
+- Missing quality field
+- Suggestions as list of dicts with 'point' and 'priority' keys
+- Score field instead of quality field
+- Extra fields that should be ignored
+"""
+
+import pytest
+from pydantic import ValidationError
+
+from crewai.utilities.evaluators.task_evaluator import Entity, TaskEvaluation
+
+
+def test_missing_quality_and_dict_suggestions_normalize():
+    """Test that missing quality and dict suggestions are normalized correctly.
+
+    This replicates the exact error from issue #3915 where:
+    - quality field is missing
+    - suggestions is a list of dicts with 'point' and 'priority' keys
+    """
+    payload = {
+        "suggestions": [
+            {"point": "Proceed immediately with the task", "priority": "high"},
+            {"point": "When asking for information, be specific", "priority": "medium"},
+            {"point": "Use markdown formatting", "priority": "medium"},
+        ],
+        "entities": [],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+    assert result.suggestions == [
+        "Proceed immediately with the task",
+        "When asking for information, be specific",
+        "Use markdown formatting",
+    ]
+    assert result.entities == []
+
+
+def test_single_dict_suggestion():
+    """Test that a single dict suggestion is normalized to a list with extracted point."""
+    payload = {
+        "suggestions": {"point": "Improve response quality", "priority": "high"},
+        "quality": 8.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["Improve response quality"]
+    assert result.quality == 8.0
+
+
+def test_single_string_suggestion():
+    """Test that a single string suggestion is normalized to a list."""
+    payload = {
+        "suggestions": "This is a single suggestion",
+        "quality": 7.5,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["This is a single suggestion"]
+    assert result.quality == 7.5
+
+
+def test_mixed_suggestions():
+    """Test that mixed suggestion types are normalized correctly."""
+    payload = {
+        "suggestions": [
+            "String suggestion",
+            {"point": "Dict with point", "priority": "high"},
+            {"other": "Dict without point"},
+            123,
+        ],
+        "quality": 6.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == [
+        "String suggestion",
+        "Dict with point",
+        "{'other': 'Dict without point'}",
+        "123",
+    ]
+
+
+def test_quality_from_score():
+    """Test that 'score' field is mapped to 'quality' when quality is missing."""
+    payload = {
+        "score": 3,
+        "suggestions": ["Improve performance"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 3.0
+    assert result.suggestions == ["Improve performance"]
+
+
+def test_quality_str_number():
+    """Test that quality as a string number is coerced to float."""
+    payload = {
+        "quality": "7.5",
+        "suggestions": ["Good work"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 7.5
+
+
+def test_quality_int():
+    """Test that quality as an int is coerced to float."""
+    payload = {
+        "quality": 8,
+        "suggestions": ["Excellent"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 8.0
+
+
+def test_quality_invalid_string():
+    """Test that invalid quality string returns None."""
+    payload = {
+        "quality": "not a number",
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+
+
+def test_quality_empty_string():
+    """Test that empty quality string returns None."""
+    payload = {
+        "quality": "",
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality is None
+
+
+def test_extra_fields_ignored():
+    """Test that extra fields in the payload are ignored."""
+    payload = {
+        "suggestions": ["Test suggestion"],
+        "quality": 5.0,
+        "relationships": [],
+        "extra_field": "should be ignored",
+        "another_extra": 123,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == ["Test suggestion"]
+    assert result.quality == 5.0
+    assert result.entities == []
+
+
+def test_none_suggestions():
+    """Test that None suggestions are normalized to empty list."""
+    payload = {
+        "suggestions": None,
+        "quality": 5.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+
+
+def test_missing_all_optional_fields():
+    """Test that all optional fields can be missing."""
+    payload = {}
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+    assert result.quality is None
+    assert result.entities == []
+
+
+def test_entities_with_valid_data():
+    """Test that entities are parsed correctly when provided."""
+    payload = {
+        "suggestions": ["Test"],
+        "quality": 8.0,
+        "entities": [
+            {
+                "name": "John Doe",
+                "type": "Person",
+                "description": "A test person",
+                "relationships": ["knows Jane"],
+            }
+        ],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert len(result.entities) == 1
+    assert result.entities[0].name == "John Doe"
+    assert result.entities[0].type == "Person"
+
+
+def test_score_and_quality_both_present():
+    """Test that quality takes precedence when both score and quality are present."""
+    payload = {
+        "score": 3,
+        "quality": 9.0,
+        "suggestions": ["Test"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 9.0
+
+
+def test_issue_3915_exact_payload():
+    """Test the exact payload structure from issue #3915.
+
+    This is the actual error case reported in the issue where:
+    - quality field is missing
+    - suggestions contains dicts with 'point' and 'priority'
+    - relationships field is present (should be ignored)
+    """
+    payload = {
+        "score": 3,
+        "suggestions": [
+            {"point": "Complete the task immediately", "priority": "high"},
+            {"point": "Provide detailed explanations", "priority": "medium"},
+            {"point": "Use proper formatting", "priority": "medium"},
+        ],
+        "relationships": ["suggested"],
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.quality == 3.0
+    assert result.suggestions == [
+        "Complete the task immediately",
+        "Provide detailed explanations",
+        "Use proper formatting",
+    ]
+    assert result.entities == []
+
+
+def test_empty_suggestions_list():
+    """Test that empty suggestions list is handled correctly."""
+    payload = {
+        "suggestions": [],
+        "quality": 5.0,
+    }
+
+    result = TaskEvaluation.model_validate(payload)
+
+    assert result.suggestions == []
+    assert result.quality == 5.0

uv.lock (generated)

File diff suppressed because it is too large