mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-19 12:58:14 +00:00
Compare commits
2 Commits
devin/1768
...
devin/1763
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
76318c6689 | ||
|
|
60dd25aba8 |
@@ -1,8 +1,8 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, cast
|
from typing import TYPE_CHECKING, Any, cast
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||||
|
|
||||||
from crewai.events.event_bus import crewai_event_bus
|
from crewai.events.event_bus import crewai_event_bus
|
||||||
from crewai.events.types.task_events import TaskEvaluationEvent
|
from crewai.events.types.task_events import TaskEvaluationEvent
|
||||||
@@ -25,16 +25,90 @@ class Entity(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
def _suggestion_to_text(item: Any) -> str:
    """Render one raw suggestion entry as plain text.

    Strings pass through unchanged, a dict carrying a "point" key yields
    ``str(item["point"])``, and anything else falls back to ``str(item)``.
    """
    if isinstance(item, str):
        return item
    if isinstance(item, dict) and "point" in item:
        return str(item["point"])
    return str(item)


class TaskEvaluation(BaseModel):
    # Lenient container for an evaluation payload: unknown keys are dropped,
    # every field has a default, and the "before" validators below reshape
    # the raw values so they fit the declared field types.
    # NOTE: documented with comments rather than a class docstring on purpose;
    # pydantic copies a model docstring into the generated JSON schema.
    model_config = ConfigDict(extra="ignore")

    suggestions: list[str] = Field(
        default_factory=list,
        description="Suggestions to improve future similar tasks.",
    )
    quality: float | None = Field(
        default=None,
        description="A score from 0 to 10 evaluating on completion, quality, and overall performance, all taking into account the task description, expected output, and the result of the task.",
    )
    entities: list[Entity] = Field(
        default_factory=list,
        description="Entities extracted from the task output.",
    )

    @model_validator(mode="before")
    @classmethod
    def map_score_to_quality(cls, data: Any) -> Any:
        """Use 'score' as 'quality' when the payload has no 'quality' key.

        Args:
            data: Raw input handed to the model; only dicts are inspected.

        Returns:
            A shallow copy of ``data`` with ``quality`` filled in from
            ``score`` when the mapping applies, otherwise ``data`` itself.
        """
        if isinstance(data, dict) and "quality" not in data and "score" in data:
            # Build a new dict instead of assigning into `data`: the payload
            # belongs to the caller and must not be modified by validation.
            return {**data, "quality": data["score"]}
        return data

    @field_validator("suggestions", mode="before")
    @classmethod
    def normalize_suggestions(cls, v: Any) -> list[str]:
        """Normalize suggestions from various formats to list[str].

        Handles:
        - None -> []
        - str -> [str]
        - dict with "point" -> [point]
        - dict without "point" -> [str(dict)]
        - list -> each item converted with the same per-item rules
        - any other value -> [str(value)]
        """
        if v is None:
            return []
        if isinstance(v, list):
            return [_suggestion_to_text(item) for item in v]
        return [_suggestion_to_text(v)]

    @field_validator("quality", mode="before")
    @classmethod
    def coerce_quality(cls, v: Any) -> float | None:
        """Coerce quality to float, accepting int, float and numeric strings.

        Returns:
            The value as a float, or None when it is None, an empty string,
            of an unsupported type, not parseable as a number, too large to
            be represented as a float, or not finite (NaN / infinity).
        """
        from math import isfinite

        # Type check first so `== ""` is only ever evaluated on supported types.
        if not isinstance(v, (int, float, str)) or v == "":
            return None

        try:
            parsed = float(v)
        except (ValueError, OverflowError):
            # ValueError: non-numeric string. OverflowError: an int too large
            # for a float; pydantic would not turn that into a validation
            # error, so it is handled here like any other unusable value.
            return None

        # NaN and infinity are never meaningful scores.
        return parsed if isfinite(parsed) else None
|
||||||
|
|
||||||
|
|
||||||
class TrainingTaskEvaluation(BaseModel):
|
class TrainingTaskEvaluation(BaseModel):
|
||||||
suggestions: list[str] = Field(
|
suggestions: list[str] = Field(
|
||||||
|
|||||||
@@ -0,0 +1,268 @@
|
|||||||
|
"""Tests for TaskEvaluation model validation and normalization.
|
||||||
|
|
||||||
|
These tests verify that TaskEvaluation correctly handles malformed LLM output
|
||||||
|
as reported in issue #3915, including:
|
||||||
|
- Missing quality field
|
||||||
|
- Suggestions as list of dicts with 'point' and 'priority' keys
|
||||||
|
- Score field instead of quality field
|
||||||
|
- Extra fields that should be ignored
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from crewai.utilities.evaluators.task_evaluator import Entity, TaskEvaluation
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_quality_and_dict_suggestions_normalize():
    """Dict-shaped suggestions reduce to their 'point' text while quality stays unset.

    Mirrors the failure described in issue #3915:
    - the payload carries no quality field
    - every suggestion is a dict with 'point' and 'priority' keys
    """
    expected_points = [
        "Proceed immediately with the task",
        "When asking for information, be specific",
        "Use markdown formatting",
    ]
    priorities = ["high", "medium", "medium"]
    raw = {
        "suggestions": [
            {"point": point, "priority": priority}
            for point, priority in zip(expected_points, priorities)
        ],
        "entities": [],
    }

    evaluation = TaskEvaluation.model_validate(raw)

    assert evaluation.quality is None
    assert evaluation.suggestions == expected_points
    assert evaluation.entities == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_dict_suggestion():
    """A lone dict suggestion becomes a one-item list holding its 'point' text."""
    evaluation = TaskEvaluation.model_validate(
        {
            "suggestions": {"point": "Improve response quality", "priority": "high"},
            "quality": 8.0,
        }
    )

    assert evaluation.suggestions == ["Improve response quality"]
    assert evaluation.quality == 8.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_string_suggestion():
    """A bare string suggestion is wrapped into a one-item list."""
    text = "This is a single suggestion"

    evaluation = TaskEvaluation.model_validate({"suggestions": text, "quality": 7.5})

    assert evaluation.suggestions == [text]
    assert evaluation.quality == 7.5
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_suggestions():
    """Strings, dicts with and without 'point', and other values each normalize to text."""
    raw_suggestions = [
        "String suggestion",
        {"point": "Dict with point", "priority": "high"},
        {"other": "Dict without point"},
        123,
    ]
    normalized = [
        "String suggestion",
        "Dict with point",
        "{'other': 'Dict without point'}",
        "123",
    ]

    evaluation = TaskEvaluation.model_validate(
        {"suggestions": raw_suggestions, "quality": 6.0}
    )

    assert evaluation.suggestions == normalized
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_from_score():
    """With no 'quality' key present, the 'score' value is used as the quality."""
    raw = {"score": 3, "suggestions": ["Improve performance"]}

    evaluation = TaskEvaluation.model_validate(raw)

    assert evaluation.quality == 3.0
    assert evaluation.suggestions == ["Improve performance"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_str_number():
    """A numeric string given as quality is converted to the matching float."""
    evaluation = TaskEvaluation.model_validate(
        {"quality": "7.5", "suggestions": ["Good work"]}
    )

    assert evaluation.quality == 7.5
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_int():
    """An integer quality is accepted and compares equal to its float value."""
    evaluation = TaskEvaluation.model_validate(
        {"quality": 8, "suggestions": ["Excellent"]}
    )

    assert evaluation.quality == 8.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_invalid_string():
    """A non-numeric quality string yields a quality of None."""
    evaluation = TaskEvaluation.model_validate(
        {"quality": "not a number", "suggestions": ["Test"]}
    )

    assert evaluation.quality is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_quality_empty_string():
    """An empty quality string yields a quality of None."""
    evaluation = TaskEvaluation.model_validate(
        {"quality": "", "suggestions": ["Test"]}
    )

    assert evaluation.quality is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_fields_ignored():
    """Keys the model does not declare are accepted and leave the known fields intact."""
    known = {"suggestions": ["Test suggestion"], "quality": 5.0}
    unknown = {
        "relationships": [],
        "extra_field": "should be ignored",
        "another_extra": 123,
    }

    evaluation = TaskEvaluation.model_validate({**known, **unknown})

    assert evaluation.suggestions == ["Test suggestion"]
    assert evaluation.quality == 5.0
    assert evaluation.entities == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_none_suggestions():
    """Suggestions given as None come back as an empty list."""
    evaluation = TaskEvaluation.model_validate({"suggestions": None, "quality": 5.0})

    assert evaluation.suggestions == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_all_optional_fields():
    """An empty payload validates and every field takes its default."""
    evaluation = TaskEvaluation.model_validate({})

    assert evaluation.suggestions == []
    assert evaluation.quality is None
    assert evaluation.entities == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_entities_with_valid_data():
    """A well-formed entity dict is parsed into an entity with matching name and type."""
    person = {
        "name": "John Doe",
        "type": "Person",
        "description": "A test person",
        "relationships": ["knows Jane"],
    }

    evaluation = TaskEvaluation.model_validate(
        {"suggestions": ["Test"], "quality": 8.0, "entities": [person]}
    )

    assert len(evaluation.entities) == 1
    parsed = evaluation.entities[0]
    assert parsed.name == "John Doe"
    assert parsed.type == "Person"
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_and_quality_both_present():
    """When both keys are supplied, 'quality' wins over 'score'."""
    raw = {"score": 3, "quality": 9.0, "suggestions": ["Test"]}

    evaluation = TaskEvaluation.model_validate(raw)

    assert evaluation.quality == 9.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue_3915_exact_payload():
    """Validate the payload shape reported in issue #3915.

    In that report:
    - there is no quality key (the number arrives under 'score')
    - suggestions are dicts with 'point' and 'priority'
    - a 'relationships' key is present and must be ignored
    """
    expected_points = [
        "Complete the task immediately",
        "Provide detailed explanations",
        "Use proper formatting",
    ]
    priorities = ["high", "medium", "medium"]
    raw = {
        "score": 3,
        "suggestions": [
            {"point": point, "priority": priority}
            for point, priority in zip(expected_points, priorities)
        ],
        "relationships": ["suggested"],
    }

    evaluation = TaskEvaluation.model_validate(raw)

    assert evaluation.quality == 3.0
    assert evaluation.suggestions == expected_points
    assert evaluation.entities == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_suggestions_list():
    """An explicitly empty suggestions list stays empty and quality is kept."""
    evaluation = TaskEvaluation.model_validate({"suggestions": [], "quality": 5.0})

    assert evaluation.suggestions == []
    assert evaluation.quality == 5.0
|
||||||
|
|
||||||
Reference in New Issue
Block a user