style: resolve linter issues

Author: Lucas Gomide
Date: 2025-07-09 19:56:46 -03:00
Parent: 5ea221e54e
Commit: 43f339fa84
12 changed files with 34 additions and 34 deletions

View File

@@ -1,4 +1,3 @@
-# First, import the core base classes without AgentEvaluator
 from crewai.evaluation.base_evaluator import (
     BaseEvaluator,
     EvaluationScore,
@@ -6,7 +5,6 @@ from crewai.evaluation.base_evaluator import (
     AgentEvaluationResult
 )
-# Now import the evaluators which depend on base classes
 from crewai.evaluation.metrics.semantic_quality_metrics import (
     SemanticQualityEvaluator
 )
@@ -26,7 +24,6 @@ from crewai.evaluation.metrics.tools_metrics import (
     ToolInvocationEvaluator
 )
-# Next import integration which uses the base classes but not AgentEvaluator
 from crewai.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
     create_evaluation_callbacks
@@ -36,4 +33,21 @@ from crewai.evaluation.evaluation_listener import (
 from crewai.evaluation.agent_evaluator import (
     AgentEvaluator,
     create_default_evaluator
 )
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationScore",
+    "MetricCategory",
+    "AgentEvaluationResult",
+    "SemanticQualityEvaluator",
+    "GoalAlignmentEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "EvaluationTraceCallback",
+    "create_evaluation_callbacks",
+    "AgentEvaluator",
+    "create_default_evaluator"
+]

View File

@@ -1,15 +1,12 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AgentAggregatedEvaluationResult, AggregationStrategy
-from crewai.utilities.events.base_event_listener import BaseEventListener
+from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.utilities.llm_utils import create_llm
 from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List, Optional, Dict, Any
 from collections import defaultdict
-from crewai.evaluation import EvaluationScore, BaseEvaluator, create_evaluation_callbacks
+from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
 from crewai.crew import Crew
-from rich.table import Table
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter

View File

@@ -340,5 +340,5 @@ class EvaluationDisplayFormatter:
             return response
-        except Exception as e:
+        except Exception:
             return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks])

View File

@@ -56,7 +56,7 @@ Evaluate how well the agent's output aligns with the assigned task goal.
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
-        except Exception as e:
+        except Exception:
             return EvaluationScore(
                 score=None,
                 feedback=f"Failed to parse evaluation. Raw response: {response}",

View File

@@ -14,7 +14,6 @@ import numpy as np
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.llm import BaseLLM, LLM
 from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
 from crewai.evaluation.json_parser import extract_json_from_llm_response
@@ -60,7 +59,7 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
                try:
                    interval = end_time - start_time
                    time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0)
-                except:
+                except Exception:
                    has_reliable_timing = False
            else:
                has_reliable_timing = False
@@ -241,7 +240,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
            if start_time and end_time:
                try:
                    response_times.append(end_time - start_time)
-                except:
+                except Exception:
                    pass
        avg_length = np.mean(call_lengths) if call_lengths else 0
@@ -293,7 +292,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
                normalized_slope = slope / max_possible_slope
                return max(min(normalized_slope, 1.0), -1.0)
            return 0.0
-        except:
+        except Exception:
            return 0.0
    def _calculate_loop_likelihood(self, call_lengths: List[float], response_times: List[float]) -> float:
@@ -319,7 +318,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
                if mean_time > 0:
                    time_consistency = 1.0 - (std_time / mean_time)
                    indicators.append(max(0, time_consistency - 0.3) * 1.5)
-            except:
+            except Exception:
                pass
        return np.mean(indicators) if indicators else 0.0

View File

@@ -55,7 +55,7 @@ Evaluate the semantic quality and reasoning of this output.
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
-        except Exception as e:
+        except Exception:
             return EvaluationScore(
                 score=None,
                 feedback=f"Failed to parse evaluation. Raw response: {response}",

View File

@@ -97,7 +97,7 @@ IMPORTANT:
            coverage = scores.get("coverage", 5.0)
            overall_score = float(evaluation_data.get("overall_score", 5.0))
-            feedback = f"Tool Selection Evaluation:\n"
+            feedback = "Tool Selection Evaluation:\n"
            feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n"
            feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
            if "improvement_suggestions" in evaluation_data:
@@ -164,7 +164,7 @@ class ParameterExtractionEvaluator(BaseEvaluator):
            sample += f"- Success: {'No' if not success else 'Yes'}"
            if is_validation_error:
-                sample += f" (PARAMETER VALIDATION ERROR)\n"
+                sample += " (PARAMETER VALIDATION ERROR)\n"
                sample += f"- Error: {tool_use.get('result', 'Unknown error')}"
            elif not success:
                sample += f" (Other error: {error_type})\n"
@@ -231,7 +231,7 @@ Evaluate the quality of the agent's parameter extraction for this task.
            overall_score = float(evaluation_data.get("overall_score", 5.0))
-            feedback = f"Parameter Extraction Evaluation:\n"
+            feedback = "Parameter Extraction Evaluation:\n"
            feedback += f"• Accuracy: {accuracy}/10 - Correctly identifying required parameters\n"
            feedback += f"• Formatting: {formatting}/10 - Properly formatting parameters for tools\n"
            feedback += f"• Completeness: {completeness}/10 - Including all necessary information\n\n"
@@ -370,7 +370,7 @@ Evaluate the quality of the agent's tool invocation structure during this task.
            overall_score = float(evaluation_data.get("overall_score", 5.0))
-            feedback = f"Tool Invocation Evaluation:\n"
+            feedback = "Tool Invocation Evaluation:\n"
            feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n"
            feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n"
            feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n"

View File

@@ -2,7 +2,6 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Union
 from pydantic import BaseModel
-from datetime import datetime
 from crewai.utilities.events.base_events import BaseEvent

View File

@@ -1,4 +1,3 @@
-import pytest
 from unittest.mock import patch, MagicMock
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

View File

@@ -5,7 +5,6 @@ from typing import List, Dict, Any
 from crewai.tasks.task_output import TaskOutput
 from crewai.evaluation.metrics.reasoning_metrics import (
     ReasoningEfficiencyEvaluator,
-    ReasoningPatternType
 )
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM

View File

@@ -1,8 +1,5 @@
-import pytest
-from unittest.mock import patch, MagicMock, ANY
-from crewai.agent import Agent
-from crewai.task import Task
+from unittest.mock import patch, MagicMock
 from crewai.evaluation.base_evaluator import EvaluationScore
 from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

View File

@@ -1,9 +1,5 @@
-import pytest
-from unittest.mock import patch, MagicMock, ANY
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.evaluation.base_evaluator import EvaluationScore
+from unittest.mock import patch, MagicMock
 from crewai.evaluation.metrics.tools_metrics import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,