Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-09 16:18:30 +00:00
style: resolve linter issues
@@ -1,4 +1,3 @@
-# First, import the core base classes without AgentEvaluator
 from crewai.evaluation.base_evaluator import (
     BaseEvaluator,
     EvaluationScore,
@@ -6,7 +5,6 @@ from crewai.evaluation.base_evaluator import (
     AgentEvaluationResult
 )
 
-# Now import the evaluators which depend on base classes
 from crewai.evaluation.metrics.semantic_quality_metrics import (
     SemanticQualityEvaluator
 )
@@ -26,7 +24,6 @@ from crewai.evaluation.metrics.tools_metrics import (
     ToolInvocationEvaluator
 )
 
-# Next import integration which uses the base classes but not AgentEvaluator
 from crewai.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
     create_evaluation_callbacks
@@ -36,4 +33,21 @@ from crewai.evaluation.evaluation_listener import (
 from crewai.evaluation.agent_evaluator import (
     AgentEvaluator,
     create_default_evaluator
 )
+
+__all__ = [
+    "BaseEvaluator",
+    "EvaluationScore",
+    "MetricCategory",
+    "AgentEvaluationResult",
+    "SemanticQualityEvaluator",
+    "GoalAlignmentEvaluator",
+    "ReasoningEfficiencyEvaluator",
+    "ToolSelectionEvaluator",
+    "ParameterExtractionEvaluator",
+    "ToolInvocationEvaluator",
+    "EvaluationTraceCallback",
+    "create_evaluation_callbacks",
+    "AgentEvaluator",
+    "create_default_evaluator"
+]
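The __all__ block added above makes the package's public surface explicit, which is the usual way to tell a linter that the imports in an __init__.py are intentional re-exports rather than unused names. A minimal sketch of the effect, assuming the crewai.evaluation package at this commit is importable:

    # Only the names listed in __all__ are pulled in by a star-import, and
    # static analysis treats the listed imports as deliberate re-exports.
    from crewai.evaluation import *  # noqa: F403

    print(AgentEvaluator)            # listed in __all__, so the star-import exposes it
    print(create_default_evaluator)  # likewise re-exported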
@@ -1,15 +1,12 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AgentAggregatedEvaluationResult, AggregationStrategy
-from crewai.utilities.events.base_event_listener import BaseEventListener
+from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.utilities.llm_utils import create_llm
 from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
 
-from typing import List, Optional, Dict, Any, Tuple
+from typing import List, Optional, Dict, Any
 from collections import defaultdict
-from crewai.evaluation import EvaluationScore, BaseEvaluator, create_evaluation_callbacks
+from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
 from crewai.crew import Crew
-from rich.table import Table
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 
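The imports dropped above (AgentAggregatedEvaluationResult, BaseEventListener, create_llm, Tuple, EvaluationScore, Table) are names the module never references, so removing them cannot change behaviour. For illustration only, a rough approximation of the unused-import check a linter performs; this is not the project's actual tooling, just a sketch built on the standard ast module:

    import ast

    def unused_imports(source: str) -> list[str]:
        """Report names that are imported but never referenced again."""
        tree = ast.parse(source)
        imported: set[str] = set()
        for node in ast.walk(tree):
            if isinstance(node, (ast.Import, ast.ImportFrom)):
                for alias in node.names:
                    imported.add(alias.asname or alias.name.split(".")[0])
        used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
        return sorted(imported - used)

    print(unused_imports("from typing import Tuple, Dict\nx: Dict = {}\n"))
    # Expected output: ['Tuple']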
@@ -340,5 +340,5 @@ class EvaluationDisplayFormatter:
 
             return response
 
-        except Exception as e:
+        except Exception:
             return "Synthesized from multiple tasks: " + "\n\n".join([f"- {fb[:500]}..." for fb in feedbacks])
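The only change in this hunk is dropping the unused exception binding ("as e"); the fallback string is assembled exactly as before. A small self-contained sketch of the pattern, using a hypothetical stand-in for the real LLM call:

    def synthesize(feedbacks: list[str]) -> str:
        def call_llm(items: list[str]) -> str:  # hypothetical stand-in, not the project's code
            raise RuntimeError("LLM unavailable")

        try:
            return call_llm(feedbacks)
        except Exception:  # was "except Exception as e"; e was never used
            return "Synthesized from multiple tasks: " + "\n\n".join(
                f"- {fb[:500]}..." for fb in feedbacks
            )

    print(synthesize(["feedback one", "feedback two"]))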
@@ -56,7 +56,7 @@ Evaluate how well the agent's output aligns with the assigned task goal.
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
-        except Exception as e:
+        except Exception:
             return EvaluationScore(
                 score=None,
                 feedback=f"Failed to parse evaluation. Raw response: {response}",
@@ -14,7 +14,6 @@ import numpy as np
 
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.llm import BaseLLM, LLM
 
 from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
 from crewai.evaluation.json_parser import extract_json_from_llm_response
@@ -60,7 +59,7 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
                 try:
                     interval = end_time - start_time
                     time_intervals.append(interval.total_seconds() if hasattr(interval, 'total_seconds') else 0)
-                except:
+                except Exception:
                     has_reliable_timing = False
             else:
                 has_reliable_timing = False
@@ -241,7 +240,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
             if start_time and end_time:
                 try:
                     response_times.append(end_time - start_time)
-                except:
+                except Exception:
                     pass
 
         avg_length = np.mean(call_lengths) if call_lengths else 0
@@ -293,7 +292,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
                 normalized_slope = slope / max_possible_slope
                 return max(min(normalized_slope, 1.0), -1.0)
             return 0.0
-        except:
+        except Exception:
             return 0.0
 
     def _calculate_loop_likelihood(self, call_lengths: List[float], response_times: List[float]) -> float:
@@ -319,7 +318,7 @@ Identify any inefficient reasoning patterns and provide specific suggestions for
                 if mean_time > 0:
                     time_consistency = 1.0 - (std_time / mean_time)
                     indicators.append(max(0, time_consistency - 0.3) * 1.5)
-            except:
+            except Exception:
                 pass
 
         return np.mean(indicators) if indicators else 0.0
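In the hunks above, bare "except:" clauses become "except Exception:". The practical difference is that a bare clause also traps SystemExit and KeyboardInterrupt, so a Ctrl-C inside the timing loop could be silently swallowed; narrowing to Exception keeps the "ignore bad timing data" behaviour without that side effect. A minimal sketch of the narrowed handler, reusing names from the hunk but without the surrounding class:

    from datetime import datetime

    def interval_seconds(end_time, start_time) -> float:
        try:
            interval = end_time - start_time
            return interval.total_seconds() if hasattr(interval, "total_seconds") else 0.0
        except Exception:  # narrower than bare "except:"; KeyboardInterrupt still propagates
            return 0.0

    print(interval_seconds(datetime(2025, 1, 1, 0, 0, 5), datetime(2025, 1, 1)))  # 5.0
    print(interval_seconds("end", "start"))  # 0.0, the TypeError is caught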
@@ -55,7 +55,7 @@ Evaluate the semantic quality and reasoning of this output.
                 feedback=evaluation_data.get("feedback", response),
                 raw_response=response
             )
-        except Exception as e:
+        except Exception:
             return EvaluationScore(
                 score=None,
                 feedback=f"Failed to parse evaluation. Raw response: {response}",
@@ -97,7 +97,7 @@ IMPORTANT:
             coverage = scores.get("coverage", 5.0)
             overall_score = float(evaluation_data.get("overall_score", 5.0))
 
-            feedback = f"Tool Selection Evaluation:\n"
+            feedback = "Tool Selection Evaluation:\n"
             feedback += f"• Relevance: {relevance}/10 - Selection of appropriate tool types for the task\n"
             feedback += f"• Coverage: {coverage}/10 - Selection of all necessary tool types\n"
             if "improvement_suggestions" in evaluation_data:
@@ -164,7 +164,7 @@ class ParameterExtractionEvaluator(BaseEvaluator):
             sample += f"- Success: {'No' if not success else 'Yes'}"
 
             if is_validation_error:
-                sample += f" (PARAMETER VALIDATION ERROR)\n"
+                sample += " (PARAMETER VALIDATION ERROR)\n"
                 sample += f"- Error: {tool_use.get('result', 'Unknown error')}"
             elif not success:
                 sample += f" (Other error: {error_type})\n"
@@ -231,7 +231,7 @@ Evaluate the quality of the agent's parameter extraction for this task.
 
             overall_score = float(evaluation_data.get("overall_score", 5.0))
 
-            feedback = f"Parameter Extraction Evaluation:\n"
+            feedback = "Parameter Extraction Evaluation:\n"
             feedback += f"• Accuracy: {accuracy}/10 - Correctly identifying required parameters\n"
             feedback += f"• Formatting: {formatting}/10 - Properly formatting parameters for tools\n"
             feedback += f"• Completeness: {completeness}/10 - Including all necessary information\n\n"
@@ -370,7 +370,7 @@ Evaluate the quality of the agent's tool invocation structure during this task.
 
             overall_score = float(evaluation_data.get("overall_score", 5.0))
 
-            feedback = f"Tool Invocation Evaluation:\n"
+            feedback = "Tool Invocation Evaluation:\n"
             feedback += f"• Structure: {structure}/10 - Following proper syntax and format\n"
             feedback += f"• Error Handling: {error_handling}/10 - Appropriately handling tool errors\n"
             feedback += f"• Invocation Patterns: {invocation_patterns}/10 - Proper sequencing and management of calls\n\n"
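The tools-metrics hunks drop the f-prefix only from literals that contain no placeholders; lines that actually interpolate values keep it. A tiny illustration with made-up scores:

    relevance, coverage = 8, 7
    feedback = "Tool Selection Evaluation:\n"        # no placeholders, so a plain string
    feedback += f"• Relevance: {relevance}/10\n"     # interpolates, so the f-prefix stays
    feedback += f"• Coverage: {coverage}/10\n"
    print(feedback)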
@@ -2,7 +2,6 @@ from enum import Enum
 from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel
-from datetime import datetime
 
 from crewai.utilities.events.base_events import BaseEvent
 
@@ -1,4 +1,3 @@
-import pytest
 from unittest.mock import patch, MagicMock
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 
@@ -5,7 +5,6 @@ from typing import List, Dict, Any
 from crewai.tasks.task_output import TaskOutput
 from crewai.evaluation.metrics.reasoning_metrics import (
     ReasoningEfficiencyEvaluator,
-    ReasoningPatternType
 )
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
@@ -1,8 +1,5 @@
-import pytest
-from unittest.mock import patch, MagicMock, ANY
+from unittest.mock import patch, MagicMock
 
-from crewai.agent import Agent
-from crewai.task import Task
 from crewai.evaluation.base_evaluator import EvaluationScore
 from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
@@ -1,9 +1,5 @@
-import pytest
-from unittest.mock import patch, MagicMock, ANY
+from unittest.mock import patch, MagicMock
 
-from crewai.agent import Agent
-from crewai.task import Task
-from crewai.evaluation.base_evaluator import EvaluationScore
 from crewai.evaluation.metrics.tools_metrics import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,