Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-02-05 05:28:25 +00:00

Compare commits: lg-agent-e ... lg-experim (13 commits)

Commits: a810c31ced, e3b044c044, ee490a19fb, de425450d4, 3fee619798, bb2a0f95ba, 22d46999fe, 9f00760437, ffab51ce2c, 007f396f4d, cf91084f21, ccea8870d7, 15f1912c2e
@@ -1337,7 +1337,7 @@ class Crew(FlowTrackable, BaseModel):
         evaluator = CrewEvaluator(test_crew, llm_instance)

         if include_agent_eval:
-            from crewai.evaluation import create_default_evaluator
+            from crewai.experimental.evaluation import create_default_evaluator
             agent_evaluator = create_default_evaluator(crew=test_crew)

         for i in range(1, n_iterations + 1):
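For context, this hunk is in the crew-testing loop (`CrewEvaluator`, `n_iterations`): when agent evaluation is enabled, the default per-agent evaluator is now created from the experimental namespace. A rough usage sketch; only the `include_agent_eval` flag is visible in the hunk, so the exact call signature here is an assumption:

```python
from crewai import Agent, Crew, Task

agent = Agent(role="Researcher", goal="Answer questions accurately", backstory="A careful analyst.")
task = Task(description="What is the capital of France?", expected_output="A one-sentence answer", agent=agent)
crew = Crew(agents=[agent], tasks=[task])

# Assumed call shape: n_iterations drives the loop shown above, and
# include_agent_eval toggles creation of the experimental agent evaluator.
crew.test(n_iterations=2, include_agent_eval=True)
```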
40 src/crewai/experimental/__init__.py Normal file
@@ -0,0 +1,40 @@
from crewai.experimental.evaluation import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult,
    SemanticQualityEvaluator,
    GoalAlignmentEvaluator,
    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator,
    EvaluationTraceCallback,
    create_evaluation_callbacks,
    AgentEvaluator,
    create_default_evaluator,
    ExperimentRunner,
    ExperimentResults,
    ExperimentResult,
)


__all__ = [
    "BaseEvaluator",
    "EvaluationScore",
    "MetricCategory",
    "AgentEvaluationResult",
    "SemanticQualityEvaluator",
    "GoalAlignmentEvaluator",
    "ReasoningEfficiencyEvaluator",
    "ToolSelectionEvaluator",
    "ParameterExtractionEvaluator",
    "ToolInvocationEvaluator",
    "EvaluationTraceCallback",
    "create_evaluation_callbacks",
    "AgentEvaluator",
    "create_default_evaluator",
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]
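The new top-level package simply re-exports the evaluation API, so the whole surface can be pulled in from `crewai.experimental`. A minimal sketch using names from the `__all__` list above:

```python
from crewai.experimental import (
    AgentEvaluator,
    create_default_evaluator,
    ExperimentRunner,
    ExperimentResults,
)
```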
@@ -1,40 +1,35 @@
-from crewai.evaluation.base_evaluator import (
+from crewai.experimental.evaluation.base_evaluator import (
     BaseEvaluator,
     EvaluationScore,
     MetricCategory,
     AgentEvaluationResult
 )

-from crewai.evaluation.metrics.semantic_quality_metrics import (
-    SemanticQualityEvaluator
-)
-
-from crewai.evaluation.metrics.goal_metrics import (
-    GoalAlignmentEvaluator
-)
-
-from crewai.evaluation.metrics.reasoning_metrics import (
-    ReasoningEfficiencyEvaluator
-)
-
-
-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics import (
+    SemanticQualityEvaluator,
+    GoalAlignmentEvaluator,
+    ReasoningEfficiencyEvaluator,
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator
 )

-from crewai.evaluation.evaluation_listener import (
+from crewai.experimental.evaluation.evaluation_listener import (
     EvaluationTraceCallback,
     create_evaluation_callbacks
 )


-from crewai.evaluation.agent_evaluator import (
+from crewai.experimental.evaluation.agent_evaluator import (
     AgentEvaluator,
     create_default_evaluator
 )

+from crewai.experimental.evaluation.experiment import (
+    ExperimentRunner,
+    ExperimentResults,
+    ExperimentResult
+)
+
 __all__ = [
     "BaseEvaluator",
     "EvaluationScore",
@@ -49,5 +44,8 @@ __all__ = [
     "EvaluationTraceCallback",
     "create_evaluation_callbacks",
     "AgentEvaluator",
-    "create_default_evaluator"
-]
+    "create_default_evaluator",
+    "ExperimentRunner",
+    "ExperimentResults",
+    "ExperimentResult"
+]
@@ -1,15 +1,16 @@
-from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
-from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter
+from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter

 from typing import Any, Dict
 from collections import defaultdict
-from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
+from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
 from crewai.crew import Crew
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
+from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult

 class AgentEvaluator:
     def __init__(
@@ -36,6 +37,9 @@ class AgentEvaluator:
     def set_iteration(self, iteration: int) -> None:
         self.iteration = iteration

+    def reset_iterations_results(self):
+        self.iterations_results = {}
+
     def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
         if not self.crew:
             raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")
@@ -99,7 +103,7 @@ class AgentEvaluator:
     def display_results_with_iterations(self):
         self.display_formatter.display_summary_results(self.iterations_results)

-    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
+    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False) -> Dict[str, AgentAggregatedEvaluationResult]:
         agent_results = {}
         with crewai_event_bus.scoped_handlers():
             task_results = self.get_evaluation_results()
@@ -157,7 +161,7 @@ class AgentEvaluator:
         return result

 def create_default_evaluator(crew, llm=None):
-    from crewai.evaluation import (
+    from crewai.experimental.evaluation import (
         GoalAlignmentEvaluator,
         SemanticQualityEvaluator,
         ToolSelectionEvaluator,
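Taken together, these hunks give `AgentEvaluator` an explicit reset hook and a typed aggregation result. A sketch of an evaluation loop built on the methods shown here; the crew itself and its inputs are assumed:

```python
from crewai.experimental.evaluation import create_default_evaluator
from crewai.experimental.evaluation.base_evaluator import AggregationStrategy

evaluator = create_default_evaluator(crew=my_crew)  # my_crew: an already-configured Crew (assumed)

for iteration in range(1, 4):
    evaluator.set_iteration(iteration)
    my_crew.kickoff(inputs={"query": "..."})
    evaluator.evaluate_current_iteration()

# Returns Dict[str, AgentAggregatedEvaluationResult], per the new annotation above.
aggregated = evaluator.get_agent_evaluation(strategy=AggregationStrategy.SIMPLE_AVERAGE)

# Clear accumulated per-iteration state before reusing the evaluator (added in this diff).
evaluator.reset_iterations_results()
```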
@@ -3,8 +3,8 @@ from typing import Dict, Any, List
 from rich.table import Table
 from rich.box import HEAVY_EDGE, ROUNDED
 from collections.abc import Sequence
-from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
-from crewai.evaluation import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
+from crewai.experimental.evaluation import EvaluationScore
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.llm_utils import create_llm

@@ -0,0 +1,8 @@
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult

__all__ = [
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]
122 src/crewai/experimental/evaluation/experiment/result.py Normal file
@@ -0,0 +1,122 @@
import json
import os
from datetime import datetime, timezone
from typing import Any
from pydantic import BaseModel

class ExperimentResult(BaseModel):
    identifier: str
    inputs: dict[str, Any]
    score: int | dict[str, int | float]
    expected_score: int | dict[str, int | float]
    passed: bool
    agent_evaluations: dict[str, Any] | None = None

class ExperimentResults:
    def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
        self.results = results
        self.metadata = metadata or {}
        self.timestamp = datetime.now(timezone.utc)

        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
        self.display = ExperimentResultsDisplay()

    def to_json(self, filepath: str | None = None) -> dict[str, Any]:
        data = {
            "timestamp": self.timestamp.isoformat(),
            "metadata": self.metadata,
            "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
        }

        if filepath:
            with open(filepath, 'w') as f:
                json.dump(data, f, indent=2)
            self.display.console.print(f"[green]Results saved to {filepath}[/green]")

        return data

    def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
        baseline_runs = []

        if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
            try:
                with open(baseline_filepath, 'r') as f:
                    baseline_data = json.load(f)

                if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
                    baseline_runs = [baseline_data]
                elif isinstance(baseline_data, list):
                    baseline_runs = baseline_data
            except (json.JSONDecodeError, FileNotFoundError) as e:
                self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")

        if not baseline_runs:
            if save_current:
                current_data = self.to_json()
                with open(baseline_filepath, 'w') as f:
                    json.dump([current_data], f, indent=2)
                self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
            return {"is_baseline": True, "changes": {}}

        baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        latest_run = baseline_runs[0]

        comparison = self._compare_with_run(latest_run)

        if print_summary:
            self.display.comparison_summary(comparison, latest_run["timestamp"])

        if save_current:
            current_data = self.to_json()
            baseline_runs.append(current_data)
            with open(baseline_filepath, 'w') as f:
                json.dump(baseline_runs, f, indent=2)
            self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")

        return comparison

    def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]:
        baseline_results = baseline_run.get("results", [])

        baseline_lookup = {}
        for result in baseline_results:
            test_identifier = result.get("identifier")
            if test_identifier:
                baseline_lookup[test_identifier] = result

        improved = []
        regressed = []
        unchanged = []
        new_tests = []

        for result in self.results:
            test_identifier = result.identifier
            if not test_identifier or test_identifier not in baseline_lookup:
                new_tests.append(test_identifier)
                continue

            baseline_result = baseline_lookup[test_identifier]
            baseline_passed = baseline_result.get("passed", False)
            if result.passed and not baseline_passed:
                improved.append(test_identifier)
            elif not result.passed and baseline_passed:
                regressed.append(test_identifier)
            else:
                unchanged.append(test_identifier)

        missing_tests = []
        current_test_identifiers = {result.identifier for result in self.results}
        for result in baseline_results:
            test_identifier = result.get("identifier")
            if test_identifier and test_identifier not in current_test_identifiers:
                missing_tests.append(test_identifier)

        return {
            "improved": improved,
            "regressed": regressed,
            "unchanged": unchanged,
            "new_tests": new_tests,
            "missing_tests": missing_tests,
            "total_compared": len(improved) + len(regressed) + len(unchanged),
            "baseline_timestamp": baseline_run.get("timestamp", "unknown")
        }
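The baseline workflow implemented above can be driven directly. A short sketch; the file paths and scores are illustrative:

```python
from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults

results = ExperimentResults(results=[
    ExperimentResult(identifier="test-1", inputs={"query": "..."}, score=9, expected_score=7, passed=True),
])

# Serialize the run (agent_evaluations are excluded from the dump).
results.to_json(filepath="experiment_run.json")

# Compare against an existing baseline file; on first use the current run becomes
# the baseline and {"is_baseline": True, "changes": {}} is returned instead.
comparison = results.compare_with_baseline("baseline.json", save_current=True, print_summary=True)
if not comparison.get("is_baseline"):
    print(comparison["improved"], comparison["regressed"], comparison["missing_tests"])
```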
@@ -0,0 +1,70 @@
from typing import Dict, Any
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from crewai.experimental.evaluation.experiment.result import ExperimentResults

class ExperimentResultsDisplay:
    def __init__(self):
        self.console = Console()

    def summary(self, experiment_results: ExperimentResults):
        total = len(experiment_results.results)
        passed = sum(1 for r in experiment_results.results if r.passed)

        table = Table(title="Experiment Summary")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Test Cases", str(total))
        table.add_row("Passed", str(passed))
        table.add_row("Failed", str(total - passed))
        table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")

        self.console.print(table)

    def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
        self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
                                 expand=False))

        table = Table(title="Results Comparison")
        table.add_column("Metric", style="cyan")
        table.add_column("Count", style="white")
        table.add_column("Details", style="dim")

        improved = comparison.get("improved", [])
        if improved:
            details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
            if len(improved) > 3:
                details += f" and {len(improved) - 3} more"
            table.add_row("✅ Improved", str(len(improved)), details)
        else:
            table.add_row("✅ Improved", "0", "")

        regressed = comparison.get("regressed", [])
        if regressed:
            details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
            if len(regressed) > 3:
                details += f" and {len(regressed) - 3} more"
            table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
        else:
            table.add_row("❌ Regressed", "0", "")

        unchanged = comparison.get("unchanged", [])
        table.add_row("⏺ Unchanged", str(len(unchanged)), "")

        new_tests = comparison.get("new_tests", [])
        if new_tests:
            details = ", ".join(new_tests[:3])
            if len(new_tests) > 3:
                details += f" and {len(new_tests) - 3} more"
            table.add_row("➕ New Tests", str(len(new_tests)), details)

        missing_tests = comparison.get("missing_tests", [])
        if missing_tests:
            details = ", ".join(missing_tests[:3])
            if len(missing_tests) > 3:
                details += f" and {len(missing_tests) - 3} more"
            table.add_row("➖ Missing Tests", str(len(missing_tests)), details)

        self.console.print(table)
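Both tables can also be rendered outside the runner. A brief sketch with illustrative data only:

```python
from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay

experiment_results = ExperimentResults(results=[
    ExperimentResult(identifier="test-1", inputs={"query": "..."}, score=9, expected_score=7, passed=True),
    ExperimentResult(identifier="test-2", inputs={"query": "..."}, score=5, expected_score=7, passed=False),
])

display = ExperimentResultsDisplay()
display.summary(experiment_results)  # totals, pass/fail counts, success rate

# Keys match what ExperimentResults._compare_with_run returns.
display.comparison_summary(
    {"improved": ["test-1"], "regressed": [], "unchanged": ["test-2"], "new_tests": [], "missing_tests": []},
    baseline_timestamp="2023-01-01T00:00:00+00:00",
)
```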
117 src/crewai/experimental/evaluation/experiment/runner.py Normal file
@@ -0,0 +1,117 @@
from collections import defaultdict
from hashlib import md5
from typing import Any

from crewai import Crew
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult

class ExperimentRunner:
    def __init__(self, dataset: list[dict[str, Any]]):
        self.dataset = dataset or []
        self.evaluator: AgentEvaluator | None = None
        self.display = ExperimentResultsDisplay()

    def run(self, crew: Crew, print_summary: bool = False) -> ExperimentResults:
        self.evaluator = create_default_evaluator(crew=crew)

        results = []

        for test_case in self.dataset:
            self.evaluator.reset_iterations_results()
            result = self._run_test_case(test_case, crew)
            results.append(result)

        experiment_results = ExperimentResults(results)

        if print_summary:
            self.display.summary(experiment_results)

        return experiment_results

    def _run_test_case(self, test_case: dict[str, Any], crew: Crew) -> ExperimentResult:
        inputs = test_case["inputs"]
        expected_score = test_case["expected_score"]
        identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()

        try:
            self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
            self.display.console.print("\n")
            crew.kickoff(inputs=inputs)

            assert self.evaluator is not None
            agent_evaluations = self.evaluator.get_agent_evaluation()

            actual_score = self._extract_scores(agent_evaluations)

            passed = self._assert_scores(expected_score, actual_score)
            return ExperimentResult(
                identifier=identifier,
                inputs=inputs,
                score=actual_score,
                expected_score=expected_score,
                passed=passed,
                agent_evaluations=agent_evaluations
            )

        except Exception as e:
            self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
            return ExperimentResult(
                identifier=identifier,
                inputs=inputs,
                score=0,
                expected_score=expected_score,
                passed=False
            )

    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]:
        all_scores: dict[str, list[float]] = defaultdict(list)
        for evaluation in agent_evaluations.values():
            for metric_name, score in evaluation.metrics.items():
                if score.score is not None:
                    all_scores[metric_name.value].append(score.score)

        avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}

        if len(avg_scores) == 1:
            return list(avg_scores.values())[0]

        return avg_scores

    def _assert_scores(self, expected: float | dict[str, float],
                       actual: float | dict[str, float]) -> bool:
        """
        Compare expected and actual scores, and return whether the test case passed.

        The rules for comparison are as follows:
        - If both expected and actual scores are single numbers, the actual score must be >= expected.
        - If expected is a single number and actual is a dict, compare against the average of actual values.
        - If expected is a dict and actual is a single number, actual must be >= all expected values.
        - If both are dicts, actual must have matching keys with values >= expected values.
        """

        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
            return actual >= expected

        if isinstance(expected, dict) and isinstance(actual, (int, float)):
            return all(actual >= exp_score for exp_score in expected.values())

        if isinstance(expected, (int, float)) and isinstance(actual, dict):
            if not actual:
                return False
            avg_score = sum(actual.values()) / len(actual)
            return avg_score >= expected

        if isinstance(expected, dict) and isinstance(actual, dict):
            if not expected:
                return True
            matching_keys = set(expected.keys()) & set(actual.keys())
            if not matching_keys:
                return False

            # All matching keys must have actual >= expected
            return all(actual[key] >= expected[key] for key in matching_keys)

        return False
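A sketch of driving the runner added above; the crew itself is assumed, and the dataset entries mirror the shape consumed by `_run_test_case`:

```python
from crewai.experimental.evaluation import ExperimentRunner

# Each test case needs "inputs" and "expected_score"; "identifier" is optional
# (an md5 of the test case dict is used when it is omitted).
dataset = [
    {"identifier": "capital-question", "inputs": {"query": "What is the capital of France?"}, "expected_score": 8},
    {"inputs": {"query": "Who wrote Hamlet?"}, "expected_score": {"goal_alignment": 7, "tool_selection": 6}},
]

runner = ExperimentRunner(dataset=dataset)
experiment_results = runner.run(crew=my_crew, print_summary=True)  # my_crew: an existing Crew (assumed)

# Pass/fail per _assert_scores above: a single expected number is compared against the
# (averaged) actual score, and an expected dict requires every matching metric key to
# meet or exceed its threshold.
for result in experiment_results.results:
    print(result.identifier, result.passed, result.score)
```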
26 src/crewai/experimental/evaluation/metrics/__init__.py Normal file
@@ -0,0 +1,26 @@
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator
)

from crewai.experimental.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)

from crewai.experimental.evaluation.metrics.goal_metrics import (
    GoalAlignmentEvaluator
)

from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
    SemanticQualityEvaluator
)

__all__ = [
    "ReasoningEfficiencyEvaluator",
    "ToolSelectionEvaluator",
    "ParameterExtractionEvaluator",
    "ToolInvocationEvaluator",
    "GoalAlignmentEvaluator",
    "SemanticQualityEvaluator"
]
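For code migrating from the old layout, the per-module metric imports (`crewai.evaluation.metrics.goal_metrics`, `...tools_metrics`, and so on) collapse into this single package; a sketch of the consolidated import:

```python
from crewai.experimental.evaluation.metrics import (
    GoalAlignmentEvaluator,
    SemanticQualityEvaluator,
    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator,
)
```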
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

 class GoalAlignmentEvaluator(BaseEvaluator):
     @property
@@ -16,8 +16,8 @@ from collections.abc import Sequence
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.tasks.task_output import TaskOutput

 class ReasoningPatternType(Enum):
@@ -3,8 +3,8 @@ from typing import Any, Dict
 from crewai.agent import Agent
 from crewai.task import Task

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

 class SemanticQualityEvaluator(BaseEvaluator):
     @property
@@ -1,8 +1,8 @@
 import json
 from typing import Dict, Any

-from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
-from crewai.evaluation.json_parser import extract_json_from_llm_response
+from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
+from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
 from crewai.agent import Agent
 from crewai.task import Task

@@ -1,8 +1,8 @@
 from unittest.mock import patch, MagicMock
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
 from crewai.utilities.llm_utils import LLM


@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
 from typing import List, Dict, Any

 from crewai.tasks.task_output import TaskOutput
-from crewai.evaluation.metrics.reasoning_metrics import (
+from crewai.experimental.evaluation.metrics.reasoning_metrics import (
     ReasoningEfficiencyEvaluator,
 )
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM
-from crewai.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore

 class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
     @pytest.fixture
@@ -1,7 +1,7 @@
 from unittest.mock import patch, MagicMock

-from crewai.evaluation.base_evaluator import EvaluationScore
-from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
+from crewai.experimental.evaluation.base_evaluator import EvaluationScore
+from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
 from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
 from crewai.utilities.llm_utils import LLM

@@ -1,6 +1,6 @@
 from unittest.mock import patch, MagicMock

-from crewai.evaluation.metrics.tools_metrics import (
+from crewai.experimental.evaluation.metrics.tools_metrics import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator
@@ -3,9 +3,9 @@ import pytest
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.crew import Crew
-from crewai.evaluation.agent_evaluator import AgentEvaluator
-from crewai.evaluation.base_evaluator import AgentEvaluationResult
-from crewai.evaluation import (
+from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
+from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
+from crewai.experimental.evaluation import (
     GoalAlignmentEvaluator,
     SemanticQualityEvaluator,
     ToolSelectionEvaluator,
@@ -14,7 +14,7 @@ from crewai.evaluation import (
     ReasoningEfficiencyEvaluator
 )

-from crewai.evaluation import create_default_evaluator
+from crewai.experimental.evaluation import create_default_evaluator
 class TestAgentEvaluator:
     @pytest.fixture
     def mock_crew(self):
111 tests/evaluation/test_experiment_result.py Normal file
@@ -0,0 +1,111 @@
import pytest
from unittest.mock import MagicMock, patch

from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults


class TestExperimentResult:
    @pytest.fixture
    def mock_results(self):
        return [
            ExperimentResult(
                identifier="test-1",
                inputs={"query": "What is the capital of France?"},
                score=10,
                expected_score=7,
                passed=True
            ),
            ExperimentResult(
                identifier="test-2",
                inputs={"query": "Who wrote Hamlet?"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-3",
                inputs={"query": "Any query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=False,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-4",
                inputs={"query": "Another query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            ),
            ExperimentResult(
                identifier="test-6",
                inputs={"query": "Yet another query"},
                score={"relevance": 9, "factuality": 8},
                expected_score={"relevance": 7, "factuality": 7},
                passed=True,
                agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
            )
        ]

    @patch('os.path.exists', return_value=True)
    @patch('os.path.getsize', return_value=1)
    @patch('json.load')
    @patch('builtins.open', new_callable=MagicMock)
    def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
        baseline_data = {
            "timestamp": "2023-01-01T00:00:00+00:00",
            "results": [
                {
                    "identifier": "test-1",
                    "inputs": {"query": "What is the capital of France?"},
                    "score": 7,
                    "expected_score": 7,
                    "passed": False
                },
                {
                    "identifier": "test-2",
                    "inputs": {"query": "Who wrote Hamlet?"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-3",
                    "inputs": {"query": "Any query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-4",
                    "inputs": {"query": "Another query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                },
                {
                    "identifier": "test-5",
                    "inputs": {"query": "Another query"},
                    "score": {"relevance": 8, "factuality": 7},
                    "expected_score": {"relevance": 7, "factuality": 7},
                    "passed": True
                }
            ]
        }

        mock_json_load.return_value = baseline_data

        results = ExperimentResults(results=mock_results)
        results.display = MagicMock()

        comparison = results.compare_with_baseline(baseline_filepath="baseline.json")

        assert "baseline_timestamp" in comparison
        assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
        assert comparison["improved"] == ["test-1"]
        assert comparison["regressed"] == ["test-3"]
        assert comparison["unchanged"] == ["test-2", "test-4"]
        assert comparison["new_tests"] == ["test-6"]
        assert comparison["missing_tests"] == ["test-5"]
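The test above pins down the baseline file format that `compare_with_baseline` consumes. For reference, a sketch of a single stored run as `to_json()` writes it (the baseline file may also hold a list of such runs, in which case the newest timestamp is used as the baseline):

```python
baseline_run = {
    "timestamp": "2023-01-01T00:00:00+00:00",  # ISO timestamp of the stored run
    "metadata": {},                            # free-form metadata passed to ExperimentResults
    "results": [
        {
            "identifier": "test-1",
            "inputs": {"query": "What is the capital of France?"},
            "score": 7,
            "expected_score": 7,
            "passed": False,
        },
    ],
}
```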
197 tests/evaluation/test_experiment_runner.py Normal file
@@ -0,0 +1,197 @@
import pytest
from unittest.mock import MagicMock, patch

from crewai.crew import Crew
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore


class TestExperimentRunner:
    @pytest.fixture
    def mock_crew(self):
        return MagicMock(llm=Crew)

    @pytest.fixture
    def mock_evaluator_results(self):
        agent_evaluation = AgentAggregatedEvaluationResult(
            agent_id="Test Agent",
            agent_role="Test Agent Role",
            metrics={
                MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
                    score=9,
                    feedback="Test feedback for goal alignment",
                    raw_response="Test raw response for goal alignment"
                ),
                MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
                    score=None,
                    feedback="Reasoning efficiency not applicable",
                    raw_response="Reasoning efficiency not applicable"
                ),
                MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
                    score=7,
                    feedback="Test parameter extraction explanation",
                    raw_response="Test raw output"
                ),
                MetricCategory.TOOL_SELECTION: EvaluationScore(
                    score=8,
                    feedback="Test tool selection explanation",
                    raw_response="Test raw output"
                )
            }
        )

        return {"Test Agent": agent_evaluation}

    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {
                "identifier": "test-case-1",
                "inputs": {"query": "Test query 1"},
                "expected_score": 8
            },
            {
                "identifier": "test-case-2",
                "inputs": {"query": "Test query 2"},
                "expected_score": {"goal_alignment": 7}
            },
            {
                "inputs": {"query": "Test query 3"},
                "expected_score": {"tool_selection": 9}
            }
        ]

        mock_evaluator = MagicMock()
        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
        mock_evaluator.reset_iterations_results = MagicMock()
        mock_create_evaluator.return_value = mock_evaluator

        runner = ExperimentRunner(dataset=dataset)

        results = runner.run(crew=mock_crew)

        assert isinstance(results, ExperimentResults)
        result_1, result_2, result_3 = results.results
        assert len(results.results) == 3

        assert result_1.identifier == "test-case-1"
        assert result_1.inputs == {"query": "Test query 1"}
        assert result_1.expected_score == 8
        assert result_1.passed is True

        assert result_2.identifier == "test-case-2"
        assert result_2.inputs == {"query": "Test query 2"}
        assert isinstance(result_2.expected_score, dict)
        assert "goal_alignment" in result_2.expected_score
        assert result_2.passed is True

        assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
        assert result_3.inputs == {"query": "Test query 3"}
        assert isinstance(result_3.expected_score, dict)
        assert "tool_selection" in result_3.expected_score
        assert result_3.passed is False

        assert mock_crew.kickoff.call_count == 3
        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
        mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})

        assert mock_evaluator.reset_iterations_results.call_count == 3
        assert mock_evaluator.get_agent_evaluation.call_count == 3


    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {
                "identifier": "test-case-2",
                "inputs": {"query": "Test query 2"},
                "expected_score": {"goal_alignment": 7, "unknown_metric": 8}
            }
        ]

        mock_evaluator = MagicMock()
        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
        mock_evaluator.reset_iterations_results = MagicMock()
        mock_create_evaluator.return_value = mock_evaluator

        runner = ExperimentRunner(dataset=dataset)

        results = runner.run(crew=mock_crew)

        result, = results.results

        assert result.identifier == "test-case-2"
        assert result.inputs == {"query": "Test query 2"}
        assert isinstance(result.expected_score, dict)
        assert "goal_alignment" in result.expected_score.keys()
        assert "unknown_metric" in result.expected_score.keys()
        assert result.passed is True

    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {
                "identifier": "test-case-2",
                "inputs": {"query": "Test query 2"},
                "expected_score": {"goal_alignment": 7}
            }
        ]

        mock_evaluator = MagicMock()
        mock_create_evaluator["Test Agent"].metrics = {
            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
                score=9,
                feedback="Test feedback for goal alignment",
                raw_response="Test raw response for goal alignment"
            )
        }
        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
        mock_evaluator.reset_iterations_results = MagicMock()
        mock_create_evaluator.return_value = mock_evaluator

        runner = ExperimentRunner(dataset=dataset)

        results = runner.run(crew=mock_crew)
        result, = results.results

        assert result.identifier == "test-case-2"
        assert result.inputs == {"query": "Test query 2"}
        assert isinstance(result.expected_score, dict)
        assert "goal_alignment" in result.expected_score.keys()
        assert result.passed is True

    @patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
    def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
        dataset = [
            {
                "identifier": "test-case-2",
                "inputs": {"query": "Test query 2"},
                "expected_score": {"unknown_metric": 7}
            }
        ]

        mock_evaluator = MagicMock()
        mock_create_evaluator["Test Agent"].metrics = {
            MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
                score=5,
                feedback="Test feedback for goal alignment",
                raw_response="Test raw response for goal alignment"
            )
        }
        mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
        mock_evaluator.reset_iterations_results = MagicMock()
        mock_create_evaluator.return_value = mock_evaluator

        runner = ExperimentRunner(dataset=dataset)

        results = runner.run(crew=mock_crew)
        result, = results.results

        assert result.identifier == "test-case-2"
        assert result.inputs == {"query": "Test query 2"}
        assert isinstance(result.expected_score, dict)
        assert "unknown_metric" in result.expected_score.keys()
        assert result.passed is False