Mirror of https://github.com/crewAIInc/crewAI.git, synced 2026-01-06 14:48:29 +00:00.

Compare commits: 8 commits, devin/1752... → lg-agent-p
(only the commit SHAs survived extraction; author and date columns were empty)

- cbe570088e
- 6ebb6c9b63
- 53f674be60
- 11717a5213
- b6d699f764
- 5b15061b87
- 1b6b2b36d9
- 3ada4053bd

.github/workflows/regression-tests.yml (vendored, new file, 75 lines)
@@ -0,0 +1,75 @@
name: Regression Tests

on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'Branch to run tests on'
        required: true
        default: 'main'
        type: string

permissions:
  contents: write

env:
  OPENAI_API_KEY: fake-api-key
  PYTHONUNBUFFERED: 1

jobs:
  regression-tests:
    name: Regression - ${{ github.event.inputs.branch }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.branch }}
          fetch-depth: 0

      - name: Display execution info
        run: |
          echo "🚀 Running Regression Tests"
          echo "📂 Branch: ${{ github.event.inputs.branch }}"
          echo "📊 Current commit: $(git rev-parse --short HEAD)"

      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
          cache-dependency-glob: |
            **/pyproject.toml
            **/uv.lock

      - name: Set up Python 3.13
        run: uv python install 3.13

      - name: Install the project
        run: uv sync --dev --all-extras

      - name: Install SQLite with FTS5 support
        run: |
          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
          # module is compiled with FTS5 support by default.
          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support

          # Install pysqlite3-binary, which has FTS5 support
          uv pip install pysqlite3-binary
          # Create a sitecustomize.py to override sqlite3 with pysqlite3
          mkdir -p .pytest_sqlite_override
          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
          # Test FTS5 availability
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"

      - name: Run Regression Tests
        run: |
          PYTHONPATH=.pytest_sqlite_override uv run pytest \
            --block-network \
            --timeout=30 \
            -vv \
            --durations=10 \
            -n auto \
            --maxfail=5 \
            tests/regression
@@ -9,12 +9,7 @@
},
"favicon": "/images/favicon.svg",
"contextual": {
"options": [
"copy",
"view",
"chatgpt",
"claude"
]
"options": ["copy", "view", "chatgpt", "claude"]
},
"navigation": {
"languages": [
@@ -55,32 +50,22 @@
"groups": [
{
"group": "Get Started",
"pages": [
"en/introduction",
"en/installation",
"en/quickstart"
]
"pages": ["en/introduction", "en/installation", "en/quickstart"]
},
{
"group": "Guides",
"pages": [
{
"group": "Strategy",
"pages": [
"en/guides/concepts/evaluating-use-cases"
]
"pages": ["en/guides/concepts/evaluating-use-cases"]
},
{
"group": "Agents",
"pages": [
"en/guides/agents/crafting-effective-agents"
]
"pages": ["en/guides/agents/crafting-effective-agents"]
},
{
"group": "Crews",
"pages": [
"en/guides/crews/first-crew"
]
"pages": ["en/guides/crews/first-crew"]
},
{
"group": "Flows",
@@ -94,7 +79,6 @@
"pages": [
"en/guides/advanced/customizing-prompts",
"en/guides/advanced/fingerprinting"

]
}
]
@@ -241,6 +225,7 @@
"en/observability/langtrace",
"en/observability/maxim",
"en/observability/mlflow",
"en/observability/neatlogs",
"en/observability/openlit",
"en/observability/opik",
"en/observability/patronus-evaluation",
@@ -274,9 +259,7 @@
},
{
"group": "Telemetry",
"pages": [
"en/telemetry"
]
"pages": ["en/telemetry"]
}
]
},
@@ -285,9 +268,7 @@
"groups": [
{
"group": "Getting Started",
"pages": [
"en/enterprise/introduction"
]
"pages": ["en/enterprise/introduction"]
},
{
"group": "Features",
@@ -342,9 +323,7 @@
},
{
"group": "Resources",
"pages": [
"en/enterprise/resources/frequently-asked-questions"
]
"pages": ["en/enterprise/resources/frequently-asked-questions"]
}
]
},
@@ -353,9 +332,7 @@
"groups": [
{
"group": "Getting Started",
"pages": [
"en/api-reference/introduction"
]
"pages": ["en/api-reference/introduction"]
},
{
"group": "Endpoints",
@@ -365,16 +342,13 @@
},
{
"tab": "Examples",
"groups": [
"groups": [
{
"group": "Examples",
"pages": [
"en/examples/example"
]
"pages": ["en/examples/example"]
}
]
}

]
},
{
@@ -425,21 +399,15 @@
"pages": [
{
"group": "Estratégia",
"pages": [
"pt-BR/guides/concepts/evaluating-use-cases"
]
"pages": ["pt-BR/guides/concepts/evaluating-use-cases"]
},
{
"group": "Agentes",
"pages": [
"pt-BR/guides/agents/crafting-effective-agents"
]
"pages": ["pt-BR/guides/agents/crafting-effective-agents"]
},
{
"group": "Crews",
"pages": [
"pt-BR/guides/crews/first-crew"
]
"pages": ["pt-BR/guides/crews/first-crew"]
},
{
"group": "Flows",
@@ -632,9 +600,7 @@
},
{
"group": "Telemetria",
"pages": [
"pt-BR/telemetry"
]
"pages": ["pt-BR/telemetry"]
}
]
},
@@ -643,9 +609,7 @@
"groups": [
{
"group": "Começando",
"pages": [
"pt-BR/enterprise/introduction"
]
"pages": ["pt-BR/enterprise/introduction"]
},
{
"group": "Funcionalidades",
@@ -710,9 +674,7 @@
"groups": [
{
"group": "Começando",
"pages": [
"pt-BR/api-reference/introduction"
]
"pages": ["pt-BR/api-reference/introduction"]
},
{
"group": "Endpoints",
@@ -722,16 +684,13 @@
},
{
"tab": "Exemplos",
"groups": [
"groups": [
{
"group": "Exemplos",
"pages": [
"pt-BR/examples/example"
]
"pages": ["pt-BR/examples/example"]
}
]
}

]
}
]
docs/en/observability/neatlogs.mdx (new file, 140 lines)
@@ -0,0 +1,140 @@
---
title: Neatlogs Integration
description: Understand, debug, and share your CrewAI agent runs
icon: magnifying-glass-chart
---

# Introduction

Neatlogs helps you **see what your agent did**, **why**, and **share it**.

It captures every step: thoughts, tool calls, responses, and evaluations. No raw logs, just clear, structured traces. Great for debugging and collaboration.

---

## Why use Neatlogs?

CrewAI agents use multiple tools and reasoning steps. When something goes wrong, you need context, not just errors.

Neatlogs lets you:

- Follow the full decision path
- Add feedback directly on steps
- Chat with the trace using the AI assistant
- Share runs publicly for feedback
- Turn insights into tasks

All in one place.

Manage your traces effortlessly:

![Neatlogs trace view](/images/neatlogs-1.png)
![Neatlogs trace view](/images/neatlogs-2.png)

The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to debug.

![Neatlogs trace comments](/images/neatlogs-3.png)
![Neatlogs trace evaluation](/images/neatlogs-4.png)
![Neatlogs trace sharing](/images/neatlogs-5.png)

---

## Core Features

- **Trace Viewer**: Track thoughts, tools, and decisions in sequence
- **Inline Comments**: Tag teammates on any trace step
- **Feedback & Evaluation**: Mark outputs as correct or incorrect
- **Error Highlighting**: Automatic flagging of API/tool failures
- **Task Conversion**: Convert comments into assigned tasks
- **Ask the Trace (AI)**: Chat with your trace using the Neatlogs AI bot
- **Public Sharing**: Publish trace links to your community

---

## Quick Setup with CrewAI

<Steps>
<Step title="Sign Up & Get API Key">
Visit [neatlogs.com](https://neatlogs.com/?utm_source=crewAI-docs), create a project, and copy the API key.
</Step>
<Step title="Install SDK">
```bash
pip install neatlogs
```
(Latest version 0.8.0, Python 3.8+; MIT license)
</Step>
<Step title="Initialize Neatlogs">
Before starting Crew agents, add:

```python
import neatlogs

neatlogs.init("YOUR_PROJECT_API_KEY")
```

Agents run as usual. Neatlogs captures everything automatically.

</Step>
</Steps>
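Putting the steps together, a minimal end-to-end sketch looks like this. The agent and task definitions are placeholders, and the API key is assumed to come from your Neatlogs project:

```python
import neatlogs
from crewai import Agent, Crew, Task

# Initialize Neatlogs before any agents run so the full trace is captured.
neatlogs.init("YOUR_PROJECT_API_KEY")

# Placeholder single-agent crew, only to show where initialization fits.
writer = Agent(role="Writer", goal="Draft a short summary", backstory="...")
task = Task(
    description="Summarize the latest run.",
    expected_output="A short paragraph.",
    agent=writer,
)

Crew(agents=[writer], tasks=[task]).kickoff()
# The run now appears as a trace in your Neatlogs project dashboard.
```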
---

## Under the Hood

According to the GitHub repository, Neatlogs:

- Captures thoughts, tool calls, responses, errors, and token stats
- Supports AI-powered task generation and robust evaluation workflows

All with just two lines of code.

---

## Watch It Work

### 🔍 Full Demo (4 min)

<iframe
  width="100%"
  height="315"
  src="https://www.youtube.com/embed/8KDme9T2I7Q?si=b8oHteaBwFNs_Duk"
  title="YouTube video player"
  frameBorder="0"
  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
  allowFullScreen
></iframe>

### ⚙️ CrewAI Integration (30 s)

<iframe
  className="w-full aspect-video rounded-xl"
  src="https://www.loom.com/embed/9c78b552af43452bb3e4783cb8d91230?sid=e9d7d370-a91a-49b0-809e-2f375d9e801d"
  title="Loom video player"
  frameBorder="0"
  allowFullScreen
></iframe>

---

## Links & Support

- 📘 [Neatlogs Docs](https://docs.neatlogs.com/)
- 🔐 [Dashboard & API Key](https://app.neatlogs.com/)
- 🐦 [Follow on Twitter](https://twitter.com/neatlogs)
- 📧 Contact: hello@neatlogs.com
- 🛠 [GitHub SDK](https://github.com/NeatLogs/neatlogs)

---

## TL;DR

With just:

```bash
pip install neatlogs
```

```python
import neatlogs

neatlogs.init("YOUR_API_KEY")
```

you can capture, understand, share, and act on your CrewAI agent runs in seconds.
No setup overhead. Full trace transparency. Full team collaboration.
BIN docs/images/neatlogs-1.png (new file, binary not shown; 222 KiB)
BIN docs/images/neatlogs-2.png (new file, binary not shown; 329 KiB)
BIN docs/images/neatlogs-3.png (new file, binary not shown; 590 KiB)
BIN docs/images/neatlogs-4.png (new file, binary not shown; 216 KiB)
BIN docs/images/neatlogs-5.png (new file, binary not shown; 277 KiB)
@@ -137,3 +137,6 @@ exclude = [
    "docs/**",
    "docs/",
]

[tool.pytest.ini_options]
norecursedirs = ["tests/regression"]
@@ -1313,7 +1313,6 @@ class Crew(FlowTrackable, BaseModel):
        n_iterations: int,
        eval_llm: Union[str, InstanceOf[BaseLLM]],
        inputs: Optional[Dict[str, Any]] = None,
        include_agent_eval: Optional[bool] = False
    ) -> None:
        """Test and evaluate the Crew with the given inputs for n iterations concurrently using concurrent.futures."""
        try:
@@ -1333,28 +1332,13 @@ class Crew(FlowTrackable, BaseModel):
            )
        test_crew = self.copy()

        # TODO: Refactor to use a single Evaluator Manager class
        evaluator = CrewEvaluator(test_crew, llm_instance)

        if include_agent_eval:
            from crewai.evaluation import create_default_evaluator
            agent_evaluator = create_default_evaluator(crew=test_crew)

        for i in range(1, n_iterations + 1):
            evaluator.set_iteration(i)

            if include_agent_eval:
                agent_evaluator.set_iteration(i)

            test_crew.kickoff(inputs=inputs)

            # TODO: Refactor to use ListenerEvents instead of triggering each iteration manually
            if include_agent_eval:
                agent_evaluator.evaluate_current_iteration()

        evaluator.print_crew_evaluation_result()
        if include_agent_eval:
            agent_evaluator.get_agent_evaluation(include_evaluation_feedback=True)

        crewai_event_bus.emit(
            self,
@@ -1,178 +0,0 @@
from crewai.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.agent import Agent
from crewai.task import Task
from crewai.evaluation.evaluation_display import EvaluationDisplayFormatter

from typing import Any, Dict
from collections import defaultdict
from crewai.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.crew import Crew
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter

class AgentEvaluator:
    def __init__(
        self,
        evaluators: Sequence[BaseEvaluator] | None = None,
        crew: Crew | None = None,
    ):
        self.crew: Crew | None = crew
        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

        self.agent_evaluators: dict[str, Sequence[BaseEvaluator] | None] = {}
        if crew is not None:
            assert crew and crew.agents is not None
            for agent in crew.agents:
                self.agent_evaluators[str(agent.id)] = self.evaluators

        self.callback = create_evaluation_callbacks()
        self.console_formatter = ConsoleFormatter()
        self.display_formatter = EvaluationDisplayFormatter()

        self.iteration = 1
        self.iterations_results: dict[int, dict[str, list[AgentEvaluationResult]]] = {}

    def set_iteration(self, iteration: int) -> None:
        self.iteration = iteration

    def evaluate_current_iteration(self) -> dict[str, list[AgentEvaluationResult]]:
        if not self.crew:
            raise ValueError("Cannot evaluate: no crew was provided to the evaluator.")

        if not self.callback:
            raise ValueError("Cannot evaluate: no callback was set. Use set_callback() method first.")

        from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
        evaluation_results: defaultdict[str, list[AgentEvaluationResult]] = defaultdict(list)

        total_evals = 0
        for agent in self.crew.agents:
            for task in self.crew.tasks:
                if task.agent and task.agent.id == agent.id and self.agent_evaluators.get(str(agent.id)):
                    total_evals += 1

        with Progress(
            SpinnerColumn(),
            TextColumn("[bold blue]{task.description}[/bold blue]"),
            BarColumn(),
            TextColumn("{task.percentage:.0f}% completed"),
            console=self.console_formatter.console
        ) as progress:
            eval_task = progress.add_task(f"Evaluating agents (iteration {self.iteration})...", total=total_evals)

            for agent in self.crew.agents:
                evaluator = self.agent_evaluators.get(str(agent.id))
                if not evaluator:
                    continue

                for task in self.crew.tasks:

                    if task.agent and str(task.agent.id) != str(agent.id):
                        continue

                    trace = self.callback.get_trace(str(agent.id), str(task.id))
                    if not trace:
                        self.console_formatter.print(f"[yellow]Warning: No trace found for agent {agent.role} on task {task.description[:30]}...[/yellow]")
                        progress.update(eval_task, advance=1)
                        continue

                    with crewai_event_bus.scoped_handlers():
                        result = self.evaluate(
                            agent=agent,
                            task=task,
                            execution_trace=trace,
                            final_output=task.output
                        )
                        evaluation_results[agent.role].append(result)
                        progress.update(eval_task, advance=1)

        self.iterations_results[self.iteration] = evaluation_results
        return evaluation_results

    def get_evaluation_results(self):
        if self.iteration in self.iterations_results:
            return self.iterations_results[self.iteration]

        return self.evaluate_current_iteration()

    def display_results_with_iterations(self):
        self.display_formatter.display_summary_results(self.iterations_results)

    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = False):
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
            for agent_role, results in task_results.items():
                if not results:
                    continue

                agent_id = results[0].agent_id

                aggregated_result = self.display_formatter._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=results,
                    strategy=strategy
                )

                agent_results[agent_role] = aggregated_result

        if self.iteration == max(self.iterations_results.keys()):
            self.display_results_with_iterations()

        if include_evaluation_feedback:
            self.display_evaluation_with_feedback()

        return agent_results

    def display_evaluation_with_feedback(self):
        self.display_formatter.display_evaluation_with_feedback(self.iterations_results)

    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any
    ) -> AgentEvaluationResult:
        result = AgentEvaluationResult(
            agent_id=str(agent.id),
            task_id=str(task.id)
        )
        assert self.evaluators is not None
        for evaluator in self.evaluators:
            try:
                score = evaluator.evaluate(
                    agent=agent,
                    task=task,
                    execution_trace=execution_trace,
                    final_output=final_output
                )
                result.metrics[evaluator.metric_category] = score
            except Exception as e:
                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")

        return result

def create_default_evaluator(crew, llm=None):
    from crewai.evaluation import (
        GoalAlignmentEvaluator,
        SemanticQualityEvaluator,
        ToolSelectionEvaluator,
        ParameterExtractionEvaluator,
        ToolInvocationEvaluator,
        ReasoningEfficiencyEvaluator
    )

    evaluators = [
        GoalAlignmentEvaluator(llm=llm),
        SemanticQualityEvaluator(llm=llm),
        ToolSelectionEvaluator(llm=llm),
        ParameterExtractionEvaluator(llm=llm),
        ToolInvocationEvaluator(llm=llm),
        ReasoningEfficiencyEvaluator(llm=llm),
    ]

    return AgentEvaluator(evaluators=evaluators, crew=crew)
src/crewai/experimental/__init__.py (new file, 40 lines)
@@ -0,0 +1,40 @@
from crewai.experimental.evaluation import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult,
    SemanticQualityEvaluator,
    GoalAlignmentEvaluator,
    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator,
    EvaluationTraceCallback,
    create_evaluation_callbacks,
    AgentEvaluator,
    create_default_evaluator,
    ExperimentRunner,
    ExperimentResults,
    ExperimentResult,
)


__all__ = [
    "BaseEvaluator",
    "EvaluationScore",
    "MetricCategory",
    "AgentEvaluationResult",
    "SemanticQualityEvaluator",
    "GoalAlignmentEvaluator",
    "ReasoningEfficiencyEvaluator",
    "ToolSelectionEvaluator",
    "ParameterExtractionEvaluator",
    "ToolInvocationEvaluator",
    "EvaluationTraceCallback",
    "create_evaluation_callbacks",
    "AgentEvaluator",
    "create_default_evaluator",
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]
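For orientation, the module above simply re-exports the experimental evaluation API at the package root, so user code can import from `crewai.experimental` directly. A minimal sketch (experimental API, paths may change between releases):

```python
from crewai.experimental import (
    AgentEvaluator,
    ExperimentRunner,
    create_default_evaluator,
)
```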
@@ -1,40 +1,35 @@
from crewai.evaluation.base_evaluator import (
from crewai.experimental.evaluation.base_evaluator import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
    AgentEvaluationResult
)

from crewai.evaluation.metrics.semantic_quality_metrics import (
    SemanticQualityEvaluator
)

from crewai.evaluation.metrics.goal_metrics import (
    GoalAlignmentEvaluator
)

from crewai.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator
)


from crewai.evaluation.metrics.tools_metrics import (
from crewai.experimental.evaluation.metrics import (
    SemanticQualityEvaluator,
    GoalAlignmentEvaluator,
    ReasoningEfficiencyEvaluator,
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)

from crewai.evaluation.evaluation_listener import (
from crewai.experimental.evaluation.evaluation_listener import (
    EvaluationTraceCallback,
    create_evaluation_callbacks
)


from crewai.evaluation.agent_evaluator import (
from crewai.experimental.evaluation.agent_evaluator import (
    AgentEvaluator,
    create_default_evaluator
)

from crewai.experimental.evaluation.experiment import (
    ExperimentRunner,
    ExperimentResults,
    ExperimentResult
)

__all__ = [
    "BaseEvaluator",
    "EvaluationScore",
@@ -49,5 +44,8 @@ __all__ = [
    "EvaluationTraceCallback",
    "create_evaluation_callbacks",
    "AgentEvaluator",
    "create_default_evaluator"
]
    "create_default_evaluator",
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]
src/crewai/experimental/evaluation/agent_evaluator.py (new file, 217 lines)
@@ -0,0 +1,217 @@
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter

from typing import Any
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.events.task_events import TaskCompletedEvent
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
import threading

class ExecutionState:
    def __init__(self):
        self.traces = {}
        self.current_agent_id = None
        self.current_task_id = None
        self.iteration = 1
        self.iterations_results = {}
        self.agent_evaluators = {}

class AgentEvaluator:
    def __init__(
        self,
        agents: list[Agent],
        evaluators: Sequence[BaseEvaluator] | None = None,
    ):
        self.agents: list[Agent] = agents
        self.evaluators: Sequence[BaseEvaluator] | None = evaluators

        self.callback = create_evaluation_callbacks()
        self.console_formatter = ConsoleFormatter()
        self.display_formatter = EvaluationDisplayFormatter()

        self._thread_local: threading.local = threading.local()

        for agent in self.agents:
            self._execution_state.agent_evaluators[str(agent.id)] = self.evaluators

        self._subscribe_to_events()

    @property
    def _execution_state(self) -> ExecutionState:
        if not hasattr(self._thread_local, 'execution_state'):
            self._thread_local.execution_state = ExecutionState()
        return self._thread_local.execution_state

    def _subscribe_to_events(self) -> None:
        crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)

    def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
        assert event.task is not None
        agent = event.task.agent
        if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
            state = ExecutionState()
            state.current_agent_id = str(agent.id)
            state.current_task_id = str(event.task.id)

            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)

            if not trace:
                return

            result = self.evaluate(
                agent=agent,
                task=event.task,
                execution_trace=trace,
                final_output=event.output,
                state=state
            )

            current_iteration = self._execution_state.iteration
            if current_iteration not in self._execution_state.iterations_results:
                self._execution_state.iterations_results[current_iteration] = {}

            if agent.role not in self._execution_state.iterations_results[current_iteration]:
                self._execution_state.iterations_results[current_iteration][agent.role] = []

            self._execution_state.iterations_results[current_iteration][agent.role].append(result)

    def _handle_lite_agent_completed(self, source: object, event: LiteAgentExecutionCompletedEvent) -> None:
        agent_info = event.agent_info
        agent_id = str(agent_info["id"])

        if agent_id in self._execution_state.agent_evaluators:
            state = ExecutionState()
            state.current_agent_id = agent_id
            state.current_task_id = "lite_task"

            target_agent = None
            for agent in self.agents:
                if str(agent.id) == agent_id:
                    target_agent = agent
                    break

            if not target_agent:
                return

            trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)

            if not trace:
                return

            result = self.evaluate(
                agent=target_agent,
                execution_trace=trace,
                final_output=event.output,
                state=state
            )

            current_iteration = self._execution_state.iteration
            if current_iteration not in self._execution_state.iterations_results:
                self._execution_state.iterations_results[current_iteration] = {}

            agent_role = target_agent.role
            if agent_role not in self._execution_state.iterations_results[current_iteration]:
                self._execution_state.iterations_results[current_iteration][agent_role] = []

            self._execution_state.iterations_results[current_iteration][agent_role].append(result)

    def set_iteration(self, iteration: int) -> None:
        self._execution_state.iteration = iteration

    def reset_iterations_results(self) -> None:
        self._execution_state.iterations_results = {}

    def get_evaluation_results(self) -> dict[str, list[AgentEvaluationResult]]:
        if self._execution_state.iterations_results and self._execution_state.iteration in self._execution_state.iterations_results:
            return self._execution_state.iterations_results[self._execution_state.iteration]
        return {}

    def display_results_with_iterations(self) -> None:
        self.display_formatter.display_summary_results(self._execution_state.iterations_results)

    def get_agent_evaluation(self, strategy: AggregationStrategy = AggregationStrategy.SIMPLE_AVERAGE, include_evaluation_feedback: bool = True) -> dict[str, AgentAggregatedEvaluationResult]:
        agent_results = {}
        with crewai_event_bus.scoped_handlers():
            task_results = self.get_evaluation_results()
            for agent_role, results in task_results.items():
                if not results:
                    continue

                agent_id = results[0].agent_id

                aggregated_result = self.display_formatter._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=results,
                    strategy=strategy
                )

                agent_results[agent_role] = aggregated_result

        if self._execution_state.iterations_results and self._execution_state.iteration == max(self._execution_state.iterations_results.keys(), default=0):
            self.display_results_with_iterations()

        if include_evaluation_feedback:
            self.display_evaluation_with_feedback()

        return agent_results

    def display_evaluation_with_feedback(self) -> None:
        self.display_formatter.display_evaluation_with_feedback(self._execution_state.iterations_results)

    def evaluate(
        self,
        agent: Agent,
        execution_trace: dict[str, Any],
        final_output: Any,
        state: ExecutionState,
        task: Task | None = None,
    ) -> AgentEvaluationResult:
        result = AgentEvaluationResult(
            agent_id=state.current_agent_id or str(agent.id),
            task_id=state.current_task_id or (str(task.id) if task else "unknown_task")
        )

        assert self.evaluators is not None
        for evaluator in self.evaluators:
            try:
                score = evaluator.evaluate(
                    agent=agent,
                    task=task,
                    execution_trace=execution_trace,
                    final_output=final_output
                )
                result.metrics[evaluator.metric_category] = score
            except Exception as e:
                self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")

        return result

def create_default_evaluator(agents: list[Agent], llm: None = None):
    from crewai.experimental.evaluation import (
        GoalAlignmentEvaluator,
        SemanticQualityEvaluator,
        ToolSelectionEvaluator,
        ParameterExtractionEvaluator,
        ToolInvocationEvaluator,
        ReasoningEfficiencyEvaluator
    )

    evaluators = [
        GoalAlignmentEvaluator(llm=llm),
        SemanticQualityEvaluator(llm=llm),
        ToolSelectionEvaluator(llm=llm),
        ParameterExtractionEvaluator(llm=llm),
        ToolInvocationEvaluator(llm=llm),
        ReasoningEfficiencyEvaluator(llm=llm),
    ]

    return AgentEvaluator(evaluators=evaluators, agents=agents)
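Since the file above is new, a short usage sketch may help. It is an illustration based only on the code shown here (the experimental API may change), and the agent/task definitions are hypothetical:

```python
from crewai import Agent, Crew, Task
from crewai.experimental.evaluation import create_default_evaluator

# Hypothetical single-agent crew used only to illustrate the evaluator wiring.
researcher = Agent(role="Researcher", goal="Summarize a topic", backstory="...")
task = Task(
    description="Summarize the topic in 3 bullet points.",
    expected_output="Three bullet points.",
    agent=researcher,
)
crew = Crew(agents=[researcher], tasks=[task])

# The evaluator subscribes to task/agent completion events when constructed,
# so results accumulate as the crew runs.
evaluator = create_default_evaluator(agents=crew.agents)
crew.kickoff()

# Aggregated per-agent scores for the current iteration.
results = evaluator.get_agent_evaluation(include_evaluation_feedback=False)
```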
@@ -57,9 +57,9 @@ class BaseEvaluator(abc.ABC):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        pass
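For context, the signature change above makes `task` optional and moves it after `final_output`. A custom evaluator written against the new signature might look like the following sketch; the class name and scoring logic are made up, and the `MetricCategory` member and `EvaluationScore` fields are assumptions based on how they are used elsewhere in this diff:

```python
from typing import Any, Dict

from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.base_evaluator import (
    BaseEvaluator,
    EvaluationScore,
    MetricCategory,
)


class OutputLengthEvaluator(BaseEvaluator):
    """Toy evaluator: rewards outputs that are neither empty nor excessively long."""

    @property
    def metric_category(self) -> MetricCategory:
        # Assumed member name; reuse whichever category fits your metric.
        return MetricCategory.SEMANTIC_QUALITY

    def evaluate(
        self,
        agent: Agent,
        execution_trace: Dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        text = str(final_output or "")
        score = 10.0 if 0 < len(text) <= 4000 else 3.0
        # EvaluationScore is assumed to carry `score` and `feedback`, as the display code reads both.
        return EvaluationScore(score=score, feedback=f"Output length: {len(text)} characters")
```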
@@ -3,8 +3,8 @@ from typing import Dict, Any, List
from rich.table import Table
from rich.box import HEAVY_EDGE, ROUNDED
from collections.abc import Sequence
from crewai.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
from crewai.evaluation import EvaluationScore
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, AggregationStrategy, AgentEvaluationResult, MetricCategory
from crewai.experimental.evaluation import EvaluationScore
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.llm_utils import create_llm

@@ -17,7 +17,6 @@ class EvaluationDisplayFormatter:
            self.console_formatter.print("[yellow]No evaluation results to display[/yellow]")
            return

        # Get all agent roles across all iterations
        all_agent_roles: set[str] = set()
        for iter_results in iterations_results.values():
            all_agent_roles.update(iter_results.keys())
@@ -25,7 +24,6 @@ class EvaluationDisplayFormatter:
        for agent_role in sorted(all_agent_roles):
            self.console_formatter.print(f"\n[bold cyan]Agent: {agent_role}[/bold cyan]")

            # Process each iteration
            for iter_num, results in sorted(iterations_results.items()):
                if agent_role not in results or not results[agent_role]:
                    continue
@@ -33,23 +31,19 @@ class EvaluationDisplayFormatter:
                agent_results = results[agent_role]
                agent_id = agent_results[0].agent_id

                # Aggregate results for this agent in this iteration
                aggregated_result = self._aggregate_agent_results(
                    agent_id=agent_id,
                    agent_role=agent_role,
                    results=agent_results,
                )

                # Display iteration header
                self.console_formatter.print(f"\n[bold]Iteration {iter_num}[/bold]")

                # Create table for this iteration
                table = Table(box=ROUNDED)
                table.add_column("Metric", style="cyan")
                table.add_column("Score (1-10)", justify="center")
                table.add_column("Feedback", style="green")

                # Add metrics to table
                if aggregated_result.metrics:
                    for metric, evaluation_score in aggregated_result.metrics.items():
                        score = evaluation_score.score
@@ -91,7 +85,6 @@ class EvaluationDisplayFormatter:
                    "Overall agent evaluation score"
                )

                # Print the table for this iteration
                self.console_formatter.print(table)

    def display_summary_results(self, iterations_results: Dict[int, Dict[str, List[AgentAggregatedEvaluationResult]]]):
@@ -248,7 +241,6 @@ class EvaluationDisplayFormatter:
        feedback_summary = None
        if feedbacks:
            if len(feedbacks) > 1:
                # Use the summarization method for multiple feedbacks
                feedback_summary = self._summarize_feedbacks(
                    agent_role=agent_role,
                    metric=category.title(),
@@ -307,7 +299,7 @@ class EvaluationDisplayFormatter:
            strategy_guidance = "Focus on the highest-scoring aspects and strengths demonstrated."
        elif strategy == AggregationStrategy.WORST_PERFORMANCE:
            strategy_guidance = "Focus on areas that need improvement and common issues across tasks."
        else: # Default/average strategies
        else:
            strategy_guidance = "Provide a balanced analysis of strengths and weaknesses across all tasks."

        prompt = [
@@ -9,7 +9,9 @@ from crewai.utilities.events.base_event_listener import BaseEventListener
from crewai.utilities.events.crewai_event_bus import CrewAIEventsBus
from crewai.utilities.events.agent_events import (
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent
    AgentExecutionCompletedEvent,
    LiteAgentExecutionStartedEvent,
    LiteAgentExecutionCompletedEvent
)
from crewai.utilities.events.tool_usage_events import (
    ToolUsageFinishedEvent,
@@ -52,10 +54,18 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_agent_started(source, event: AgentExecutionStartedEvent):
            self.on_agent_start(event.agent, event.task)

        @event_bus.on(LiteAgentExecutionStartedEvent)
        def on_lite_agent_started(source, event: LiteAgentExecutionStartedEvent):
            self.on_lite_agent_start(event.agent_info)

        @event_bus.on(AgentExecutionCompletedEvent)
        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
            self.on_agent_finish(event.agent, event.task, event.output)

        @event_bus.on(LiteAgentExecutionCompletedEvent)
        def on_lite_agent_completed(source, event: LiteAgentExecutionCompletedEvent):
            self.on_lite_agent_finish(event.output)

        @event_bus.on(ToolUsageFinishedEvent)
        def on_tool_completed(source, event: ToolUsageFinishedEvent):
            self.on_tool_use(event.tool_name, event.tool_args, event.output, success=True)
@@ -88,19 +98,38 @@ class EvaluationTraceCallback(BaseEventListener):
        def on_llm_call_completed(source, event: LLMCallCompletedEvent):
            self.on_llm_call_end(event.messages, event.response)

    def on_lite_agent_start(self, agent_info: dict[str, Any]):
        self.current_agent_id = agent_info['id']
        self.current_task_id = "lite_task"

        trace_key = f"{self.current_agent_id}_{self.current_task_id}"
        self._init_trace(
            trace_key=trace_key,
            agent_id=self.current_agent_id,
            task_id=self.current_task_id,
            tool_uses=[],
            llm_calls=[],
            start_time=datetime.now(),
            final_output=None
        )

    def _init_trace(self, trace_key: str, **kwargs: Any):
        self.traces[trace_key] = kwargs

    def on_agent_start(self, agent: Agent, task: Task):
        self.current_agent_id = agent.id
        self.current_task_id = task.id

        trace_key = f"{agent.id}_{task.id}"
        self.traces[trace_key] = {
            "agent_id": agent.id,
            "task_id": task.id,
            "tool_uses": [],
            "llm_calls": [],
            "start_time": datetime.now(),
            "final_output": None
        }
        self._init_trace(
            trace_key=trace_key,
            agent_id=agent.id,
            task_id=task.id,
            tool_uses=[],
            llm_calls=[],
            start_time=datetime.now(),
            final_output=None
        )

    def on_agent_finish(self, agent: Agent, task: Task, output: Any):
        trace_key = f"{agent.id}_{task.id}"
@@ -108,9 +137,20 @@ class EvaluationTraceCallback(BaseEventListener):
            self.traces[trace_key]["final_output"] = output
            self.traces[trace_key]["end_time"] = datetime.now()

        self._reset_current()

    def _reset_current(self):
        self.current_agent_id = None
        self.current_task_id = None

    def on_lite_agent_finish(self, output: Any):
        trace_key = f"{self.current_agent_id}_lite_task"
        if trace_key in self.traces:
            self.traces[trace_key]["final_output"] = output
            self.traces[trace_key]["end_time"] = datetime.now()

        self._reset_current()

    def on_tool_use(self, tool_name: str, tool_args: dict[str, Any] | str, result: Any,
                    success: bool = True, error_type: str | None = None):
        if not self.current_agent_id or not self.current_task_id:
@@ -0,0 +1,8 @@
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult

__all__ = [
    "ExperimentRunner",
    "ExperimentResults",
    "ExperimentResult"
]
src/crewai/experimental/evaluation/experiment/result.py (new file, 122 lines)
@@ -0,0 +1,122 @@
import json
import os
from datetime import datetime, timezone
from typing import Any
from pydantic import BaseModel

class ExperimentResult(BaseModel):
    identifier: str
    inputs: dict[str, Any]
    score: int | dict[str, int | float]
    expected_score: int | dict[str, int | float]
    passed: bool
    agent_evaluations: dict[str, Any] | None = None

class ExperimentResults:
    def __init__(self, results: list[ExperimentResult], metadata: dict[str, Any] | None = None):
        self.results = results
        self.metadata = metadata or {}
        self.timestamp = datetime.now(timezone.utc)

        from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
        self.display = ExperimentResultsDisplay()

    def to_json(self, filepath: str | None = None) -> dict[str, Any]:
        data = {
            "timestamp": self.timestamp.isoformat(),
            "metadata": self.metadata,
            "results": [r.model_dump(exclude={"agent_evaluations"}) for r in self.results]
        }

        if filepath:
            with open(filepath, 'w') as f:
                json.dump(data, f, indent=2)
            self.display.console.print(f"[green]Results saved to {filepath}[/green]")

        return data

    def compare_with_baseline(self, baseline_filepath: str, save_current: bool = True, print_summary: bool = False) -> dict[str, Any]:
        baseline_runs = []

        if os.path.exists(baseline_filepath) and os.path.getsize(baseline_filepath) > 0:
            try:
                with open(baseline_filepath, 'r') as f:
                    baseline_data = json.load(f)

                if isinstance(baseline_data, dict) and "timestamp" in baseline_data:
                    baseline_runs = [baseline_data]
                elif isinstance(baseline_data, list):
                    baseline_runs = baseline_data
            except (json.JSONDecodeError, FileNotFoundError) as e:
                self.display.console.print(f"[yellow]Warning: Could not load baseline file: {str(e)}[/yellow]")

        if not baseline_runs:
            if save_current:
                current_data = self.to_json()
                with open(baseline_filepath, 'w') as f:
                    json.dump([current_data], f, indent=2)
                self.display.console.print(f"[green]Saved current results as new baseline to {baseline_filepath}[/green]")
            return {"is_baseline": True, "changes": {}}

        baseline_runs.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        latest_run = baseline_runs[0]

        comparison = self._compare_with_run(latest_run)

        if print_summary:
            self.display.comparison_summary(comparison, latest_run["timestamp"])

        if save_current:
            current_data = self.to_json()
            baseline_runs.append(current_data)
            with open(baseline_filepath, 'w') as f:
                json.dump(baseline_runs, f, indent=2)
            self.display.console.print(f"[green]Added current results to baseline file {baseline_filepath}[/green]")

        return comparison

    def _compare_with_run(self, baseline_run: dict[str, Any]) -> dict[str, Any]:
        baseline_results = baseline_run.get("results", [])

        baseline_lookup = {}
        for result in baseline_results:
            test_identifier = result.get("identifier")
            if test_identifier:
                baseline_lookup[test_identifier] = result

        improved = []
        regressed = []
        unchanged = []
        new_tests = []

        for result in self.results:
            test_identifier = result.identifier
            if not test_identifier or test_identifier not in baseline_lookup:
                new_tests.append(test_identifier)
                continue

            baseline_result = baseline_lookup[test_identifier]
            baseline_passed = baseline_result.get("passed", False)
            if result.passed and not baseline_passed:
                improved.append(test_identifier)
            elif not result.passed and baseline_passed:
                regressed.append(test_identifier)
            else:
                unchanged.append(test_identifier)

        missing_tests = []
        current_test_identifiers = {result.identifier for result in self.results}
        for result in baseline_results:
            test_identifier = result.get("identifier")
            if test_identifier and test_identifier not in current_test_identifiers:
                missing_tests.append(test_identifier)

        return {
            "improved": improved,
            "regressed": regressed,
            "unchanged": unchanged,
            "new_tests": new_tests,
            "missing_tests": missing_tests,
            "total_compared": len(improved) + len(regressed) + len(unchanged),
            "baseline_timestamp": baseline_run.get("timestamp", "unknown")
        }
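A short usage sketch of the baseline comparison above; the file path is illustrative, and the `ExperimentResults` instance is assumed to come from `ExperimentRunner.run()` (shown further below):

```python
# The first call seeds the baseline file; later calls append the new run and report changes.
comparison = experiment_results.compare_with_baseline("experiment_baseline.json", print_summary=True)

if comparison.get("regressed"):
    raise SystemExit(f"Regressions detected: {comparison['regressed']}")
```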
@@ -0,0 +1,70 @@
from typing import Dict, Any
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from crewai.experimental.evaluation.experiment.result import ExperimentResults

class ExperimentResultsDisplay:
    def __init__(self):
        self.console = Console()

    def summary(self, experiment_results: ExperimentResults):
        total = len(experiment_results.results)
        passed = sum(1 for r in experiment_results.results if r.passed)

        table = Table(title="Experiment Summary")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Test Cases", str(total))
        table.add_row("Passed", str(passed))
        table.add_row("Failed", str(total - passed))
        table.add_row("Success Rate", f"{(passed / total * 100):.1f}%" if total > 0 else "N/A")

        self.console.print(table)

    def comparison_summary(self, comparison: Dict[str, Any], baseline_timestamp: str):
        self.console.print(Panel(f"[bold]Comparison with baseline run from {baseline_timestamp}[/bold]",
                                 expand=False))

        table = Table(title="Results Comparison")
        table.add_column("Metric", style="cyan")
        table.add_column("Count", style="white")
        table.add_column("Details", style="dim")

        improved = comparison.get("improved", [])
        if improved:
            details = ", ".join([f"{test_identifier}" for test_identifier in improved[:3]])
            if len(improved) > 3:
                details += f" and {len(improved) - 3} more"
            table.add_row("✅ Improved", str(len(improved)), details)
        else:
            table.add_row("✅ Improved", "0", "")

        regressed = comparison.get("regressed", [])
        if regressed:
            details = ", ".join([f"{test_identifier}" for test_identifier in regressed[:3]])
            if len(regressed) > 3:
                details += f" and {len(regressed) - 3} more"
            table.add_row("❌ Regressed", str(len(regressed)), details, style="red")
        else:
            table.add_row("❌ Regressed", "0", "")

        unchanged = comparison.get("unchanged", [])
        table.add_row("⏺ Unchanged", str(len(unchanged)), "")

        new_tests = comparison.get("new_tests", [])
        if new_tests:
            details = ", ".join(new_tests[:3])
            if len(new_tests) > 3:
                details += f" and {len(new_tests) - 3} more"
            table.add_row("➕ New Tests", str(len(new_tests)), details)

        missing_tests = comparison.get("missing_tests", [])
        if missing_tests:
            details = ", ".join(missing_tests[:3])
            if len(missing_tests) > 3:
                details += f" and {len(missing_tests) - 3} more"
            table.add_row("➖ Missing Tests", str(len(missing_tests)), details)

        self.console.print(table)
src/crewai/experimental/evaluation/experiment/runner.py (new file, 125 lines)
@@ -0,0 +1,125 @@
from collections import defaultdict
from hashlib import md5
from typing import Any

from crewai import Crew, Agent
from crewai.experimental.evaluation import AgentEvaluator, create_default_evaluator
from crewai.experimental.evaluation.experiment.result_display import ExperimentResultsDisplay
from crewai.experimental.evaluation.experiment.result import ExperimentResults, ExperimentResult
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult

class ExperimentRunner:
    def __init__(self, dataset: list[dict[str, Any]]):
        self.dataset = dataset or []
        self.evaluator: AgentEvaluator | None = None
        self.display = ExperimentResultsDisplay()

    def run(self, crew: Crew | None = None, agents: list[Agent] | None = None, print_summary: bool = False) -> ExperimentResults:
        if crew and not agents:
            agents = crew.agents

        assert agents is not None
        self.evaluator = create_default_evaluator(agents=agents)

        results = []

        for test_case in self.dataset:
            self.evaluator.reset_iterations_results()
            result = self._run_test_case(test_case=test_case, crew=crew, agents=agents)
            results.append(result)

        experiment_results = ExperimentResults(results)

        if print_summary:
            self.display.summary(experiment_results)

        return experiment_results

    def _run_test_case(self, test_case: dict[str, Any], agents: list[Agent], crew: Crew | None = None) -> ExperimentResult:
        inputs = test_case["inputs"]
        expected_score = test_case["expected_score"]
        identifier = test_case.get("identifier") or md5(str(test_case).encode(), usedforsecurity=False).hexdigest()

        try:
            self.display.console.print(f"[dim]Running crew with input: {str(inputs)[:50]}...[/dim]")
            self.display.console.print("\n")
            if crew:
                crew.kickoff(inputs=inputs)
            else:
                for agent in agents:
                    agent.kickoff(**inputs)

            assert self.evaluator is not None
            agent_evaluations = self.evaluator.get_agent_evaluation()

            actual_score = self._extract_scores(agent_evaluations)

            passed = self._assert_scores(expected_score, actual_score)
            return ExperimentResult(
                identifier=identifier,
                inputs=inputs,
                score=actual_score,
                expected_score=expected_score,
                passed=passed,
                agent_evaluations=agent_evaluations
            )

        except Exception as e:
            self.display.console.print(f"[red]Error running test case: {str(e)}[/red]")
            return ExperimentResult(
                identifier=identifier,
                inputs=inputs,
                score=0,
                expected_score=expected_score,
                passed=False
            )

    def _extract_scores(self, agent_evaluations: dict[str, AgentAggregatedEvaluationResult]) -> float | dict[str, float]:
        all_scores: dict[str, list[float]] = defaultdict(list)
        for evaluation in agent_evaluations.values():
            for metric_name, score in evaluation.metrics.items():
                if score.score is not None:
                    all_scores[metric_name.value].append(score.score)

        avg_scores = {m: sum(s)/len(s) for m, s in all_scores.items()}

        if len(avg_scores) == 1:
            return list(avg_scores.values())[0]

        return avg_scores

    def _assert_scores(self, expected: float | dict[str, float],
                       actual: float | dict[str, float]) -> bool:
        """
        Compare expected and actual scores, and return whether the test case passed.

        The rules for comparison are as follows:
        - If both expected and actual scores are single numbers, the actual score must be >= expected.
        - If expected is a single number and actual is a dict, compare against the average of actual values.
        - If expected is a dict and actual is a single number, actual must be >= all expected values.
        - If both are dicts, actual must have matching keys with values >= expected values.
        """

        if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
            return actual >= expected

        if isinstance(expected, dict) and isinstance(actual, (int, float)):
            return all(actual >= exp_score for exp_score in expected.values())

        if isinstance(expected, (int, float)) and isinstance(actual, dict):
            if not actual:
                return False
            avg_score = sum(actual.values()) / len(actual)
            return avg_score >= expected

        if isinstance(expected, dict) and isinstance(actual, dict):
            if not expected:
                return True
            matching_keys = set(expected.keys()) & set(actual.keys())
            if not matching_keys:
                return False

            # All matching keys must have actual >= expected
            return all(actual[key] >= expected[key] for key in matching_keys)

        return False
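For reference, here is how the runner above could be driven end to end. The dataset contents, expected score, and crew are hypothetical and only follow the field names used in `_run_test_case`:

```python
from crewai.experimental.evaluation import ExperimentRunner

# Hypothetical dataset: each case needs "inputs" and "expected_score";
# "identifier" is optional (otherwise an md5 hash of the case is used).
dataset = [
    {
        "identifier": "basic-summary",
        "inputs": {"topic": "vector databases"},
        "expected_score": 7,
    },
]

runner = ExperimentRunner(dataset=dataset)
# `my_crew` is assumed to be a Crew defined elsewhere; passing agents=[...] also works.
experiment_results = runner.run(crew=my_crew, print_summary=True)
experiment_results.compare_with_baseline("experiment_baseline.json", print_summary=True)
```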
src/crewai/experimental/evaluation/metrics/__init__.py (new file, 26 lines)
@@ -0,0 +1,26 @@
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator
)

from crewai.experimental.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)

from crewai.experimental.evaluation.metrics.goal_metrics import (
    GoalAlignmentEvaluator
)

from crewai.experimental.evaluation.metrics.semantic_quality_metrics import (
    SemanticQualityEvaluator
)

__all__ = [
    "ReasoningEfficiencyEvaluator",
    "ToolSelectionEvaluator",
    "ParameterExtractionEvaluator",
    "ToolInvocationEvaluator",
    "GoalAlignmentEvaluator",
    "SemanticQualityEvaluator"
]
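If only some metrics are needed, the evaluators exported above can be passed to `AgentEvaluator` directly instead of going through `create_default_evaluator`. A sketch, assuming `agents` is a `list[Agent]` defined elsewhere:

```python
from crewai.experimental.evaluation import AgentEvaluator
from crewai.experimental.evaluation.metrics import (
    GoalAlignmentEvaluator,
    ToolSelectionEvaluator,
)

# Evaluate only goal alignment and tool selection (llm=None mirrors the default wiring above).
evaluator = AgentEvaluator(
    agents=agents,
    evaluators=[GoalAlignmentEvaluator(llm=None), ToolSelectionEvaluator(llm=None)],
)
```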
@@ -3,8 +3,8 @@ from typing import Any, Dict
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
|
||||
from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
|
||||
from crewai.evaluation.json_parser import extract_json_from_llm_response
|
||||
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
|
||||
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
|
||||
|
||||
class GoalAlignmentEvaluator(BaseEvaluator):
|
||||
@property
|
||||
@@ -14,10 +14,14 @@ class GoalAlignmentEvaluator(BaseEvaluator):
|
||||
def evaluate(
|
||||
self,
|
||||
agent: Agent,
|
||||
task: Task,
|
||||
execution_trace: Dict[str, Any],
|
||||
final_output: Any,
|
||||
task: Task | None = None,
|
||||
) -> EvaluationScore:
|
||||
task_context = ""
|
||||
if task is not None:
|
||||
task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"
|
||||
|
||||
prompt = [
|
||||
{"role": "system", "content": """You are an expert evaluator assessing how well an AI agent's output aligns with its assigned task goal.
|
||||
|
||||
@@ -37,8 +41,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
|
||||
{"role": "user", "content": f"""
|
||||
Agent role: {agent.role}
|
||||
Agent goal: {agent.goal}
|
||||
Task description: {task.description}
|
||||
Expected output: {task.expected_output}
|
||||
{task_context}
|
||||
|
||||
Agent's final output:
|
||||
{final_output}
|
||||
@@ -16,8 +16,8 @@ from collections.abc import Sequence
from crewai.agent import Agent
from crewai.task import Task

from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.evaluation.json_parser import extract_json_from_llm_response
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.tasks.task_output import TaskOutput

class ReasoningPatternType(Enum):
@@ -36,10 +36,14 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: TaskOutput,
        final_output: TaskOutput | str,
        task: Task | None = None,
    ) -> EvaluationScore:
        task_context = ""
        if task is not None:
            task_context = f"Task description: {task.description}\nExpected output: {task.expected_output}\n"

        llm_calls = execution_trace.get("llm_calls", [])

        if not llm_calls or len(llm_calls) < 2:
@@ -83,6 +87,8 @@ class ReasoningEfficiencyEvaluator(BaseEvaluator):

        call_samples = self._get_call_samples(llm_calls)

        final_output = final_output.raw if isinstance(final_output, TaskOutput) else final_output

        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the reasoning efficiency of an AI agent's thought process.

@@ -117,7 +123,7 @@ Return your evaluation as JSON with the following structure:
            }"""},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}
{task_context}

Reasoning efficiency metrics:
- Total LLM calls: {efficiency_metrics["total_llm_calls"]}
@@ -130,7 +136,7 @@ Sample of agent reasoning flow (chronological sequence):
{call_samples}

Agent's final output:
{final_output.raw[:500]}... (truncated)
{final_output[:500]}... (truncated)

Evaluate the reasoning efficiency of this agent based on these interaction patterns.
Identify any inefficient reasoning patterns and provide specific suggestions for optimization.
@@ -3,8 +3,8 @@ from typing import Any, Dict
from crewai.agent import Agent
from crewai.task import Task

from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.evaluation.json_parser import extract_json_from_llm_response
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response

class SemanticQualityEvaluator(BaseEvaluator):
    @property
@@ -14,10 +14,13 @@ class SemanticQualityEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: Any,
        task: Task | None = None,
    ) -> EvaluationScore:
        task_context = ""
        if task is not None:
            task_context = f"Task description: {task.description}"
        prompt = [
            {"role": "system", "content": """You are an expert evaluator assessing the semantic quality of an AI agent's output.

@@ -37,7 +40,7 @@ Return your evaluation as JSON with fields 'score' (number) and 'feedback' (stri
            """},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}
{task_context}

Agent's final output:
{final_output}
@@ -1,8 +1,8 @@
import json
from typing import Dict, Any

from crewai.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.evaluation.json_parser import extract_json_from_llm_response
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator, EvaluationScore, MetricCategory
from crewai.experimental.evaluation.json_parser import extract_json_from_llm_response
from crewai.agent import Agent
from crewai.task import Task

@@ -16,10 +16,14 @@ class ToolSelectionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
        task: Task | None = None,
    ) -> EvaluationScore:
        task_context = ""
        if task is not None:
            task_context = f"Task description: {task.description}"

        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)
        unique_tool_types = set([tool.get("tool", "Unknown tool") for tool in tool_uses])
@@ -72,7 +76,7 @@ Return your evaluation as JSON with these fields:
            """},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}
{task_context}

Available tools for this agent:
{available_tools_info}
@@ -128,10 +132,13 @@ class ParameterExtractionEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
        task: Task | None = None,
    ) -> EvaluationScore:
        task_context = ""
        if task is not None:
            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_count = len(tool_uses)

@@ -212,7 +219,7 @@ Return your evaluation as JSON with these fields:
            """},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}
{task_context}

Parameter extraction examples:
{param_samples_text}
@@ -267,10 +274,13 @@ class ToolInvocationEvaluator(BaseEvaluator):
    def evaluate(
        self,
        agent: Agent,
        task: Task,
        execution_trace: Dict[str, Any],
        final_output: str,
        task: Task | None = None,
    ) -> EvaluationScore:
        task_context = ""
        if task is not None:
            task_context = f"Task description: {task.description}"
        tool_uses = execution_trace.get("tool_uses", [])
        tool_errors = []
        tool_count = len(tool_uses)
@@ -352,7 +362,7 @@ Return your evaluation as JSON with these fields:
            """},
            {"role": "user", "content": f"""
Agent role: {agent.role}
Task description: {task.description}
{task_context}

Tool invocation examples:
{invocation_samples_text}
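All of the evaluators above read execution_trace defensively through .get(), so the trace is a plain dict. A minimal sketch of the shape this diff relies on; only the llm_calls and tool_uses keys and the per-tool "tool" field are confirmed here, the other fields are illustrative assumptions:

execution_trace = {
    # Read by ReasoningEfficiencyEvaluator; it returns early with fewer than 2 calls.
    "llm_calls": [
        {"response": "Thought: I should search for the latest quarterly results."},  # entry shape assumed
        {"response": "Thought: I now have enough data to draft the summary."},       # entry shape assumed
    ],
    # Read by ToolSelectionEvaluator / ParameterExtractionEvaluator / ToolInvocationEvaluator;
    # only the "tool" key is referenced in the hunks shown above.
    "tool_uses": [
        {"tool": "Search the internet", "args": {"query": "AAPL Q1 2025 earnings"}},  # "args" assumed
    ],
}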
71
src/crewai/experimental/evaluation/testing.py
Normal file
@@ -0,0 +1,71 @@
import inspect
from pathlib import Path

from typing_extensions import Any
import warnings
from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
from crewai import Crew, Agent

def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
    failed_tests = [result for result in experiment_results.results if not result.passed]

    if failed_tests:
        detailed_failures: list[str] = []

        for result in failed_tests:
            expected = result.expected_score
            actual = result.score
            detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}")

        failure_details = "\n".join(detailed_failures)
        raise AssertionError(f"The following test cases failed:\n{failure_details}")

    baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
    comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath)
    assert_experiment_no_regression(comparison)

def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
    regressed = comparison_result.get("regressed", [])
    if regressed:
        raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")

    missing_tests = comparison_result.get("missing_tests", [])
    if missing_tests:
        warnings.warn(
            f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
            UserWarning
        )

def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agents: list[Agent] | None = None, verbose: bool = False) -> ExperimentResults:
    runner = ExperimentRunner(dataset=dataset)

    return runner.run(agents=agents, crew=crew, print_summary=verbose)

def _get_baseline_filepath_fallback() -> str:
    filename = "experiment_fallback.json"
    calling_file = None

    try:
        current_frame = inspect.currentframe()
        if current_frame is not None:
            test_func_name = current_frame.f_back.f_back.f_code.co_name  # type: ignore[union-attr]
            filename = f"{test_func_name}.json"
            calling_file = current_frame.f_back.f_back.f_code.co_filename  # type: ignore[union-attr]
    except Exception:
        return filename

    if not calling_file:
        return filename

    calling_path = Path(calling_file)
    try:
        baseline_dir_parts = calling_path.parts[:-1]
        baseline_dir = Path(*baseline_dir_parts) / "results"
        baseline_dir.mkdir(parents=True, exist_ok=True)
        baseline_filepath = baseline_dir / filename
        return str(baseline_filepath)

    except (ValueError, IndexError):
        pass

    return filename
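Put together, a regression test built on these helpers follows the pattern of the tests/regression files later in this diff; a condensed sketch (the my_crew fixture is an assumed stand-in for a real crew):

from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
)

def test_my_crew_regression(my_crew):  # my_crew: an assumed pytest fixture returning a Crew
    dataset = [
        {"inputs": {"company": "Apple Inc. (AAPL)"}, "expected_score": {"goal_alignment": 8}},
        {"identifier": "test_2", "inputs": {"company": "Microsoft Corporation (MSFT)"}, "expected_score": 8},
    ]

    results = run_experiment(dataset=dataset, crew=my_crew, verbose=True)

    # Fails on any failed case, then checks for regressions against the baseline JSON
    # resolved next to the test file (results/<test_function_name>.json by default).
    assert_experiment_successfully(results)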
@@ -305,6 +305,7 @@ class LiteAgent(FlowTrackable, BaseModel):
        """
        # Create agent info for event emission
        agent_info = {
            "id": self.id,
            "role": self.role,
            "goal": self.goal,
            "backstory": self.backstory,
@@ -67,6 +67,7 @@ class Task(BaseModel):
        description: Descriptive text detailing task's purpose and execution.
        expected_output: Clear definition of expected task outcome.
        output_file: File path for storing task output.
        create_directory: Whether to create the directory for output_file if it doesn't exist.
        output_json: Pydantic model for structuring JSON output.
        output_pydantic: Pydantic model for task output.
        security_config: Security configuration including fingerprinting.
@@ -115,6 +116,10 @@ class Task(BaseModel):
        description="A file path to be used to create a file output.",
        default=None,
    )
    create_directory: Optional[bool] = Field(
        description="Whether to create the directory for output_file if it doesn't exist.",
        default=True,
    )
    output: Optional[TaskOutput] = Field(
        description="Task output, it's final result after being executed", default=None
    )
@@ -753,8 +758,10 @@ Follow these guidelines:
        resolved_path = Path(self.output_file).expanduser().resolve()
        directory = resolved_path.parent

        if not directory.exists():
        if self.create_directory and not directory.exists():
            directory.mkdir(parents=True, exist_ok=True)
        elif not self.create_directory and not directory.exists():
            raise RuntimeError(f"Directory {directory} does not exist and create_directory is False")

        with resolved_path.open("w", encoding="utf-8") as file:
            if isinstance(result, dict):
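The new create_directory flag keeps the old behaviour by default; a minimal sketch of opting out, assuming the parent directory may be missing at save time:

from crewai import Agent, Task

writer = Agent(role="Writer", goal="Write short summaries", backstory="A test agent")

# Default (create_directory=True): missing parents of output_file are created automatically.
report_task = Task(
    description="Summarize the findings",
    expected_output="A short markdown summary",
    agent=writer,
    output_file="reports/2025/summary.md",
)

# Opt out: if reports/2025 does not exist when the output is saved,
# a RuntimeError is raised instead of creating the directory.
strict_task = Task(
    description="Summarize the findings",
    expected_output="A short markdown summary",
    agent=writer,
    output_file="reports/2025/summary.md",
    create_directory=False,
)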
@@ -4,6 +4,7 @@ from .agent_events import (
    AgentExecutionCompletedEvent,
    AgentExecutionErrorEvent,
    AgentExecutionStartedEvent,
    LiteAgentExecutionCompletedEvent,
)
from .crew_events import (
    CrewKickoffCompletedEvent,
@@ -80,6 +81,7 @@ EventTypes = Union[
    CrewTrainFailedEvent,
    AgentExecutionStartedEvent,
    AgentExecutionCompletedEvent,
    LiteAgentExecutionCompletedEvent,
    TaskStartedEvent,
    TaskCompletedEvent,
    TaskFailedEvent,
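With LiteAgentExecutionCompletedEvent exported and added to EventTypes, listeners can react to lite-agent completions as well; a minimal sketch, assuming the events package lives at crewai.utilities.events and exposes the usual crewai_event_bus.on registration decorator:

from crewai.utilities.events import crewai_event_bus  # assumed package path
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent

@crewai_event_bus.on(LiteAgentExecutionCompletedEvent)
def on_lite_agent_completed(source, event):
    # The event payload is not shown in this diff, so only generic access is sketched here.
    print("Lite agent finished:", getattr(event, "output", None))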
237
tests/cassettes/TestAgentEvaluator.test_eval_lite_agent.yaml
Normal file
@@ -0,0 +1,237 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
|
||||
created for testing purposes\nYour personal goal is: Complete test tasks successfully\n\nTo
|
||||
give my best complete final answer to the task respond using the exact following
|
||||
format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
|
||||
answer must be the great and the most complete as possible, it must be outcome
|
||||
described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
|
||||
"content": "Complete this task successfully"}], "model": "gpt-4o-mini", "stop":
|
||||
["\nObservation:"]}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '583'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//jFNNb9swDL3nVxA6J0U+HKTNbd0woMAOw7Bu6LbCUCXa1iqLgkgnzYr8
|
||||
98FKWqdbB+wiQHx81OMj9TgCUM6qNSjTaDFt9JNL+TZ7N/dfrusPN01NyV6vPk3f/mrl5vLrXI17
|
||||
Bt39RCNPrDNDbfQojsIBNgm1YF91tlrOl+fzxXKWgZYs+p5WR5kUNGldcJP5dF5MpqvJ7PzIbsgZ
|
||||
ZLWG7yMAgMd89jqDxQe1hun4KdIis65RrZ+TAFQi30eUZnYsOogaD6ChIBiy9M8NdXUja7iCQFsw
|
||||
OkDtNgga6l4/6MBbTAA/wnsXtIc3+b6Gjx41I8REG2cRWoStkwakQeCIxlXOgEXRzjNQgvzigwBV
|
||||
OUU038OOOgiIFhr0MdPHoIOFK9g67wEDdwlBCI7OIjgB7oxB5qrzfpeznxRokIZS3wwk5EiB8ey0
|
||||
54RVx7r3PXTenwA6BBLdzy27fXtE9s/+eqpjojv+g6oqFxw3ZULNFHovWSiqjO5HALd5jt2L0aiY
|
||||
qI1SCt1jfu7i4lBODdszgEVxBIVE+yE+KxbjV8qVR79PFkEZbRq0A3XYGt1ZRyfA6KTpv9W8VvvQ
|
||||
uAv1/5QfAGMwCtoyJrTOvOx4SEvYf65/pT2bnAUrxrRxBktxmPpBWKx05w8rr3jHgm1ZuVBjiskd
|
||||
9r6K5aLQy0LjxcKo0X70GwAA//8DAMz2wVUFBAAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f93ea9af627e0b-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Tue, 15 Jul 2025 12:25:54 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
|
||||
path=/; expires=Tue, 15-Jul-25 12:55:54 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '4047'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '4440'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999885'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_5704c0f206a927ddc12aa1a19b612a75
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
|
||||
assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
|
||||
the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
|
||||
agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
|
||||
attempted the task but missed key requirements\n- 10: Perfect alignment, agent
|
||||
fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
|
||||
interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
|
||||
Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
|
||||
all requested information or deliverables?\n\nReturn your evaluation as JSON
|
||||
with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
|
||||
"content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\n\n\nAgent''s
|
||||
final output:\nPlease provide me with the specific details or context of the
|
||||
task you need help with, and I will ensure to complete it successfully and provide
|
||||
a thorough response.\n\nEvaluate how well the agent''s output aligns with the
|
||||
assigned task goal.\n"}], "model": "gpt-4o-mini", "stop": []}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1196'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- __cf_bm=GRZmZLrjW5ZRHNmUJa4ccrMcy20D1rmeqK6Ptlv0mRY-1752582354-1.0.1.1-xKd_yga48Eedech5TRlThlEpDgsB2whxkWHlCyAGOVMqMcvH1Ju9FdXYbuQ9NdUQcVxPLgiGM35lYhqSLVQiXDyK01dnyp2Gvm560FBN9DY;
|
||||
_cfuvid=MYqswpSR7sqr4kGp6qZVkaL7HDYwMiww49PeN9QBP.A-1752582354973-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAA4xUy27bQAy8+yuIPdtGbMdN4FvbSxM0QIsEKNA6MJhdSmK82hWWVFwj8L8XKz/k
|
||||
9AH0ogOHnOFjVq8DAMPOLMDYCtXWjR990O+TT7dfZs/v5OtFy/ef7++mxfu7j83t/cONGeaK+PRM
|
||||
Vo9VYxvrxpNyDHvYJkKlzDq5mk/n19PZfN4BdXTkc1nZ6OgyjmoOPJpeTC9HF1ejyfWhuopsScwC
|
||||
fgwAAF67b+4zOPppFnAxPEZqEsGSzOKUBGBS9DliUIRFMagZ9qCNQSl0rb8uA8DSiI2JlmYB0+E+
|
||||
UBC5J7TrHFuah4oASwoKjh2EqOCojkE0oRIgWE+YoA2OUhZzHEqIBWhFoChrKCP6IWwqthWwgEY4
|
||||
bItASbRLEpDWWhIpWu+3Y7gJooRuCKyAsiYHRUxQx0TgSJG9DIGDY4ua5RA82nVW5cDKqPxCWYhC
|
||||
iSXBhrU69TOGbxV7ysxSxY0Awoa951AGkq69/do67QLZk8vBJsUXdgQYtoBWW/SQSJoYpFPq2Ptp
|
||||
MLjTttC51DFXVIPjRFb9drw0y7A7v0uiohXM3git92cAhhAVs7c6RzwekN3JAz6WTYpP8lupKTiw
|
||||
VKtEKDHke4vGxnTobgDw2HmtfWMf06RYN7rSuKZObjo7eM30Fu/R6yOoUdH38dnkCLzhWx1ud+ZW
|
||||
Y9FW5PrS3trYOo5nwOBs6j+7+Rv3fnIO5f/Q94C11Ci5VZPIsX07cZ+WKP8B/pV22nLXsBFKL2xp
|
||||
pUwpX8JRga3fv0sjW1GqVwWHklKTuHuc+ZKD3eAXAAAA//8DADksFsafBAAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f93ec73a1c7e0b-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Tue, 15 Jul 2025 12:25:57 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '1544'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '1546'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999732'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_44930ba12ad8d1e3f0beed1d5e3d8b0c
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
File diff suppressed because one or more lines are too long
@@ -427,4 +427,140 @@ interactions:
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
- request:
|
||||
body: '{"messages": [{"role": "system", "content": "You are an expert evaluator
|
||||
assessing how well an AI agent''s output aligns with its assigned task goal.\n\nScore
|
||||
the agent''s goal alignment on a scale from 0-10 where:\n- 0: Complete misalignment,
|
||||
agent did not understand or attempt the task goal\n- 5: Partial alignment, agent
|
||||
attempted the task but missed key requirements\n- 10: Perfect alignment, agent
|
||||
fully satisfied all task requirements\n\nConsider:\n1. Did the agent correctly
|
||||
interpret the task goal?\n2. Did the final output directly address the requirements?\n3.
|
||||
Did the agent focus on relevant aspects of the task?\n4. Did the agent provide
|
||||
all requested information or deliverables?\n\nReturn your evaluation as JSON
|
||||
with fields ''score'' (number) and ''feedback'' (string).\n"}, {"role": "user",
|
||||
"content": "\nAgent role: Test Agent\nAgent goal: Complete test tasks successfully\nTask
|
||||
description: Test task description\nExpected output: Expected test output\n\nAgent''s
|
||||
final output:\nThe expected test output is a comprehensive document that outlines
|
||||
the specific parameters and criteria that define success for the task at hand.
|
||||
It should include detailed descriptions of the tasks, the goals that need to
|
||||
be achieved, and any specific formatting or structural requirements necessary
|
||||
for the output. Each component of the task must be analyzed and addressed, providing
|
||||
context as well as examples where applicable. Additionally, any tools or methodologies
|
||||
that are relevant to executing the tasks successfully should be outlined, including
|
||||
any potential risks or challenges that may arise during the process. This document
|
||||
serves as a guiding framework to ensure that all aspects of the task are thoroughly
|
||||
considered and executed to meet the high standards expected.\n\nEvaluate how
|
||||
well the agent''s output aligns with the assigned task goal.\n"}], "model":
|
||||
"gpt-4o-mini", "stop": []}'
|
||||
headers:
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- gzip, deflate, zstd
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '1893'
|
||||
content-type:
|
||||
- application/json
|
||||
cookie:
|
||||
- _cfuvid=XwsgBfgvDGlKFQ4LiGYGIARIoSNTiwidqoo9UZcc.XY-1752087999227-0.0.1.1-604800000
|
||||
host:
|
||||
- api.openai.com
|
||||
user-agent:
|
||||
- OpenAI/Python 1.93.0
|
||||
x-stainless-arch:
|
||||
- arm64
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- MacOS
|
||||
x-stainless-package-version:
|
||||
- 1.93.0
|
||||
x-stainless-raw-response:
|
||||
- 'true'
|
||||
x-stainless-read-timeout:
|
||||
- '600.0'
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.11.12
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: !!binary |
|
||||
H4sIAAAAAAAAAwAAAP//jFRNbxs5DL37VxA6jwPHddrUxxwWi2BRtEAPRevCYCSOh41GUkWOnTTI
|
||||
fy8kf4zT5rCXOfCRT4+P5DxNAAw7swRjO1TbJz+90dvFxy//vX0za7dfr29+3eo/n75++Mh0O/za
|
||||
maZUxLsfZPVYdWFjnzwpx7CHbSZUKqyX767mV/PL2eKqAn105EvZJul0Eac9B57OZ/PFdPZuenl9
|
||||
qO4iWxKzhG8TAICn+i06g6MHs4RZc4z0JIIbMstTEoDJ0ZeIQREWxaCmGUEbg1Ko0p9WAWBlxMZM
|
||||
K7OEq2YfaIncHdr7EluZzx0BbigopBy37MgBgiNF9uTAkdjMqbQOsYVdhwraEdBDIqvkIA6aBgXp
|
||||
4uAdcLB+cNTArmPbAQfHFpUEJPYEQ3CUi2LHYVPoCpOi3EOmnwNn6imoXMC/cUdbyk3FWw7oj8+4
|
||||
SAIhKkgiyy1b9P4RHHneUn4pTEn0WIYC6YDX5866aqDH+yKHFRJm5cqInjeB3AWM7vQsUgzhTFb9
|
||||
48GtUlloSwMkZ4bEDMetOaSg1QH9XldVwSrk2wY4iBLWSs/hmG47zGiVMouylZP7WHkzdRSEtwQu
|
||||
2qH4dhyBjcWKHWsXhzJTEgpVAwagByySirgzRSfLDrtzsTKr8Hy+VJnaQbAsdhi8PwMwhKhYfKzr
|
||||
/P2APJ8W2MdNyvFO/ig1LQeWbp0JJYayrKIxmYo+TwC+10MZXuy+STn2Sdca76k+92ax2POZ8T5H
|
||||
9P31AdSo6Mf4YjFvXuFb71dezk7NWLQdubF0vEscHMczYHLW9d9qXuPed85h83/oR8BaSkpunTI5
|
||||
ti87HtMy/agTfT3t5HIVbITyli2tlSmXSThqcfD7n4qRR1Hq1y2HDeWUuf5ZyiQnz5PfAAAA//8D
|
||||
AEfUP8BcBQAA
|
||||
headers:
|
||||
CF-RAY:
|
||||
- 95f365f1bfc87ded-GRU
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Encoding:
|
||||
- gzip
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Mon, 14 Jul 2025 19:24:07 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- __cf_bm=PcC3_3T8.MK_WpZlQLdZfwpNv9Pe45AIYmrXOSgJ65E-1752521047-1.0.1.1-eyqwSWfQC7ZV6.JwTsTihK1ZWCrEmxd52CtNcfe.fw1UjjBN9rdTU4G7hRZiNqHQYo4sVZMmgRgqM9k7HRSzN2zln0bKmMiOuSQTZh6xF_I;
|
||||
path=/; expires=Mon, 14-Jul-25 19:54:07 GMT; domain=.api.openai.com; HttpOnly;
|
||||
Secure; SameSite=None
|
||||
- _cfuvid=JvQ1c4qYZefNwOPoVNgAtX8ET7ObU.JKDvGc43LOR6g-1752521047741-0.0.1.1-604800000;
|
||||
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- nosniff
|
||||
access-control-expose-headers:
|
||||
- X-Request-ID
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- crewai-iuxna1
|
||||
openai-processing-ms:
|
||||
- '2729'
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
strict-transport-security:
|
||||
- max-age=31536000; includeSubDomains; preload
|
||||
x-envoy-upstream-service-time:
|
||||
- '2789'
|
||||
x-ratelimit-limit-requests:
|
||||
- '30000'
|
||||
x-ratelimit-limit-tokens:
|
||||
- '150000000'
|
||||
x-ratelimit-remaining-requests:
|
||||
- '29999'
|
||||
x-ratelimit-remaining-tokens:
|
||||
- '149999559'
|
||||
x-ratelimit-reset-requests:
|
||||
- 2ms
|
||||
x-ratelimit-reset-tokens:
|
||||
- 0s
|
||||
x-request-id:
|
||||
- req_74f6e8ff49db25dbea3d3525cc149e8e
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
|
||||
@@ -1,95 +0,0 @@
|
||||
import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.crew import Crew
|
||||
from crewai.evaluation.agent_evaluator import AgentEvaluator
|
||||
from crewai.evaluation.base_evaluator import AgentEvaluationResult
|
||||
from crewai.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
)
|
||||
|
||||
from crewai.evaluation import create_default_evaluator
|
||||
class TestAgentEvaluator:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
agent = Agent(
|
||||
role="Test Agent",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
allow_delegation=False,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output"
|
||||
)
|
||||
|
||||
crew = Crew(
|
||||
agents=[agent],
|
||||
tasks=[task]
|
||||
)
|
||||
return crew
|
||||
|
||||
def test_set_iteration(self):
|
||||
agent_evaluator = AgentEvaluator()
|
||||
|
||||
agent_evaluator.set_iteration(3)
|
||||
assert agent_evaluator.iteration == 3
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_evaluate_current_iteration(self, mock_crew):
|
||||
agent_evaluator = AgentEvaluator(crew=mock_crew, evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.evaluate_current_iteration()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
agent, = mock_crew.agents
|
||||
task, = mock_crew.tasks
|
||||
|
||||
assert len(mock_crew.agents) == 1
|
||||
assert agent.role in results
|
||||
assert len(results[agent.role]) == 1
|
||||
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
def test_create_default_evaluator(self, mock_crew):
|
||||
agent_evaluator = create_default_evaluator(crew=mock_crew)
|
||||
assert isinstance(agent_evaluator, AgentEvaluator)
|
||||
assert agent_evaluator.crew == mock_crew
|
||||
|
||||
expected_types = [
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
]
|
||||
|
||||
assert len(agent_evaluator.evaluators) == len(expected_types)
|
||||
for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
|
||||
assert isinstance(evaluator, expected_type)
|
||||
@@ -1,8 +1,8 @@
from unittest.mock import patch, MagicMock
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.metrics.goal_metrics import GoalAlignmentEvaluator
from crewai.utilities.llm_utils import LLM


@@ -3,12 +3,12 @@ from unittest.mock import patch, MagicMock
from typing import List, Dict, Any

from crewai.tasks.task_output import TaskOutput
from crewai.evaluation.metrics.reasoning_metrics import (
from crewai.experimental.evaluation.metrics.reasoning_metrics import (
    ReasoningEfficiencyEvaluator,
)
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM
from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.base_evaluator import EvaluationScore

class TestReasoningEfficiencyEvaluator(BaseEvaluationMetricsTest):
    @pytest.fixture
@@ -1,8 +1,8 @@
from unittest.mock import patch, MagicMock

from crewai.evaluation.base_evaluator import EvaluationScore
from crewai.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.experimental.evaluation.base_evaluator import EvaluationScore
from crewai.experimental.evaluation.metrics.semantic_quality_metrics import SemanticQualityEvaluator
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from crewai.utilities.llm_utils import LLM

class TestSemanticQualityEvaluator(BaseEvaluationMetricsTest):
@@ -1,12 +1,12 @@
from unittest.mock import patch, MagicMock

from crewai.evaluation.metrics.tools_metrics import (
from crewai.experimental.evaluation.metrics.tools_metrics import (
    ToolSelectionEvaluator,
    ParameterExtractionEvaluator,
    ToolInvocationEvaluator
)
from crewai.utilities.llm_utils import LLM
from tests.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest
from tests.experimental.evaluation.metrics.base_evaluation_metrics_test import BaseEvaluationMetricsTest

class TestToolSelectionEvaluator(BaseEvaluationMetricsTest):
    def test_no_tools_available(self, mock_task, mock_agent):
164
tests/experimental/evaluation/test_agent_evaluator.py
Normal file
@@ -0,0 +1,164 @@
|
||||
import pytest
|
||||
|
||||
from crewai.agent import Agent
|
||||
from crewai.task import Task
|
||||
from crewai.crew import Crew
|
||||
from crewai.experimental.evaluation.agent_evaluator import AgentEvaluator
|
||||
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult
|
||||
from crewai.experimental.evaluation import (
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
)
|
||||
|
||||
from crewai.experimental.evaluation import create_default_evaluator
|
||||
|
||||
class TestAgentEvaluator:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
agent = Agent(
|
||||
role="Test Agent",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
allow_delegation=False,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output"
|
||||
)
|
||||
|
||||
crew = Crew(
|
||||
agents=[agent],
|
||||
tasks=[task]
|
||||
)
|
||||
return crew
|
||||
|
||||
def test_set_iteration(self):
|
||||
agent_evaluator = AgentEvaluator(agents=[])
|
||||
|
||||
agent_evaluator.set_iteration(3)
|
||||
assert agent_evaluator._execution_state.iteration == 3
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_evaluate_current_iteration(self, mock_crew):
|
||||
agent_evaluator = AgentEvaluator(agents=mock_crew.agents, evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
agent, = mock_crew.agents
|
||||
task, = mock_crew.tasks
|
||||
|
||||
assert len(mock_crew.agents) == 1
|
||||
assert agent.role in results
|
||||
assert len(results[agent.role]) == 1
|
||||
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent's output demonstrates an understanding of the need for a comprehensive document outlining task"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
|
||||
def test_create_default_evaluator(self, mock_crew):
|
||||
agent_evaluator = create_default_evaluator(agents=mock_crew.agents)
|
||||
assert isinstance(agent_evaluator, AgentEvaluator)
|
||||
assert agent_evaluator.agents == mock_crew.agents
|
||||
|
||||
expected_types = [
|
||||
GoalAlignmentEvaluator,
|
||||
SemanticQualityEvaluator,
|
||||
ToolSelectionEvaluator,
|
||||
ParameterExtractionEvaluator,
|
||||
ToolInvocationEvaluator,
|
||||
ReasoningEfficiencyEvaluator
|
||||
]
|
||||
|
||||
assert len(agent_evaluator.evaluators) == len(expected_types)
|
||||
for evaluator, expected_type in zip(agent_evaluator.evaluators, expected_types):
|
||||
assert isinstance(evaluator, expected_type)
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_eval_lite_agent(self):
|
||||
agent = Agent(
|
||||
role="Test Agent",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
agent.kickoff(messages="Complete this task successfully")
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == "lite_task"
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 2.0
|
||||
|
||||
expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 2' in goal_alignment.raw_response
|
||||
|
||||
@pytest.mark.vcr(filter_headers=["authorization"])
|
||||
def test_eval_specific_agents_from_crew(self, mock_crew):
|
||||
agent = Agent(
|
||||
role="Test Agent Eval",
|
||||
goal="Complete test tasks successfully",
|
||||
backstory="An agent created for testing purposes",
|
||||
)
|
||||
task = Task(
|
||||
description="Test task description",
|
||||
agent=agent,
|
||||
expected_output="Expected test output"
|
||||
)
|
||||
mock_crew.agents.append(agent)
|
||||
mock_crew.tasks.append(task)
|
||||
|
||||
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
|
||||
|
||||
mock_crew.kickoff()
|
||||
|
||||
results = agent_evaluator.get_evaluation_results()
|
||||
|
||||
assert isinstance(results, dict)
|
||||
assert len(results.keys()) == 1
|
||||
result, = results[agent.role]
|
||||
assert isinstance(result, AgentEvaluationResult)
|
||||
|
||||
assert result.agent_id == str(agent.id)
|
||||
assert result.task_id == str(task.id)
|
||||
|
||||
goal_alignment, = result.metrics.values()
|
||||
assert goal_alignment.score == 5.0
|
||||
|
||||
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
|
||||
assert expected_feedback in goal_alignment.feedback
|
||||
|
||||
assert goal_alignment.raw_response is not None
|
||||
assert '"score": 5' in goal_alignment.raw_response
|
||||
111
tests/experimental/evaluation/test_experiment_result.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from crewai.experimental.evaluation.experiment.result import ExperimentResult, ExperimentResults
|
||||
|
||||
|
||||
class TestExperimentResult:
|
||||
@pytest.fixture
|
||||
def mock_results(self):
|
||||
return [
|
||||
ExperimentResult(
|
||||
identifier="test-1",
|
||||
inputs={"query": "What is the capital of France?"},
|
||||
score=10,
|
||||
expected_score=7,
|
||||
passed=True
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-2",
|
||||
inputs={"query": "Who wrote Hamlet?"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-3",
|
||||
inputs={"query": "Any query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=False,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-4",
|
||||
inputs={"query": "Another query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
),
|
||||
ExperimentResult(
|
||||
identifier="test-6",
|
||||
inputs={"query": "Yet another query"},
|
||||
score={"relevance": 9, "factuality": 8},
|
||||
expected_score={"relevance": 7, "factuality": 7},
|
||||
passed=True,
|
||||
agent_evaluations={"agent1": {"metrics": {"goal_alignment": {"score": 9}}}}
|
||||
)
|
||||
]
|
||||
|
||||
@patch('os.path.exists', return_value=True)
|
||||
@patch('os.path.getsize', return_value=1)
|
||||
@patch('json.load')
|
||||
@patch('builtins.open', new_callable=MagicMock)
|
||||
def test_experiment_results_compare_with_baseline(self, mock_open, mock_json_load, mock_path_getsize, mock_path_exists, mock_results):
|
||||
baseline_data = {
|
||||
"timestamp": "2023-01-01T00:00:00+00:00",
|
||||
"results": [
|
||||
{
|
||||
"identifier": "test-1",
|
||||
"inputs": {"query": "What is the capital of France?"},
|
||||
"score": 7,
|
||||
"expected_score": 7,
|
||||
"passed": False
|
||||
},
|
||||
{
|
||||
"identifier": "test-2",
|
||||
"inputs": {"query": "Who wrote Hamlet?"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-3",
|
||||
"inputs": {"query": "Any query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-4",
|
||||
"inputs": {"query": "Another query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
},
|
||||
{
|
||||
"identifier": "test-5",
|
||||
"inputs": {"query": "Another query"},
|
||||
"score": {"relevance": 8, "factuality": 7},
|
||||
"expected_score": {"relevance": 7, "factuality": 7},
|
||||
"passed": True
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
mock_json_load.return_value = baseline_data
|
||||
|
||||
results = ExperimentResults(results=mock_results)
|
||||
results.display = MagicMock()
|
||||
|
||||
comparison = results.compare_with_baseline(baseline_filepath="baseline.json")
|
||||
|
||||
assert "baseline_timestamp" in comparison
|
||||
assert comparison["baseline_timestamp"] == "2023-01-01T00:00:00+00:00"
|
||||
assert comparison["improved"] == ["test-1"]
|
||||
assert comparison["regressed"] == ["test-3"]
|
||||
assert comparison["unchanged"] == ["test-2", "test-4"]
|
||||
assert comparison["new_tests"] == ["test-6"]
|
||||
assert comparison["missing_tests"] == ["test-5"]
|
||||
197
tests/experimental/evaluation/test_experiment_runner.py
Normal file
@@ -0,0 +1,197 @@
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from crewai.crew import Crew
|
||||
from crewai.experimental.evaluation.experiment.runner import ExperimentRunner
|
||||
from crewai.experimental.evaluation.experiment.result import ExperimentResults
|
||||
from crewai.experimental.evaluation.evaluation_display import AgentAggregatedEvaluationResult
|
||||
from crewai.experimental.evaluation.base_evaluator import MetricCategory, EvaluationScore
|
||||
|
||||
|
||||
class TestExperimentRunner:
|
||||
@pytest.fixture
|
||||
def mock_crew(self):
|
||||
return MagicMock(llm=Crew)
|
||||
|
||||
@pytest.fixture
|
||||
def mock_evaluator_results(self):
|
||||
agent_evaluation = AgentAggregatedEvaluationResult(
|
||||
agent_id="Test Agent",
|
||||
agent_role="Test Agent Role",
|
||||
metrics={
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=9,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
),
|
||||
MetricCategory.REASONING_EFFICIENCY: EvaluationScore(
|
||||
score=None,
|
||||
feedback="Reasoning efficiency not applicable",
|
||||
raw_response="Reasoning efficiency not applicable"
|
||||
),
|
||||
MetricCategory.PARAMETER_EXTRACTION: EvaluationScore(
|
||||
score=7,
|
||||
feedback="Test parameter extraction explanation",
|
||||
raw_response="Test raw output"
|
||||
),
|
||||
MetricCategory.TOOL_SELECTION: EvaluationScore(
|
||||
score=8,
|
||||
feedback="Test tool selection explanation",
|
||||
raw_response="Test raw output"
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return {"Test Agent": agent_evaluation}
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-1",
|
||||
"inputs": {"query": "Test query 1"},
|
||||
"expected_score": 8
|
||||
},
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7}
|
||||
},
|
||||
{
|
||||
"inputs": {"query": "Test query 3"},
|
||||
"expected_score": {"tool_selection": 9}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
|
||||
assert isinstance(results, ExperimentResults)
|
||||
result_1, result_2, result_3 = results.results
|
||||
assert len(results.results) == 3
|
||||
|
||||
assert result_1.identifier == "test-case-1"
|
||||
assert result_1.inputs == {"query": "Test query 1"}
|
||||
assert result_1.expected_score == 8
|
||||
assert result_1.passed is True
|
||||
|
||||
assert result_2.identifier == "test-case-2"
|
||||
assert result_2.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result_2.expected_score, dict)
|
||||
assert "goal_alignment" in result_2.expected_score
|
||||
assert result_2.passed is True
|
||||
|
||||
assert result_3.identifier == "c2ed49e63aa9a83af3ca382794134fd5"
|
||||
assert result_3.inputs == {"query": "Test query 3"}
|
||||
assert isinstance(result_3.expected_score, dict)
|
||||
assert "tool_selection" in result_3.expected_score
|
||||
assert result_3.passed is False
|
||||
|
||||
assert mock_crew.kickoff.call_count == 3
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 1"})
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 2"})
|
||||
mock_crew.kickoff.assert_any_call(inputs={"query": "Test query 3"})
|
||||
|
||||
assert mock_evaluator.reset_iterations_results.call_count == 3
|
||||
assert mock_evaluator.get_agent_evaluation.call_count == 3
|
||||
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_with_unknown_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7, "unknown_metric": 8}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "goal_alignment" in result.expected_score.keys()
|
||||
assert "unknown_metric" in result.expected_score.keys()
|
||||
assert result.passed is True
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_with_single_metric_evaluator_and_expected_specific_metric(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"goal_alignment": 7}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_create_evaluator["Test Agent"].metrics = {
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=9,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
)
|
||||
}
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "goal_alignment" in result.expected_score.keys()
|
||||
assert result.passed is True
|
||||
|
||||
@patch('crewai.experimental.evaluation.experiment.runner.create_default_evaluator')
|
||||
def test_run_success_when_expected_metric_is_not_available(self, mock_create_evaluator, mock_crew, mock_evaluator_results):
|
||||
dataset = [
|
||||
{
|
||||
"identifier": "test-case-2",
|
||||
"inputs": {"query": "Test query 2"},
|
||||
"expected_score": {"unknown_metric": 7}
|
||||
}
|
||||
]
|
||||
|
||||
mock_evaluator = MagicMock()
|
||||
mock_create_evaluator["Test Agent"].metrics = {
|
||||
MetricCategory.GOAL_ALIGNMENT: EvaluationScore(
|
||||
score=5,
|
||||
feedback="Test feedback for goal alignment",
|
||||
raw_response="Test raw response for goal alignment"
|
||||
)
|
||||
}
|
||||
mock_evaluator.get_agent_evaluation.return_value = mock_evaluator_results
|
||||
mock_evaluator.reset_iterations_results = MagicMock()
|
||||
mock_create_evaluator.return_value = mock_evaluator
|
||||
|
||||
runner = ExperimentRunner(dataset=dataset)
|
||||
|
||||
results = runner.run(crew=mock_crew)
|
||||
result, = results.results
|
||||
|
||||
assert result.identifier == "test-case-2"
|
||||
assert result.inputs == {"query": "Test query 2"}
|
||||
assert isinstance(result.expected_score, dict)
|
||||
assert "unknown_metric" in result.expected_score.keys()
|
||||
assert result.passed is False
|
||||
@@ -0,0 +1,42 @@
[
  {
    "timestamp": "2025-07-15T21:34:08.253410+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "72239c22b0cdde98ad5c588074ef6325",
        "inputs": {
          "company": "Apple Inc. (AAPL)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 9.0,
          "tool_selection": 6.0,
          "parameter_extraction": 5.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 7.300000000000001
        },
        "expected_score": {
          "goal_alignment": 8
        },
        "passed": true
      },
      {
        "identifier": "test_2",
        "inputs": {
          "company": "Microsoft Corporation (MSFT)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 7.333333333333333,
          "tool_selection": 6.25,
          "parameter_extraction": 9.5,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 6.0
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
]
24
tests/regression/results/test_history_teacher.json
Normal file
@@ -0,0 +1,24 @@
[
  {
    "timestamp": "2025-07-15T21:31:05.916161+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
        "inputs": {
          "messages": "How was the Battle of Waterloo?"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 10.0,
          "tool_selection": 10.0,
          "parameter_extraction": 10.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 5.5
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
]
144
tests/regression/test_financial_analysis.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import pytest
|
||||
from crewai import Agent, Crew, Process, Task
|
||||
from crewai_tools import SerperDevTool
|
||||
|
||||
from crewai.experimental.evaluation.testing import (
|
||||
assert_experiment_successfully,
|
||||
run_experiment,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def financial_analysis_crew():
|
||||
search_tool = SerperDevTool()
|
||||
|
||||
data_researcher = Agent(
|
||||
role="Financial Data Researcher",
|
||||
goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
|
||||
backstory=(
|
||||
"You are a precision-focused financial analyst who uses multiple targeted searches "
|
||||
"to cross-verify data and ensure comprehensive coverage. You leverage different "
|
||||
"search approaches to gather financial information from various authoritative sources."
|
||||
),
|
||||
tools=[search_tool],
|
||||
)
|
||||
|
||||
financial_analyst = Agent(
|
||||
role="Financial Analyst",
|
||||
goal="Analyze financial data to assess company performance and outlook",
|
||||
backstory=(
|
||||
"You are a seasoned financial analyst with expertise in evaluating company "
|
||||
"performance through quantitative analysis. You can interpret financial statements, "
|
||||
"identify trends, and make reasoned assessments of a company's financial health."
|
||||
),
|
||||
tools=[search_tool],
|
||||
)
|
||||
|
||||
report_writer = Agent(
|
||||
role="Financial Report Writer",
|
||||
goal="Synthesize financial analysis into clear, actionable reports",
|
||||
backstory=(
|
||||
"You are an experienced financial writer who excels at turning complex financial "
|
||||
"analyses into clear, concise, and impactful reports. You know how to highlight "
|
||||
"key insights and present information in a way that's accessible to various audiences."
|
||||
),
|
||||
tools=[],
|
||||
)
|
||||
|
||||
    research_task = Task(
        description=(
            "Research {company} financial data using multiple targeted search queries:\n\n"
            "**Search Strategy - Execute these searches sequentially:**\n"
            "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
            "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
            "3. '{company} revenue growth year over year earnings growth rate'\n"
            "4. '{company} recent financial news SEC filings analyst reports'\n"
            "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
            "**Data Collection Guidelines:**\n"
            "- Use multiple search queries to cross-verify financial figures\n"
            "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
            "- Compare data across different financial platforms for accuracy\n"
            "- Present findings in the exact format specified in expected_output."
        ),
        expected_output=(
            "Financial data summary in this structure:\n\n"
            "## Company Financial Overview\n"
            "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
            "**Latest Quarter:** [Period]\n"
            "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
            "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
            "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
            "**Key Metrics:**\n"
            "- P/E Ratio: X [Current vs Historical]\n"
            "- Profit Margin: X% [Trend indicator]\n"
            "- Debt-to-Equity: X [Industry comparison]\n\n"
            "**Growth Analysis:**\n"
            "- Revenue Growth: X% (3-year trend)\n"
            "- Earnings Growth: X% (consistency check)\n\n"
            "**Material Developments:** [1-2 key items with impact assessment]\n"
            "**Data Confidence:** [High/Medium/Low based on source consistency]"
        ),
        agent=data_researcher,
    )

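    # Task 2: analysis of the research output, wired in via context=[research_task].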
    analysis_task = Task(
        description=(
            "Analyze the collected financial data to assess the company's performance and outlook. "
            "Include the following in your analysis:\n"
            "1. Evaluation of financial health based on key metrics\n"
            "2. Trend analysis showing growth or decline patterns\n"
            "3. Comparison with industry benchmarks or competitors\n"
            "4. Identification of strengths and potential areas of concern\n"
            "5. Short-term financial outlook based on current trends"
        ),
        expected_output=(
            "A detailed financial analysis that includes assessment of key metrics, trends, "
            "comparative analysis, and a reasoned outlook for the company's financial future."
        ),
        agent=financial_analyst,
        context=[research_task],
    )

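    # Task 3: final report, drawing on both previous tasks via context.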
    report_task = Task(
        description=(
            "Create a professional financial report based on the research and analysis. "
            "The report should:\n"
            "1. Begin with an executive summary highlighting key findings\n"
            "2. Present the financial analysis in a clear, logical structure\n"
            "3. Include visual representations of key data points (described textually)\n"
            "4. Provide actionable insights for potential investors\n"
            "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
        ),
        expected_output=(
            "A professional, comprehensive financial report with executive summary, "
            "structured analysis, visual elements, actionable insights, and a clear recommendation."
        ),
        agent=report_writer,
        context=[research_task, analysis_task],
    )

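    # Sequential crew: research -> analysis -> report.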
    crew = Crew(
        agents=[data_researcher, financial_analyst, report_writer],
        tasks=[research_task, analysis_task, report_task],
        process=Process.sequential,
    )

    return crew

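
# The dataset below mixes a per-criterion expected score ({"goal_alignment": 8})
# with a plain overall score (8); the second entry also carries an explicit identifier.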
def test_financial_analysis_regression(financial_analysis_crew):
    dataset = [
        {
            "inputs": {"company": "Apple Inc. (AAPL)"},
            "expected_score": {"goal_alignment": 8},
        },
        {
            "identifier": "test_2",
            "inputs": {"company": "Microsoft Corporation (MSFT)"},
            "expected_score": 8,
        },
    ]

    results = run_experiment(dataset=dataset, crew=financial_analysis_crew, verbose=True)

    assert_experiment_successfully(results)
33
tests/regression/test_history_teacher.py
Normal file
@@ -0,0 +1,33 @@
import pytest
from crewai import Agent
from crewai_tools import SerperDevTool

from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
)


@pytest.fixture
def history_teacher():
    search_tool = SerperDevTool()
    return Agent(
        role="History Educator",
        goal="Teach students about important historical events with clarity and context",
        backstory=(
            "As a renowned historian and educator, you have spent decades studying world history, "
            "from ancient civilizations to modern events. You are passionate about making history "
            "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
            "and spark curiosity about the past."
        ),
        tools=[search_tool],
        verbose=True,
    )
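

# Single-agent experiment: one scored prompt is run directly against the agent (no crew).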
def test_history_teacher(history_teacher):
    dataset = [
        {"inputs": {"messages": "How was the Battle of Waterloo?"}, "expected_score": 8}
    ]
    results = run_experiment(
        dataset=dataset, agents=[history_teacher], verbose=True
    )

    assert_experiment_successfully(results)
@@ -1133,6 +1133,119 @@ def test_output_file_validation():
    )


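# The tests below exercise Task.create_directory together with Task._save_file,
# cleaning up any files and directories they create on disk.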
def test_create_directory_true():
    """Test that directories are created when create_directory=True."""
    from pathlib import Path

    output_path = "test_create_dir/output.txt"

    task = Task(
        description="Test task",
        expected_output="Test output",
        output_file=output_path,
        create_directory=True,
    )

    resolved_path = Path(output_path).expanduser().resolve()
    resolved_dir = resolved_path.parent

    if resolved_path.exists():
        resolved_path.unlink()
    if resolved_dir.exists():
        import shutil
        shutil.rmtree(resolved_dir)

    assert not resolved_dir.exists()

    task._save_file("test content")

    assert resolved_dir.exists()
    assert resolved_path.exists()

    if resolved_path.exists():
        resolved_path.unlink()
    if resolved_dir.exists():
        import shutil
        shutil.rmtree(resolved_dir)


def test_create_directory_false():
    """Test that directories are not created when create_directory=False."""
    from pathlib import Path

    output_path = "nonexistent_test_dir/output.txt"

    task = Task(
        description="Test task",
        expected_output="Test output",
        output_file=output_path,
        create_directory=False,
    )

    resolved_path = Path(output_path).expanduser().resolve()
    resolved_dir = resolved_path.parent

    if resolved_dir.exists():
        import shutil
        shutil.rmtree(resolved_dir)

    assert not resolved_dir.exists()

    with pytest.raises(RuntimeError, match="Directory .* does not exist and create_directory is False"):
        task._save_file("test content")


def test_create_directory_default():
    """Test that create_directory defaults to True for backward compatibility."""
    task = Task(
        description="Test task",
        expected_output="Test output",
        output_file="output.txt",
    )

    assert task.create_directory is True


def test_create_directory_with_existing_directory():
    """Test that create_directory=False works when directory already exists."""
    from pathlib import Path

    output_path = "existing_test_dir/output.txt"

    resolved_path = Path(output_path).expanduser().resolve()
    resolved_dir = resolved_path.parent
    resolved_dir.mkdir(parents=True, exist_ok=True)

    task = Task(
        description="Test task",
        expected_output="Test output",
        output_file=output_path,
        create_directory=False,
    )

    task._save_file("test content")
    assert resolved_path.exists()

    if resolved_path.exists():
        resolved_path.unlink()
    if resolved_dir.exists():
        import shutil
        shutil.rmtree(resolved_dir)


def test_github_issue_3149_reproduction():
    """Test that reproduces the exact issue from GitHub issue #3149."""
    task = Task(
        description="Test task for issue reproduction",
        expected_output="Test output",
        output_file="test_output.txt",
        create_directory=True,
    )

    assert task.create_directory is True
    assert task.output_file == "test_output.txt"


@pytest.mark.vcr(filter_headers=["authorization"])
def test_task_execution_times():
    researcher = Agent(