Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-03 13:18:29 +00:00)

Comparing lg-agent-p...devin/1752 (7 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 482c7e5318 | |
| | 5896f6a119 | |
| | 27c449c9c4 | |
| | 9737333ffd | |
| | bf248d5118 | |
| | 2490e8cd46 | |
| | 9b67e5a15f | |
.github/workflows/regression-tests.yml (vendored, 75 deletions)

@@ -1,75 +0,0 @@
name: Regression Tests

on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'Branch to run tests on'
        required: true
        default: 'main'
        type: string

permissions:
  contents: write

env:
  OPENAI_API_KEY: fake-api-key
  PYTHONUNBUFFERED: 1

jobs:
  regression-tests:
    name: Regression - ${{ github.event.inputs.branch }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.branch }}
          fetch-depth: 0

      - name: Display execution info
        run: |
          echo "🚀 Running Regression Tests"
          echo "📂 Branch: ${{ github.event.inputs.branch }}"
          echo "📊 Current commit: $(git rev-parse --short HEAD)"

      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
          cache-dependency-glob: |
            **/pyproject.toml
            **/uv.lock

      - name: Set up Python 3.13
        run: uv python install 3.13

      - name: Install the project
        run: uv sync --dev --all-extras

      - name: Install SQLite with FTS5 support
        run: |
          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
          # module is compiled with FTS5 support by default.
          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support

          # Install pysqlite3-binary which has FTS5 support
          uv pip install pysqlite3-binary
          # Create a sitecustomize.py to override sqlite3 with pysqlite3
          mkdir -p .pytest_sqlite_override
          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
          # Test FTS5 availability
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"

      - name: Run Regression Tests
        run: |
          PYTHONPATH=.pytest_sqlite_override uv run pytest \
            --block-network \
            --timeout=30 \
            -vv \
            --durations=10 \
            -n auto \
            --maxfail=5 \
            tests/regression
.github/workflows/tests.yml (vendored, 18 changes)

@@ -37,25 +37,9 @@ jobs:
       - name: Install the project
         run: uv sync --dev --all-extras

-      - name: Install SQLite with FTS5 support
-        run: |
-          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
-          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
-          # module is compiled with FTS5 support by default.
-          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
-
-          # Install pysqlite3-binary which has FTS5 support
-          uv pip install pysqlite3-binary
-          # Create a sitecustomize.py to override sqlite3 with pysqlite3
-          mkdir -p .pytest_sqlite_override
-          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
-          # Test FTS5 availability
-          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
-          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"
-
       - name: Run tests (group ${{ matrix.group }} of 8)
         run: |
-          PYTHONPATH=.pytest_sqlite_override uv run pytest \
+          uv run pytest \
             --block-network \
             --timeout=30 \
             -vv \
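For context on the workaround being removed above: CPython's `site` module automatically imports a module named `sitecustomize` at interpreter startup if one is importable, so putting the override directory on `PYTHONPATH` swaps the stdlib `sqlite3` for `pysqlite3` before any test code runs. A minimal sketch of the same mechanism, using the module names from the workflow above:

```python
# sitecustomize.py -- auto-imported by CPython at startup when on sys.path.
# Replaces the stdlib sqlite3 module with pysqlite3 (built with FTS5 support),
# so every later `import sqlite3` resolves to the FTS5-capable build.
import sys

import pysqlite3

sys.modules["sqlite3"] = pysqlite3

# --- anywhere else in the same process, after startup ---
import sqlite3  # actually pysqlite3 now, thanks to the sys.modules override

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE notes USING fts5(content)")  # FTS5 available
```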
.gitignore (vendored, 3 changes)

@@ -26,4 +26,5 @@ test_flow.html
 crewairules.mdc
 plan.md
 conceptual_plan.md
 build_image
+chromadb-*.lock
Neatlogs integration docs

@@ -10,8 +10,6 @@ Neatlogs helps you **see what your agent did**, **why**, and **share it**.

 It captures every step: thoughts, tool calls, responses, evaluations. No raw logs. Just clear, structured traces. Great for debugging and collaboration.

 ---

 ## Why use Neatlogs?

 CrewAI agents use multiple tools and reasoning steps. When something goes wrong, you need context — not just errors.

@@ -37,8 +35,6 @@ The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to d
 
 

 ---

 ## Core Features

 - **Trace Viewer**: Track thoughts, tools, and decisions in sequence

@@ -49,8 +45,6 @@ The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to d
 - **Ask the Trace (AI)**: Chat with your trace using Neatlogs AI bot
 - **Public Sharing**: Publish trace links to your community

 ---

 ## Quick Setup with CrewAI

 <Steps>

@@ -61,7 +55,7 @@ The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to d
 ```bash
 pip install neatlogs
 ```
-(Latest version 0.8.0, Python 3.8+; MIT license) :contentReference[oaicite:1]{index=1}
+(Latest version 0.8.0, Python 3.8+; MIT license)
 </Step>
 <Step title="Initialize Neatlogs">
 Before starting Crew agents, add:

@@ -76,18 +70,18 @@ The best UX to view a CrewAI trace. Post comments anywhere you want. Use AI to d
 </Step>
 </Steps>
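The initialization snippet itself falls outside the lines shown in this hunk. Going by the "two lines of code" description elsewhere on this page, the setup presumably looks something like the sketch below; the `neatlogs.init` name and its argument are assumptions, not confirmed by this diff:

```python
# Hypothetical two-line setup; function name and signature are assumed,
# not taken from this diff. Keys come from https://app.neatlogs.com/.
import neatlogs

neatlogs.init("YOUR_NEATLOGS_API_KEY")
```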
 ---

 ## Under the Hood

 According to GitHub, Neatlogs:

-- Captures thoughts, tool calls, responses, errors, and token stats :contentReference[oaicite:2]{index=2}
-- Supports AI-powered task generation and robust evaluation workflows :contentReference[oaicite:3]{index=3}
+- Captures thoughts, tool calls, responses, errors, and token stats
+- Supports AI-powered task generation and robust evaluation workflows

 All with just two lines of code.

 ---

 ## Watch It Work

@@ -113,7 +107,7 @@ All with just two lines of code.
     allowFullScreen
 ></iframe>

 ---

 ## Links & Support

@@ -121,9 +115,9 @@ All with just two lines of code.
 - 🔐 [Dashboard & API Key](https://app.neatlogs.com/)
 - 🐦 [Follow on Twitter](https://twitter.com/neatlogs)
 - 📧 Contact: hello@neatlogs.com
-- 🛠 [GitHub SDK](https://github.com/NeatLogs/neatlogs) :contentReference[oaicite:4]{index=4}
+- 🛠 [GitHub SDK](https://github.com/NeatLogs/neatlogs)

 ---

 ## TL;DR
pyproject.toml

@@ -39,6 +39,7 @@ dependencies = [
     "tomli>=2.0.2",
     "blinker>=1.9.0",
     "json5>=0.10.0",
+    "portalocker==2.7.0",
 ]

 [project.urls]

@@ -47,7 +48,7 @@ Documentation = "https://docs.crewai.com"
 Repository = "https://github.com/crewAIInc/crewAI"

 [project.optional-dependencies]
-tools = ["crewai-tools~=0.51.0"]
+tools = ["crewai-tools~=0.55.0"]
 embeddings = [
     "tiktoken~=0.8.0"
 ]

@@ -137,6 +138,3 @@ exclude = [
-    "docs/**",
+    "docs/",
 ]
-
-[tool.pytest.ini_options]
-norecursedirs = ["tests/regression"]
@@ -54,7 +54,7 @@ def _track_install_async():

 _track_install_async()

-__version__ = "0.141.0"
+__version__ = "0.148.0"
 __all__ = [
     "Agent",
     "Crew",
@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0,<1.0.0"
+    "crewai[tools]>=0.148.0,<1.0.0"
 ]

 [project.scripts]

@@ -5,7 +5,7 @@ description = "{{name}} using crewAI"
 authors = [{ name = "Your Name", email = "you@example.com" }]
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0,<1.0.0",
+    "crewai[tools]>=0.148.0,<1.0.0",
 ]

 [project.scripts]
@@ -5,7 +5,7 @@ description = "Power up your crews with {{folder_name}}"
 readme = "README.md"
 requires-python = ">=3.10,<3.14"
 dependencies = [
-    "crewai[tools]>=0.141.0"
+    "crewai[tools]>=0.148.0"
 ]

 [tool.crewai]
@@ -25,6 +25,11 @@ class CrewOutput(BaseModel):

     @property
     def json(self) -> Optional[str]:
+        if not self.tasks_output:
+            raise ValueError(
+                "No tasks found in crew output. Please ensure the crew has completed at least one task before accessing JSON output."
+            )
+
         if self.tasks_output[-1].output_format != OutputFormat.JSON:
             raise ValueError(
                 "No JSON output found in the final task. Please make sure to set the output_json property in the final task in your crew."
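The tests added further down in this diff exercise the new guard; in short, accessing `.json` on a `CrewOutput` with no task outputs now raises a descriptive `ValueError` instead of failing on `tasks_output[-1]`:

```python
# Mirrors test_crew_output_json_empty_tasks later in this diff.
from crewai.crews.crew_output import CrewOutput
from crewai.types.usage_metrics import UsageMetrics

output = CrewOutput(raw="Test output", tasks_output=[], token_usage=UsageMetrics())
try:
    _ = output.json
except ValueError as e:
    print(e)  # "No tasks found in crew output. ..."
```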
@@ -1,23 +1,24 @@
 import threading
 from typing import Any

 from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
 from crewai.agent import Agent
 from crewai.task import Task
 from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter

-from typing import Any
+from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
 from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
 from collections.abc import Sequence
 from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
 from crewai.utilities.events.task_events import TaskCompletedEvent
 from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
-from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
-import threading
+from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory

 class ExecutionState:
     def __init__(self):
         self.traces = {}
-        self.current_agent_id = None
-        self.current_task_id = None
+        self.current_agent_id: str | None = None
+        self.current_task_id: str | None = None
         self.iteration = 1
         self.iterations_results = {}
         self.agent_evaluators = {}

@@ -49,17 +50,21 @@ class AgentEvaluator:
         return self._thread_local.execution_state

     def _subscribe_to_events(self) -> None:
-        crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
-        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)
+        from typing import cast
+        crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
+        crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))

     def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
+        assert event.task is not None
         agent = event.task.agent
         if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
+            self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))

             state = ExecutionState()
             state.current_agent_id = str(agent.id)
             state.current_task_id = str(event.task.id)

+            assert state.current_agent_id is not None and state.current_task_id is not None
             trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)

             if not trace:

@@ -100,6 +105,7 @@ class AgentEvaluator:
         if not target_agent:
             return

+        assert state.current_agent_id is not None and state.current_task_id is not None
         trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)

         if not trace:

@@ -181,8 +187,10 @@ class AgentEvaluator:
         )

         assert self.evaluators is not None
+        task_id = str(task.id) if task else None
         for evaluator in self.evaluators:
             try:
+                self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
                 score = evaluator.evaluate(
                     agent=agent,
                     task=task,

@@ -190,11 +198,31 @@ class AgentEvaluator:
                     final_output=final_output
                 )
                 result.metrics[evaluator.metric_category] = score
+                self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
             except Exception as e:
+                self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
                 self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")

         return result

+    def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
+        )
+
+    def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
+        )
+
+    def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
+        crewai_event_bus.emit(
+            self,
+            AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
+        )

 def create_default_evaluator(agents: list[Agent], llm: None = None):
     from crewai.experimental.evaluation import (
         GoalAlignmentEvaluator,
@@ -227,4 +227,8 @@ class EvaluationTraceCallback(BaseEventListener):


 def create_evaluation_callbacks() -> EvaluationTraceCallback:
-    return EvaluationTraceCallback()
+    from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+
+    callback = EvaluationTraceCallback()
+    callback.setup_listeners(crewai_event_bus)
+    return callback
@@ -1,5 +1,4 @@
 import inspect
 from pathlib import Path

 from typing_extensions import Any
 import warnings
@@ -42,30 +41,12 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
     return runner.run(agents=agents, crew=crew, print_summary=verbose)

 def _get_baseline_filepath_fallback() -> str:
-    filename = "experiment_fallback.json"
-    calling_file = None
     test_func_name = "experiment_fallback"

     try:
         current_frame = inspect.currentframe()
         if current_frame is not None:
             test_func_name = current_frame.f_back.f_back.f_code.co_name  # type: ignore[union-attr]
-            filename = f"{test_func_name}.json"
-            calling_file = current_frame.f_back.f_back.f_code.co_filename  # type: ignore[union-attr]
     except Exception:
-        return filename
-
-    if not calling_file:
-        return filename
-
-    calling_path = Path(calling_file)
-    try:
-        baseline_dir_parts = calling_path.parts[:-1]
-        baseline_dir = Path(*baseline_dir_parts) / "results"
-        baseline_dir.mkdir(parents=True, exist_ok=True)
-        baseline_filepath = baseline_dir / filename
-        return str(baseline_filepath)
-
-    except (ValueError, IndexError):
-        pass
-
-    return filename
+        ...
+    return f"{test_func_name}_results.json"
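For readers unfamiliar with the frame trick used in this fallback: `inspect.currentframe().f_back.f_back` walks two call frames up the stack, so the helper names the baseline file after the test function that (indirectly) invoked it. A standalone sketch of the mechanism, independent of the crewAI code:

```python
import inspect

def outer():
    return middle()

def middle():
    # One frame up from here is `outer`; the fallback above walks two frames
    # because an extra wrapper sits between it and the calling test function.
    frame = inspect.currentframe()
    assert frame is not None and frame.f_back is not None
    return frame.f_back.f_code.co_name

print(outer())  # -> "outer"
```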
@@ -18,6 +18,7 @@ from crewai.utilities.chromadb import sanitize_collection_name
 from crewai.utilities.constants import KNOWLEDGE_DIRECTORY
 from crewai.utilities.logger import Logger
 from crewai.utilities.paths import db_storage_path
+from crewai.utilities.chromadb import create_persistent_client


 @contextlib.contextmanager

@@ -84,14 +85,11 @@ class KnowledgeStorage(BaseKnowledgeStorage):
             raise Exception("Collection not initialized")

     def initialize_knowledge_storage(self):
-        base_path = os.path.join(db_storage_path(), "knowledge")
-        chroma_client = chromadb.PersistentClient(
-            path=base_path,
+        self.app = create_persistent_client(
+            path=os.path.join(db_storage_path(), "knowledge"),
             settings=Settings(allow_reset=True),
         )

-        self.app = chroma_client
-
         try:
             collection_name = (
                 f"knowledge_{self.collection_name}"

@@ -111,9 +109,8 @@ class KnowledgeStorage(BaseKnowledgeStorage):
     def reset(self):
         base_path = os.path.join(db_storage_path(), KNOWLEDGE_DIRECTORY)
         if not self.app:
-            self.app = chromadb.PersistentClient(
-                path=base_path,
-                settings=Settings(allow_reset=True),
+            self.app = create_persistent_client(
+                path=base_path, settings=Settings(allow_reset=True)
             )

         self.app.reset()
@@ -4,12 +4,12 @@ import logging
 import os
 import shutil
 import uuid

 from typing import Any, Dict, List, Optional

 from chromadb.api import ClientAPI

 from crewai.memory.storage.base_rag_storage import BaseRAGStorage
 from crewai.utilities import EmbeddingConfigurator
+from crewai.utilities.chromadb import create_persistent_client
 from crewai.utilities.constants import MAX_FILE_NAME_LENGTH
 from crewai.utilities.paths import db_storage_path

@@ -60,17 +60,15 @@ class RAGStorage(BaseRAGStorage):
         self.embedder_config = configurator.configure_embedder(self.embedder_config)

     def _initialize_app(self):
-        import chromadb
         from chromadb.config import Settings

         self._set_embedder_config()
-        chroma_client = chromadb.PersistentClient(
+
+        self.app = create_persistent_client(
             path=self.path if self.path else self.storage_file_name,
             settings=Settings(allow_reset=self.allow_reset),
         )

-        self.app = chroma_client
-
         self.collection = self.app.get_or_create_collection(
             name=self.type, embedding_function=self.embedder_config
         )
@@ -1,6 +1,10 @@
 import re
+import portalocker
+from chromadb import PersistentClient
+from hashlib import md5
 from typing import Optional


 MIN_COLLECTION_LENGTH = 3
 MAX_COLLECTION_LENGTH = 63
 DEFAULT_COLLECTION = "default_collection"

@@ -60,3 +64,16 @@ def sanitize_collection_name(name: Optional[str], max_collection_length: int = M
         sanitized = sanitized[:-1] + "z"

     return sanitized


+def create_persistent_client(path: str, **kwargs):
+    """
+    Creates a persistent client for ChromaDB with a lock file to prevent
+    concurrent creations. Works for both multi-threads and multi-processes
+    environments.
+    """
+    lockfile = f"chromadb-{md5(path.encode(), usedforsecurity=False).hexdigest()}.lock"
+    with portalocker.Lock(lockfile):
+        client = PersistentClient(path=path, **kwargs)
+
+    return client
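A usage sketch for the new helper, mirroring how the two storage classes above call it. This is also what the `chromadb-*.lock` addition to `.gitignore` earlier in this diff is for: the lock file is created relative to the working directory, named after the md5 of the path.

```python
# Sketch based on the storage-class call sites in this diff; the path and
# collection name below are example values, not taken from the repository.
from chromadb.config import Settings

from crewai.utilities.chromadb import create_persistent_client

# Safe to call concurrently from several threads or processes targeting the
# same path: creation is serialized on a per-path portalocker lock file.
client = create_persistent_client(
    path="/tmp/crewai-knowledge",         # assumed example path
    settings=Settings(allow_reset=True),  # extra kwargs pass through to PersistentClient
)
collection = client.get_or_create_collection(name="knowledge_default_collection")
```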
@@ -17,6 +17,9 @@ from .agent_events import (
     AgentExecutionStartedEvent,
     AgentExecutionCompletedEvent,
     AgentExecutionErrorEvent,
+    AgentEvaluationStartedEvent,
+    AgentEvaluationCompletedEvent,
+    AgentEvaluationFailedEvent,
 )
 from .task_events import (
     TaskStartedEvent,

@@ -74,6 +77,9 @@ __all__ = [
     "AgentExecutionStartedEvent",
     "AgentExecutionCompletedEvent",
     "AgentExecutionErrorEvent",
+    "AgentEvaluationStartedEvent",
+    "AgentEvaluationCompletedEvent",
+    "AgentEvaluationFailedEvent",
     "TaskStartedEvent",
     "TaskCompletedEvent",
     "TaskFailedEvent",
@@ -123,3 +123,28 @@ class AgentLogsExecutionEvent(BaseEvent):
     type: str = "agent_logs_execution"

     model_config = {"arbitrary_types_allowed": True}

+
+# Agent Eval events
+class AgentEvaluationStartedEvent(BaseEvent):
+    agent_id: str
+    agent_role: str
+    task_id: str | None = None
+    iteration: int
+    type: str = "agent_evaluation_started"
+
+class AgentEvaluationCompletedEvent(BaseEvent):
+    agent_id: str
+    agent_role: str
+    task_id: str | None = None
+    iteration: int
+    metric_category: Any
+    score: Any
+    type: str = "agent_evaluation_completed"
+
+class AgentEvaluationFailedEvent(BaseEvent):
+    agent_id: str
+    agent_role: str
+    task_id: str | None = None
+    iteration: int
+    error: str
+    type: str = "agent_evaluation_failed"
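The new evaluation events are consumed through the event bus; the updated tests later in this diff register handlers exactly like this:

```python
# Subscribing to the new evaluation events, as the updated tests in this diff
# do; scoped_handlers() keeps the registrations local to the with-block.
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.agent_events import (
    AgentEvaluationStartedEvent,
    AgentEvaluationCompletedEvent,
    AgentEvaluationFailedEvent,
)

with crewai_event_bus.scoped_handlers():
    @crewai_event_bus.on(AgentEvaluationStartedEvent)
    def on_started(source, event):
        print(f"evaluation started for {event.agent_role} (iteration {event.iteration})")

    @crewai_event_bus.on(AgentEvaluationCompletedEvent)
    def on_completed(source, event):
        print(f"{event.metric_category}: {event.score}")

    @crewai_event_bus.on(AgentEvaluationFailedEvent)
    def on_failed(source, event):
        print(f"evaluation failed: {event.error}")

    # ... run a crew or agent kickoff here ...
```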
tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml (new file, 123 lines)

@@ -0,0 +1,123 @@
interactions:
- request:
    body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
      created for testing purposes\nYour personal goal is: Complete test tasks successfully\nTo
      give my best complete final answer to the task respond using the exact following
      format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
      answer must be the great and the most complete as possible, it must be outcome
      described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
      "content": "\nCurrent Task: Test task description\n\nThis is the expected criteria
      for your final answer: Expected test output\nyou MUST return the actual complete
      content as the final answer, not a summary.\n\nBegin! This is VERY important
      to you, use the tools available and give your best Final Answer, your job depends
      on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
    headers:
      accept:
      - application/json
      accept-encoding:
      - gzip, deflate, zstd
      connection:
      - keep-alive
      content-length:
      - '879'
      content-type:
      - application/json
      host:
      - api.openai.com
      user-agent:
      - OpenAI/Python 1.93.0
      x-stainless-arch:
      - arm64
      x-stainless-async:
      - 'false'
      x-stainless-lang:
      - python
      x-stainless-os:
      - MacOS
      x-stainless-package-version:
      - 1.93.0
      x-stainless-raw-response:
      - 'true'
      x-stainless-read-timeout:
      - '600.0'
      x-stainless-retry-count:
      - '0'
      x-stainless-runtime:
      - CPython
      x-stainless-runtime-version:
      - 3.11.12
    method: POST
    uri: https://api.openai.com/v1/chat/completions
  response:
    body:
      string: !!binary |
        H4sIAAAAAAAAAwAAAP//jFTBbhtHDL3rK4g5rwRbtaNYt9RoEaNoUaBODm0DgZnh7jKe5WyHXDmO
        4X8vZiRLcupDLwvsPPLxPQ45jzMAx8GtwfkezQ9jnP9oeLv98N5+vfl9+4v89Mf76+XV7XDz8Yc/
        r39T15SM9PkLeXvOWvg0jJGMk+xgnwmNCuv56nJ5+XZ1tbqswJACxZLWjTa/SPOBhefLs+XF/Gw1
        P3+7z+4Te1K3hr9mAACP9Vt0SqCvbg1nzfPJQKrYkVsfggBcTrGcOFRlNRRzzRH0SYykSr8BSffg
        UaDjLQFCV2QDit5TBvhbfmbBCO/q/xpue1ZgBesJ6OtI3iiAkRqkycbJGrjv2ffgk5S6CqkFhECG
        HClAIPWZx9Kkgtz3aJVq37vChXoH2qcpBogp3UHkO1rAbU/QViW7Os8hLD5OgQBjBCFfOpEfgKVN
        ecBSpoFAQxK1jMbSgY+Y2R6aWjJTT6K8JSHVBlACYOgpk3gCS4DyADqS55YpQDdxoMhCuoCbgwKf
        tpSB0PeAJdaKseKpOsn0z8SZBhJrgERnXERY8S0JRsxWulkoilkKkDJ0JJQx8jcKi13DX3pWyuWm
        FPDQN8jU7mW3KRfdSaj2r5ZLMEmgXOYg7K5OlcQYI1Cs4vSFavSVmLWnsDgdnEztpFiGV6YYTwAU
        SVYbXkf20x55OgxpTN2Y02f9LtW1LKz9JhNqkjKQaml0FX2aAXyqyzC9mG835jSMtrF0R7Xc+Zvz
        HZ877uARvXqzBy0ZxuP58nLVvMK32Q2rnqyT8+h7CsfU4+7hFDidALMT1/9V8xr3zjlL93/oj4D3
        NBqFzZgpsH/p+BiW6Utd0dfDDl2ugl2ZK/a0MaZcbiJQi1PcPRxOH9Ro2LQsHeUxc309yk3Onmb/
        AgAA//8DAAbYfvVABQAA
    headers:
      CF-RAY:
      - 95f9c7ffa8331b11-GRU
      Connection:
      - keep-alive
      Content-Encoding:
      - gzip
      Content-Type:
      - application/json
      Date:
      - Tue, 15 Jul 2025 13:59:38 GMT
      Server:
      - cloudflare
      Set-Cookie:
      - __cf_bm=J_xe1AP.B5P6D2GVMCesyioeS5E9DnYT34rbwQUefFc-1752587978-1.0.1.1-5Dflk5cAj6YCsOSVbCFWWSpXpw_mXsczIdzWzs2h2OwDL01HQbduE5LAToy67sfjFjHeeO4xRrqPLUQpySy2QqyHXbI_fzX4UAt3.UdwHxU;
        path=/; expires=Tue, 15-Jul-25 14:29:38 GMT; domain=.api.openai.com; HttpOnly;
        Secure; SameSite=None
      - _cfuvid=0rTD8RMpxBQQy42jzmum16_eoRtWNfaZMG_TJkhGS7I-1752587978437-0.0.1.1-604800000;
        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
      Transfer-Encoding:
      - chunked
      X-Content-Type-Options:
      - nosniff
      access-control-expose-headers:
      - X-Request-ID
      alt-svc:
      - h3=":443"; ma=86400
      cf-cache-status:
      - DYNAMIC
      openai-organization:
      - crewai-iuxna1
      openai-processing-ms:
      - '2623'
      openai-version:
      - '2020-10-01'
      strict-transport-security:
      - max-age=31536000; includeSubDomains; preload
      x-envoy-upstream-service-time:
      - '2626'
      x-ratelimit-limit-requests:
      - '30000'
      x-ratelimit-limit-tokens:
      - '150000000'
      x-ratelimit-remaining-requests:
      - '29999'
      x-ratelimit-remaining-tokens:
      - '149999813'
      x-ratelimit-reset-requests:
      - 2ms
      x-ratelimit-reset-tokens:
      - 0s
      x-request-id:
      - req_ccc347e91010713379c920aa0efd1f4f
    status:
      code: 200
      message: OK
version: 1
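An aside on reading cassettes like the one above by hand: PyYAML decodes the `!!binary` scalar to bytes, and the recorded headers say `Content-Encoding: gzip`, so the body is gzip-compressed JSON. A sketch (the response field layout follows the OpenAI chat-completions schema, which is an assumption here, not something this diff states):

```python
# Decode the recorded response body from the cassette shown above.
import gzip
import json

import yaml

with open("tests/cassettes/TestAgentEvaluator.test_failed_evaluation.yaml") as f:
    cassette = yaml.safe_load(f)  # safe_load handles !!binary, yielding bytes

raw = cassette["interactions"][0]["response"]["body"]["string"]
payload = json.loads(gzip.decompress(raw))
# Assumed OpenAI chat-completions layout:
print(payload["choices"][0]["message"]["content"])
```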
@@ -310,6 +310,41 @@ def test_crew_creation(researcher, writer):
     assert result.raw == expected_string_output


+def test_crew_output_json_empty_tasks():
+    """Test that CrewOutput.json raises ValueError when tasks_output is empty."""
+    from crewai.crews.crew_output import CrewOutput
+    from crewai.types.usage_metrics import UsageMetrics
+
+    output = CrewOutput(
+        raw="Test output",
+        tasks_output=[],
+        token_usage=UsageMetrics()
+    )
+
+    with pytest.raises(ValueError) as excinfo:
+        _ = output.json
+
+    assert "No tasks found in crew output" in str(excinfo.value)
+
+
+def test_crew_output_json_reproduction_case():
+    """Test reproduction case from GitHub issue #3185."""
+    from crewai.crews.crew_output import CrewOutput
+
+    output = CrewOutput(
+        raw="",
+        pydantic=None,
+        json_dict={"test": "value"},
+        tasks_output=[],
+        token_usage={}
+    )
+
+    with pytest.raises(ValueError) as excinfo:
+        _ = output.json
+
+    assert "No tasks found in crew output" in str(excinfo.value)
+
+
 @pytest.mark.vcr(filter_headers=["authorization"])
 def test_sync_task_execution(researcher, writer):
     from unittest.mock import patch
@@ -11,9 +11,13 @@ from crewai.experimental.evaluation import (
     ToolSelectionEvaluator,
     ParameterExtractionEvaluator,
     ToolInvocationEvaluator,
-    ReasoningEfficiencyEvaluator
+    ReasoningEfficiencyEvaluator,
+    MetricCategory,
+    EvaluationScore
 )

+from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
+from crewai.utilities.events.crewai_event_bus import crewai_event_bus
 from crewai.experimental.evaluation import create_default_evaluator

 class TestAgentEvaluator:

@@ -102,28 +106,57 @@ class TestAgentEvaluator:
             goal="Complete test tasks successfully",
             backstory="An agent created for testing purposes",
         )
-        agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
-
-        agent.kickoff(messages="Complete this task successfully")
-
-        results = agent_evaluator.get_evaluation_results()
-
-        assert isinstance(results, dict)
-
-        result, = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
-
-        assert result.agent_id == str(agent.id)
-        assert result.task_id == "lite_task"
-
-        goal_alignment, = result.metrics.values()
-        assert goal_alignment.score == 2.0
-
-        expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
-        assert expected_feedback in goal_alignment.feedback
-
-        assert goal_alignment.raw_response is not None
-        assert '"score": 2' in goal_alignment.raw_response
+        with crewai_event_bus.scoped_handlers():
+            events = {}
+
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
+
+            agent.kickoff(messages="Complete this task successfully")
+
+            assert events.keys() == {"started", "completed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id is None
+            assert events["started"].iteration == 1
+
+            assert events["completed"].agent_id == str(agent.id)
+            assert events["completed"].agent_role == agent.role
+            assert events["completed"].task_id is None
+            assert events["completed"].iteration == 1
+            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
+            assert isinstance(events["completed"].score, EvaluationScore)
+            assert events["completed"].score.score == 2.0
+
+            results = agent_evaluator.get_evaluation_results()
+
+            assert isinstance(results, dict)
+
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == "lite_task"
+
+            goal_alignment, = result.metrics.values()
+            assert goal_alignment.score == 2.0
+
+            expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
+            assert expected_feedback in goal_alignment.feedback
+
+            assert goal_alignment.raw_response is not None
+            assert '"score": 2' in goal_alignment.raw_response

     @pytest.mark.vcr(filter_headers=["authorization"])
     def test_eval_specific_agents_from_crew(self, mock_crew):

@@ -140,25 +173,106 @@ class TestAgentEvaluator:
         mock_crew.agents.append(agent)
         mock_crew.tasks.append(task)

-        agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
-
-        mock_crew.kickoff()
-
-        results = agent_evaluator.get_evaluation_results()
-
-        assert isinstance(results, dict)
-        assert len(results.keys()) == 1
-        result, = results[agent.role]
-        assert isinstance(result, AgentEvaluationResult)
-
-        assert result.agent_id == str(agent.id)
-        assert result.task_id == str(task.id)
-
-        goal_alignment, = result.metrics.values()
-        assert goal_alignment.score == 5.0
-
-        expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
-        assert expected_feedback in goal_alignment.feedback
-
-        assert goal_alignment.raw_response is not None
-        assert '"score": 5' in goal_alignment.raw_response
+        with crewai_event_bus.scoped_handlers():
+            events = {}
+
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
+            mock_crew.kickoff()
+
+            assert events.keys() == {"started", "completed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1
+
+            assert events["completed"].agent_id == str(agent.id)
+            assert events["completed"].agent_role == agent.role
+            assert events["completed"].task_id == str(task.id)
+            assert events["completed"].iteration == 1
+            assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
+            assert isinstance(events["completed"].score, EvaluationScore)
+            assert events["completed"].score.score == 5.0
+
+            results = agent_evaluator.get_evaluation_results()
+
+            assert isinstance(results, dict)
+            assert len(results.keys()) == 1
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)
+
+            goal_alignment, = result.metrics.values()
+            assert goal_alignment.score == 5.0
+
+            expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
+            assert expected_feedback in goal_alignment.feedback
+
+            assert goal_alignment.raw_response is not None
+            assert '"score": 5' in goal_alignment.raw_response
+
+    @pytest.mark.vcr(filter_headers=["authorization"])
+    def test_failed_evaluation(self, mock_crew):
+        agent, = mock_crew.agents
+        task, = mock_crew.tasks
+
+        with crewai_event_bus.scoped_handlers():
+            events = {}
+
+            @crewai_event_bus.on(AgentEvaluationStartedEvent)
+            def capture_started(source, event):
+                events["started"] = event
+
+            @crewai_event_bus.on(AgentEvaluationCompletedEvent)
+            def capture_completed(source, event):
+                events["completed"] = event
+
+            @crewai_event_bus.on(AgentEvaluationFailedEvent)
+            def capture_failed(source, event):
+                events["failed"] = event
+
+            # Create a mock evaluator that will raise an exception
+            from crewai.experimental.evaluation.base_evaluator import BaseEvaluator
+            from crewai.experimental.evaluation import MetricCategory
+            class FailingEvaluator(BaseEvaluator):
+                metric_category = MetricCategory.GOAL_ALIGNMENT
+
+                def evaluate(self, agent, task, execution_trace, final_output):
+                    raise ValueError("Forced evaluation failure")
+
+            agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[FailingEvaluator()])
+            mock_crew.kickoff()
+
+            assert events.keys() == {"started", "failed"}
+            assert events["started"].agent_id == str(agent.id)
+            assert events["started"].agent_role == agent.role
+            assert events["started"].task_id == str(task.id)
+            assert events["started"].iteration == 1
+
+            assert events["failed"].agent_id == str(agent.id)
+            assert events["failed"].agent_role == agent.role
+            assert events["failed"].task_id == str(task.id)
+            assert events["failed"].iteration == 1
+            assert events["failed"].error == "Forced evaluation failure"
+
+            results = agent_evaluator.get_evaluation_results()
+            result, = results[agent.role]
+            assert isinstance(result, AgentEvaluationResult)
+
+            assert result.agent_id == str(agent.id)
+            assert result.task_id == str(task.id)
+
+            assert result.metrics == {}
@@ -1,42 +0,0 @@
[
  {
    "timestamp": "2025-07-15T21:34:08.253410+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "72239c22b0cdde98ad5c588074ef6325",
        "inputs": {
          "company": "Apple Inc. (AAPL)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 9.0,
          "tool_selection": 6.0,
          "parameter_extraction": 5.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 7.300000000000001
        },
        "expected_score": {
          "goal_alignment": 8
        },
        "passed": true
      },
      {
        "identifier": "test_2",
        "inputs": {
          "company": "Microsoft Corporation (MSFT)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 7.333333333333333,
          "tool_selection": 6.25,
          "parameter_extraction": 9.5,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 6.0
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
]
@@ -1,24 +0,0 @@
[
  {
    "timestamp": "2025-07-15T21:31:05.916161+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
        "inputs": {
          "messages": "How was the Battle of Waterloo?"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 10.0,
          "tool_selection": 10.0,
          "parameter_extraction": 10.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 5.5
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
]
@@ -1,144 +0,0 @@
import pytest
from crewai import Agent, Crew, Process, Task
from crewai_tools import SerperDevTool

from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
)


@pytest.fixture
def financial_analysis_crew():
    search_tool = SerperDevTool()

    data_researcher = Agent(
        role="Financial Data Researcher",
        goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
        backstory=(
            "You are a precision-focused financial analyst who uses multiple targeted searches "
            "to cross-verify data and ensure comprehensive coverage. You leverage different "
            "search approaches to gather financial information from various authoritative sources."
        ),
        tools=[search_tool],
    )

    financial_analyst = Agent(
        role="Financial Analyst",
        goal="Analyze financial data to assess company performance and outlook",
        backstory=(
            "You are a seasoned financial analyst with expertise in evaluating company "
            "performance through quantitative analysis. You can interpret financial statements, "
            "identify trends, and make reasoned assessments of a company's financial health."
        ),
        tools=[search_tool],
    )

    report_writer = Agent(
        role="Financial Report Writer",
        goal="Synthesize financial analysis into clear, actionable reports",
        backstory=(
            "You are an experienced financial writer who excels at turning complex financial "
            "analyses into clear, concise, and impactful reports. You know how to highlight "
            "key insights and present information in a way that's accessible to various audiences."
        ),
        tools=[],
    )

    research_task = Task(
        description=(
            "Research {company} financial data using multiple targeted search queries:\n\n"
            "**Search Strategy - Execute these searches sequentially:**\n"
            "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
            "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
            "3. '{company} revenue growth year over year earnings growth rate'\n"
            "4. '{company} recent financial news SEC filings analyst reports'\n"
            "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
            "**Data Collection Guidelines:**\n"
            "- Use multiple search queries to cross-verify financial figures\n"
            "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
            "- Compare data across different financial platforms for accuracy\n"
            "- Present findings in the exact format specified in expected_output."
        ),
        expected_output=(
            "Financial data summary in this structure:\n\n"
            "## Company Financial Overview\n"
            "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
            "**Latest Quarter:** [Period]\n"
            "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
            "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
            "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
            "**Key Metrics:**\n"
            "- P/E Ratio: X [Current vs Historical]\n"
            "- Profit Margin: X% [Trend indicator]\n"
            "- Debt-to-Equity: X [Industry comparison]\n\n"
            "**Growth Analysis:**\n"
            "- Revenue Growth: X% (3-year trend)\n"
            "- Earnings Growth: X% (consistency check)\n\n"
            "**Material Developments:** [1-2 key items with impact assessment]\n"
            "**Data Confidence:** [High/Medium/Low based on source consistency]"
        ),
        agent=data_researcher,
    )

    analysis_task = Task(
        description=(
            "Analyze the collected financial data to assess the company's performance and outlook. "
            "Include the following in your analysis:\n"
            "1. Evaluation of financial health based on key metrics\n"
            "2. Trend analysis showing growth or decline patterns\n"
            "3. Comparison with industry benchmarks or competitors\n"
            "4. Identification of strengths and potential areas of concern\n"
            "5. Short-term financial outlook based on current trends"
        ),
        expected_output=(
            "A detailed financial analysis that includes assessment of key metrics, trends, "
            "comparative analysis, and a reasoned outlook for the company's financial future."
        ),
        agent=financial_analyst,
        context=[research_task],
    )

    report_task = Task(
        description=(
            "Create a professional financial report based on the research and analysis. "
            "The report should:\n"
            "1. Begin with an executive summary highlighting key findings\n"
            "2. Present the financial analysis in a clear, logical structure\n"
            "3. Include visual representations of key data points (described textually)\n"
            "4. Provide actionable insights for potential investors\n"
            "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
        ),
        expected_output=(
            "A professional, comprehensive financial report with executive summary, "
            "structured analysis, visual elements, actionable insights, and a clear recommendation."
        ),
        agent=report_writer,
        context=[research_task, analysis_task],
    )

    crew = Crew(
        agents=[data_researcher, financial_analyst, report_writer],
        tasks=[research_task, analysis_task, report_task],
        process=Process.sequential,
    )

    return crew


def test_financial_analysis_regression(financial_analysis_crew):
    dataset = [
        {
            "inputs": {"company": "Apple Inc. (AAPL)"},
            "expected_score": {"goal_alignment": 8},
        },
        {
            "identifier": "test_2",
            "inputs": {"company": "Microsoft Corporation (MSFT)"},
            "expected_score": 8,
        },
    ]

    results = run_experiment(dataset=dataset, crew=financial_analysis_crew, verbose=True)

    assert_experiment_successfully(results)
@@ -1,33 +0,0 @@
import pytest
from crewai import Agent
from crewai_tools import SerperDevTool

from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
)


@pytest.fixture
def history_teacher():
    search_tool = SerperDevTool()
    return Agent(
        role="History Educator",
        goal="Teach students about important historical events with clarity and context",
        backstory=(
            "As a renowned historian and educator, you have spent decades studying world history, "
            "from ancient civilizations to modern events. You are passionate about making history "
            "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
            "and spark curiosity about the past."
        ),
        tools=[search_tool],
        verbose=True,
    )


def test_history_teacher(history_teacher):
    dataset = [
        {"inputs": {"messages": "How was the Battle of Waterloo?"}, "expected_score": 8}
    ]
    results = run_experiment(
        dataset=dataset, agents=[history_teacher], verbose=True
    )

    assert_experiment_successfully(results)
@@ -1,16 +1,27 @@
+import multiprocessing
+import tempfile
 import unittest
 from typing import Any, Dict, List, Union

 import pytest
+from chromadb.config import Settings
+from unittest.mock import patch, MagicMock

 from crewai.utilities.chromadb import (
     MAX_COLLECTION_LENGTH,
     MIN_COLLECTION_LENGTH,
     is_ipv4_pattern,
     sanitize_collection_name,
+    create_persistent_client,
 )


+def persistent_client_worker(path, queue):
+    try:
+        create_persistent_client(path=path)
+        queue.put(None)
+    except Exception as e:
+        queue.put(e)
+
+
 class TestChromadbUtils(unittest.TestCase):
     def test_sanitize_collection_name_long_name(self):
         """Test sanitizing a very long collection name."""

@@ -79,3 +90,34 @@ class TestChromadbUtils(unittest.TestCase):
         self.assertLessEqual(len(sanitized), MAX_COLLECTION_LENGTH)
         self.assertTrue(sanitized[0].isalnum())
         self.assertTrue(sanitized[-1].isalnum())

+    def test_create_persistent_client_passes_args(self):
+        with patch(
+            "crewai.utilities.chromadb.PersistentClient"
+        ) as mock_persistent_client, tempfile.TemporaryDirectory() as tmpdir:
+            mock_instance = MagicMock()
+            mock_persistent_client.return_value = mock_instance
+
+            settings = Settings(allow_reset=True)
+            client = create_persistent_client(path=tmpdir, settings=settings)
+
+            mock_persistent_client.assert_called_once_with(
+                path=tmpdir, settings=settings
+            )
+            self.assertIs(client, mock_instance)
+
+    def test_create_persistent_client_process_safe(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            queue = multiprocessing.Queue()
+            processes = [
+                multiprocessing.Process(
+                    target=persistent_client_worker, args=(tmpdir, queue)
+                )
+                for _ in range(5)
+            ]
+
+            [p.start() for p in processes]
+            [p.join() for p in processes]
+
+            errors = [queue.get(timeout=5) for _ in processes]
+            self.assertTrue(all(err is None for err in errors))