Compare commits

..

9 Commits

Author SHA1 Message Date
Lucas Gomide
7c5558bc13 feat: prevent agent parser from causing action loops 2025-07-18 16:35:07 -03:00
Lucas Gomide
c978c4f495 refactor agent parser 2025-07-18 15:56:57 -03:00
Lucas Gomide
fab7c8504a refactor: improve cleanup of observation and final answer 2025-07-18 15:40:46 -03:00
Lucas Gomide
ae9907c8e7 fix: prioritize Action over Final Answer to prevent tool bypassing
- Force Action execution when both Action and Final Answer are present
- Prevent agents from bypassing tool execution with premature answers
2025-07-17 15:51:14 -03:00
Lucas Gomide
3836ba50be cleaned text to squash 2025-07-17 15:50:44 -03:00
Lucas Gomide
63f7d75b34 feat: improve action detection when agent provides multiple choices 2025-07-17 15:50:07 -03:00
Lucas Gomide
c212dc2155 fix: try to get the first tool input dictionary when Agent returns a list of inputs 2025-07-17 15:37:20 -03:00
Lucas Gomide
e18174de19 fix: detect and clean agent-written observations in parser
Remove agent-written "Observation:" lines and ALL fake content
2025-07-17 15:34:11 -03:00
Lucas Gomide
9b67e5a15f Emit events about Agent eval (#3168)
* feat: emit events about Agent Eval

We are triggering events when an evaluation has started/completed/failed

* style: fix type checking issues
2025-07-16 13:18:59 -04:00
18 changed files with 592 additions and 418 deletions

View File

@@ -1,75 +0,0 @@
name: Regression Tests
on:
workflow_dispatch:
inputs:
branch:
description: 'Branch to run tests on'
required: true
default: 'main'
type: string
permissions:
contents: write
env:
OPENAI_API_KEY: fake-api-key
PYTHONUNBUFFERED: 1
jobs:
regression-tests:
name: Regression - ${{ github.event.inputs.branch }}
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.branch }}
fetch-depth: 0
- name: Display execution info
run: |
echo "🚀 Running Regression Tests"
echo "📂 Branch: ${{ github.event.inputs.branch }}"
echo "📊 Current commit: $(git rev-parse --short HEAD)"
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
cache-dependency-glob: |
**/pyproject.toml
**/uv.lock
- name: Set up Python 3.13
run: uv python install 3.13
- name: Install the project
run: uv sync --dev --all-extras
- name: Install SQLite with FTS5 support
run: |
# WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
# This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
# module is compiled with FTS5 support by default.
# TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
# Install pysqlite3-binary which has FTS5 support
uv pip install pysqlite3-binary
# Create a sitecustomize.py to override sqlite3 with pysqlite3
mkdir -p .pytest_sqlite_override
echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
# Test FTS5 availability
PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"
- name: Run Regression Tests
run: |
PYTHONPATH=.pytest_sqlite_override uv run pytest \
--block-network \
--timeout=30 \
-vv \
--durations=10 \
-n auto \
--maxfail=5 \
tests/regression

View File

@@ -137,6 +137,3 @@ exclude = [
"docs/**",
"docs/",
]
- [tool.pytest.ini_options]
- norecursedirs = ["tests/regression"]

View File

@@ -3,6 +3,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.agent_builder.base_agent_executor_mixin import CrewAgentExecutorMixin
from crewai.agents.parser import (
+ CrewAgentParser,
AgentAction,
AgentFinish,
OutputParserException,
@@ -95,6 +96,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
else self.stop
)
)
+ self._parser = CrewAgentParser(agent=self)
def invoke(self, inputs: Dict[str, str]) -> Dict[str, Any]:
if "system" in self.prompt:
@@ -143,6 +145,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
while not isinstance(formatted_answer, AgentFinish):
try:
if has_reached_max_iterations(self.iterations, self.max_iter):
+ self._parser.reached_max_iterations()
formatted_answer = handle_max_iterations_exceeded(
formatted_answer,
printer=self._printer,
@@ -150,6 +153,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
messages=self.messages,
llm=self.llm,
callbacks=self.callbacks,
+ parser=self._parser,
)
enforce_rpm_limit(self.request_within_rpm_limit)
@@ -161,7 +165,7 @@ class CrewAgentExecutor(CrewAgentExecutorMixin):
printer=self._printer,
from_task=self.task
)
- formatted_answer = process_llm_response(answer, self.use_stop_words)
+ formatted_answer = process_llm_response(answer, self.use_stop_words, self._parser)
if isinstance(formatted_answer, AgentAction):
# Extract agent fingerprint if available

View File

@@ -65,33 +65,26 @@ class CrewAgentParser:
"""
_i18n: I18N = I18N()
+ _max_iterations_reached: bool = False
agent: Any = None
def __init__(self, agent: Optional[Any] = None):
self.agent = agent
- @staticmethod
- def parse_text(text: str) -> Union[AgentAction, AgentFinish]:
- """
- Static method to parse text into an AgentAction or AgentFinish without needing to instantiate the class.
- Args:
- text: The text to parse.
- Returns:
- Either an AgentAction or AgentFinish based on the parsed content.
- """
- parser = CrewAgentParser()
- return parser.parse(text)
+ def reached_max_iterations(self) -> None:
+ self._max_iterations_reached = True
def parse(self, text: str) -> Union[AgentAction, AgentFinish]:
thought = self._extract_thought(text)
includes_answer = FINAL_ANSWER_ACTION in text
- regex = (
- r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
- )
- action_match = re.search(regex, text, re.DOTALL)
- if includes_answer:
+ action_match = self._find_last_action_input_pair(text)
+ # Prevent tool bypassing when both Action and Final Answer are present
+ # If the model returns both, we PRIORITIZE the action to force tool execution
+ if not self._max_iterations_reached and includes_answer and action_match:
+ return self._create_agent_action(thought, action_match, text)
+ elif includes_answer:
final_answer = text.split(FINAL_ANSWER_ACTION)[-1].strip()
# Check whether the final answer ends with triple backticks.
if final_answer.endswith("```"):
@@ -103,15 +96,7 @@ class CrewAgentParser:
return AgentFinish(thought, final_answer, text)
elif action_match:
- action = action_match.group(1)
- clean_action = self._clean_action(action)
- action_input = action_match.group(2).strip()
- tool_input = action_input.strip(" ").strip('"')
- safe_tool_input = self._safe_repair_json(tool_input)
- return AgentAction(thought, clean_action, safe_tool_input, text)
+ return self._create_agent_action(thought, action_match, text)
if not re.search(r"Action\s*\d*\s*:[\s]*(.*?)", text, re.DOTALL):
raise OutputParserException(
@@ -167,3 +152,69 @@ class CrewAgentParser:
return tool_input
return str(result)
def _create_agent_action(self, thought: str, action_match: dict, text: str) -> AgentAction:
cleaned_text = self._clean_agent_observations(text)
action = action_match["action"]
clean_action = self._clean_action(action)
action_input = action_match["input"]
tool_input = action_input.strip(" ").strip('"')
safe_tool_input = self._safe_repair_json(tool_input)
return AgentAction(thought, clean_action, safe_tool_input, cleaned_text)
def _find_last_action_input_pair(self, text: str) -> Optional[dict]:
"""
Finds the last complete Action / Action Input pair in the given text.
Useful for handling multiple action/observation cycles.
"""
def _match_all_pairs(text: str) -> list[tuple[str, str]]:
pattern = (
r"Action\s*\d*\s*:\s*([^\n]+)" # Action content
r"\s*[\n]+" # Optional whitespace/newline
r"Action\s*\d*\s*Input\s*\d*\s*:\s*" # Action Input label
r"([^\n]*(?:\n(?!Observation:|Thought:|Action\s*\d*\s*:|Final Answer:)[^\n]*)*)"
)
return re.findall(pattern, text, re.MULTILINE | re.DOTALL)
def _match_fallback_pair(text: str) -> Optional[dict]:
fallback_pattern = (
r"Action\s*\d*\s*:\s*(.*?)"
r"\s*Action\s*\d*\s*Input\s*\d*\s*:\s*"
r"(.*?)(?=\n(?:Observation:|Thought:|Action\s*\d*\s*:|Final Answer:)|$)"
)
match = re.search(fallback_pattern, text, re.DOTALL)
if match:
return {
"action": match.group(1).strip(),
"input": match.group(2).strip()
}
return None
matches = _match_all_pairs(text)
if matches:
last_action, last_input = matches[-1]
return {
"action": last_action.strip(),
"input": last_input.strip()
}
return _match_fallback_pair(text)
def _clean_agent_observations(self, text: str) -> str:
# Pattern: capture Action/Input lines, then Observation block until next Thought or end-of-string
obs_pattern = re.compile(
r'^(\s*Action:.*\n\s*Action Input:.*\n)' # group 1: Action + Action Input
r'\s*Observation:.*?(?=(?:\n\s*Thought:|\Z))', # non-greedy until Thought: or end-of-string
flags=re.DOTALL | re.MULTILINE
)
if obs_pattern.search(text):
text = obs_pattern.sub(r'\1', text)
# Remove Final Answer and everything following if present
text = re.sub(r'\n\s*Final\s+Answer:.*', '', text, flags=re.DOTALL | re.MULTILINE)
# Normalize blank lines
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text).strip()
return text

View File

@@ -1,23 +1,24 @@
import threading
from typing import Any
from crewai.experimental.evaluation.base_evaluator import AgentEvaluationResult, AggregationStrategy
from crewai.agent import Agent
from crewai.task import Task
from crewai.experimental.evaluation.evaluation_display import EvaluationDisplayFormatter
from typing import Any
from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
from crewai.experimental.evaluation import BaseEvaluator, create_evaluation_callbacks
from collections.abc import Sequence
from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.utilities.events.utils.console_formatter import ConsoleFormatter
from crewai.utilities.events.task_events import TaskCompletedEvent
from crewai.utilities.events.agent_events import LiteAgentExecutionCompletedEvent
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult
import threading
from crewai.experimental.evaluation.base_evaluator import AgentAggregatedEvaluationResult, EvaluationScore, MetricCategory
class ExecutionState:
def __init__(self):
self.traces = {}
- self.current_agent_id = None
- self.current_task_id = None
+ self.current_agent_id: str | None = None
+ self.current_task_id: str | None = None
self.iteration = 1
self.iterations_results = {}
self.agent_evaluators = {}
@@ -49,17 +50,21 @@ class AgentEvaluator:
return self._thread_local.execution_state
def _subscribe_to_events(self) -> None:
- crewai_event_bus.register_handler(TaskCompletedEvent, self._handle_task_completed)
- crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, self._handle_lite_agent_completed)
+ from typing import cast
+ crewai_event_bus.register_handler(TaskCompletedEvent, cast(Any, self._handle_task_completed))
+ crewai_event_bus.register_handler(LiteAgentExecutionCompletedEvent, cast(Any, self._handle_lite_agent_completed))
def _handle_task_completed(self, source: Any, event: TaskCompletedEvent) -> None:
assert event.task is not None
agent = event.task.agent
if agent and str(getattr(agent, 'id', 'unknown')) in self._execution_state.agent_evaluators:
self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=str(event.task.id))
state = ExecutionState()
state.current_agent_id = str(agent.id)
state.current_task_id = str(event.task.id)
assert state.current_agent_id is not None and state.current_task_id is not None
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
if not trace:
@@ -100,6 +105,7 @@ class AgentEvaluator:
if not target_agent:
return
assert state.current_agent_id is not None and state.current_task_id is not None
trace = self.callback.get_trace(state.current_agent_id, state.current_task_id)
if not trace:
@@ -181,8 +187,10 @@ class AgentEvaluator:
)
assert self.evaluators is not None
task_id = str(task.id) if task else None
for evaluator in self.evaluators:
try:
self.emit_evaluation_started_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id)
score = evaluator.evaluate(
agent=agent,
task=task,
@@ -190,11 +198,31 @@ class AgentEvaluator:
final_output=final_output
)
result.metrics[evaluator.metric_category] = score
self.emit_evaluation_completed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, metric_category=evaluator.metric_category, score=score)
except Exception as e:
self.emit_evaluation_failed_event(agent_role=agent.role, agent_id=str(agent.id), task_id=task_id, error=str(e))
self.console_formatter.print(f"Error in {evaluator.metric_category.value} evaluator: {str(e)}")
return result
def emit_evaluation_started_event(self, agent_role: str, agent_id: str, task_id: str | None = None):
crewai_event_bus.emit(
self,
AgentEvaluationStartedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration)
)
def emit_evaluation_completed_event(self, agent_role: str, agent_id: str, task_id: str | None = None, metric_category: MetricCategory | None = None, score: EvaluationScore | None = None):
crewai_event_bus.emit(
self,
AgentEvaluationCompletedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, metric_category=metric_category, score=score)
)
def emit_evaluation_failed_event(self, agent_role: str, agent_id: str, error: str, task_id: str | None = None):
crewai_event_bus.emit(
self,
AgentEvaluationFailedEvent(agent_role=agent_role, agent_id=agent_id, task_id=task_id, iteration=self._execution_state.iteration, error=error)
)
def create_default_evaluator(agents: list[Agent], llm: None = None):
from crewai.experimental.evaluation import (
GoalAlignmentEvaluator,

View File

@@ -227,4 +227,8 @@ class EvaluationTraceCallback(BaseEventListener):
def create_evaluation_callbacks() -> EvaluationTraceCallback:
- return EvaluationTraceCallback()
+ from crewai.utilities.events.crewai_event_bus import crewai_event_bus
+ callback = EvaluationTraceCallback()
+ callback.setup_listeners(crewai_event_bus)
+ return callback

View File

@@ -1,5 +1,4 @@
import inspect
- from pathlib import Path
from typing_extensions import Any
import warnings
@@ -42,30 +41,12 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
return runner.run(agents=agents, crew=crew, print_summary=verbose)
def _get_baseline_filepath_fallback() -> str:
filename = "experiment_fallback.json"
calling_file = None
test_func_name = "experiment_fallback"
try:
current_frame = inspect.currentframe()
if current_frame is not None:
test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
filename = f"{test_func_name}.json"
calling_file = current_frame.f_back.f_back.f_code.co_filename # type: ignore[union-attr]
except Exception:
return filename
if not calling_file:
return filename
calling_path = Path(calling_file)
try:
baseline_dir_parts = calling_path.parts[:-1]
baseline_dir = Path(*baseline_dir_parts) / "results"
baseline_dir.mkdir(parents=True, exist_ok=True)
baseline_filepath = baseline_dir / filename
return str(baseline_filepath)
except (ValueError, IndexError):
pass
return filename
...
return f"{test_func_name}_results.json"

View File

@@ -35,6 +35,7 @@ from crewai.agents.agent_builder.base_agent import BaseAgent
from crewai.agents.agent_builder.utilities.base_token_process import TokenProcess
from crewai.agents.cache import CacheHandler
from crewai.agents.parser import (
+ CrewAgentParser,
AgentAction,
AgentFinish,
OutputParserException,
@@ -204,6 +205,7 @@ class LiteAgent(FlowTrackable, BaseModel):
_printer: Printer = PrivateAttr(default_factory=Printer)
_guardrail: Optional[Callable] = PrivateAttr(default=None)
_guardrail_retry_count: int = PrivateAttr(default=0)
+ _parser: CrewAgentParser = PrivateAttr(default_factory=CrewAgentParser)
@model_validator(mode="after")
def setup_llm(self):
@@ -239,6 +241,13 @@ class LiteAgent(FlowTrackable, BaseModel):
return self
@model_validator(mode="after")
def setup_parser(self):
"""Set up the parser after initialization."""
self._parser = CrewAgentParser(agent=self.original_agent)
return self
@field_validator("guardrail", mode="before")
@classmethod
def validate_guardrail_function(
@@ -511,6 +520,7 @@ class LiteAgent(FlowTrackable, BaseModel):
messages=self._messages,
llm=cast(LLM, self.llm),
callbacks=self._callbacks,
+ parser=self._parser,
)
enforce_rpm_limit(self.request_within_rpm_limit)
@@ -553,7 +563,7 @@ class LiteAgent(FlowTrackable, BaseModel):
)
raise e
- formatted_answer = process_llm_response(answer, self.use_stop_words)
+ formatted_answer = process_llm_response(answer, self.use_stop_words, self._parser)
if isinstance(formatted_answer, AgentAction):
try:
@@ -622,4 +632,4 @@ class LiteAgent(FlowTrackable, BaseModel):
def _append_message(self, text: str, role: str = "assistant") -> None:
"""Append a message to the message list with the given role."""
- self._messages.append(format_message_for_llm(text, role=role))
\ No newline at end of file
+ self._messages.append(format_message_for_llm(text, role=role))

View File

@@ -71,6 +71,7 @@ def handle_max_iterations_exceeded(
messages: List[Dict[str, str]],
llm: Union[LLM, BaseLLM],
callbacks: List[Any],
+ parser: CrewAgentParser
) -> Union[AgentAction, AgentFinish]:
"""
Handles the case when the maximum number of iterations is exceeded.
@@ -109,7 +110,7 @@ def handle_max_iterations_exceeded(
)
raise ValueError("Invalid response from LLM call - None or empty.")
- formatted_answer = format_answer(answer)
+ formatted_answer = format_answer(parser, answer)
# Return the formatted answer, regardless of its type
return formatted_answer
@@ -119,10 +120,10 @@ def format_message_for_llm(prompt: str, role: str = "user") -> Dict[str, str]:
return {"role": role, "content": prompt}
- def format_answer(answer: str) -> Union[AgentAction, AgentFinish]:
+ def format_answer(parser: CrewAgentParser, answer: str) -> Union[AgentAction, AgentFinish]:
"""Format a response from the LLM into an AgentAction or AgentFinish."""
try:
- return CrewAgentParser.parse_text(answer)
+ return parser.parse(answer)
except Exception:
# If parsing fails, return a default AgentFinish
return AgentFinish(
@@ -173,18 +174,18 @@ def get_llm_response(
def process_llm_response(
- answer: str, use_stop_words: bool
+ answer: str, use_stop_words: bool, parser: CrewAgentParser
) -> Union[AgentAction, AgentFinish]:
"""Process the LLM response and format it into an AgentAction or AgentFinish."""
if not use_stop_words:
try:
# Preliminary parsing to check for errors.
- format_answer(answer)
+ format_answer(parser, answer)
except OutputParserException as e:
if FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE in e.error:
answer = answer.split("Observation:")[0].strip()
- return format_answer(answer)
+ return format_answer(parser, answer)
def handle_agent_action_core(

View File

@@ -17,6 +17,9 @@ from .agent_events import (
AgentExecutionStartedEvent,
AgentExecutionCompletedEvent,
AgentExecutionErrorEvent,
+ AgentEvaluationStartedEvent,
+ AgentEvaluationCompletedEvent,
+ AgentEvaluationFailedEvent,
)
from .task_events import (
TaskStartedEvent,
@@ -74,6 +77,9 @@ __all__ = [
"AgentExecutionStartedEvent",
"AgentExecutionCompletedEvent",
"AgentExecutionErrorEvent",
"AgentEvaluationStartedEvent",
"AgentEvaluationCompletedEvent",
"AgentEvaluationFailedEvent",
"TaskStartedEvent",
"TaskCompletedEvent",
"TaskFailedEvent",

View File

@@ -123,3 +123,28 @@ class AgentLogsExecutionEvent(BaseEvent):
type: str = "agent_logs_execution"
model_config = {"arbitrary_types_allowed": True}
# Agent Eval events
class AgentEvaluationStartedEvent(BaseEvent):
agent_id: str
agent_role: str
task_id: str | None = None
iteration: int
type: str = "agent_evaluation_started"
class AgentEvaluationCompletedEvent(BaseEvent):
agent_id: str
agent_role: str
task_id: str | None = None
iteration: int
metric_category: Any
score: Any
type: str = "agent_evaluation_completed"
class AgentEvaluationFailedEvent(BaseEvent):
agent_id: str
agent_role: str
task_id: str | None = None
iteration: int
error: str
type: str = "agent_evaluation_failed"

View File

@@ -371,3 +371,151 @@ class MockAgent:
# TODO: ADD TEST TO MAKE SURE ** REMOVAL DOESN'T MESS UP ANYTHING
def test_ensure_agent_action_is_selected_when_model_hallucinates_observation_and_final_answer(parser):
text = """Let's continue our effort to gather comprehensive, well-rounded information about AI in healthcare in 2023 to compile a detailed research report effectively.
Action: Web Search
Action Input: {"search_query": "innovations in AI for healthcare 2023 latest updates and challenges"}
Observation: The search is yielding repeated and abundant information on the fragmented, redundant regulatory frameworks, clinical validation importance, and varied insights about AIs ongoing integration challenges in healthcare. To ensure a rich mix of insights, let's compile, structure, and organize these insights into a coherent report.
Content Synthesis:
- **Innovations and Trends**:
- AI is significantly contributing to personalized medicine, enabling more accurate patient diagnosis and treatment plans.
- Deep learning models, especially in image and pattern recognition, are revolutionizing radiology and pathology.
- AI's role in drug discovery is speeding up research and reducing costs and time for new drugs entering the market.
- AI-driven wearable devices are proving crucial for patient monitoring, predicting potential health issues, and facilitating proactive care.
Thought: I now have ample information to construct a research report detailing innovations, challenges, and opportunities of AI in healthcare in 2023.
Final Answer: The finalized detailed research report on AI in Healthcare, 2023:
Title: Current Innovations, Challenges, and Potential of AI in Healthcare - 2023 Overview
Introduction:
The integration of Artificial Intelligence (AI) in healthcare is heralding a new era of modern medicine. In 2023, substantial technological advancements have brought about transformative changes in healthcare delivery. This report explores the latest AI innovations, identifies prevalent challenges, and discusses the potential opportunities in healthcare.
Potential and Opportunities:
AI's potential in healthcare is vast, presenting numerous opportunities:
- Cost Reduction: AI has the capacity to streamline operations, cutting costs significantly.
- Preventive Healthcare: Utilizing predictive analytics allows for early intervention and prevention, alleviating pressure on emergency and critical care resources.
- Enhanced Surgeries: Robotic surgeries guided by AI improve surgical outcomes and patient recovery times.
- Improved Patient Experience: AI-driven solutions personalize patient interaction, improving engagement and healthcare experiences.
Conclusion:
AI continues to reshape the healthcare landscape in 2023. Facing challenges head-on with robust solutions will unlock unparalleled benefits, positioning AI as a cornerstone for future medical and healthcare advancements. With ongoing improvements in regulations, data quality, and validation processes, the full potential of AI in healthcare stands to be realized.
"""
result = parser.parse(text)
expected_text = """Let's continue our effort to gather comprehensive, well-rounded information about AI in healthcare in 2023 to compile a detailed research report effectively.
Action: Web Search
Action Input: {"search_query": "innovations in AI for healthcare 2023 latest updates and challenges"}
Thought: I now have ample information to construct a research report detailing innovations, challenges, and opportunities of AI in healthcare in 2023.
"""
assert isinstance(result, AgentAction)
assert result.text.strip() == expected_text.strip()
def test_ensure_agent_action_is_selected_when_model_hallucinates_observation_field(parser):
text = """Let's continue our effort to gather comprehensive, well-rounded information about AI in healthcare in 2023 to compile a detailed research report effectively.
Action: Web Search
Action Input: {"search_query": "innovations in AI for healthcare 2023 latest updates and challenges"}
Observation: The search is yielding repeated and abundant information on the fragmented, redundant regulatory frameworks, clinical validation importance, and varied insights about AIs ongoing integration challenges in healthcare. To ensure a rich mix of insights, let's compile, structure, and organize these insights into a coherent report.
Content Synthesis:
- **Innovations and Trends**:
- AI is significantly contributing to personalized medicine, enabling more accurate patient diagnosis and treatment plans.
- Deep learning models, especially in image and pattern recognition, are revolutionizing radiology and pathology.
Final Answer: The finalized detailed research report on AI in Healthcare, 2023:
Title: Current Innovations, Challenges, and Potential of AI in Healthcare - 2023 Overview
Introduction:
The integration of Artificial Intelligence (AI) in healthcare is heralding a new era of modern medicine. In 2023, substantial technological advancements have brought about transformative changes in healthcare delivery. This report explores the latest AI innovations, identifies prevalent challenges, and discusses the potential opportunities in healthcare.
Innovations and Trends:
AI technologies are becoming deeply embedded in various aspects of healthcare operations. Key advancements include:
- Personalized Medicine: AI's analytical capabilities produce precise diagnostic outcomes and tailored treatment plans, fostering personalized medicine.
- Radiology and Pathology: AI, particularly through advanced deep learning models, is improving imaging accuracy, thereby transforming radiological and pathological analyses.
"""
result = parser.parse(text)
expected_text = """Let's continue our effort to gather comprehensive, well-rounded information about AI in healthcare in 2023 to compile a detailed research report effectively.
Action: Web Search
Action Input: {"search_query": "innovations in AI for healthcare 2023 latest updates and challenges"}
"""
assert isinstance(result, AgentAction)
assert result.text.strip() == expected_text.strip()
def test_ensure_agent_finish_is_selected_when_no_action_was_provided(parser):
text = """
```
Thought: The repeated results indicate that there may be a technical issue retrieving new information. I will summarize the available knowledge to complete the task.
Final Answer:
Research Report on AI in Healthcare (2023)
1. Introduction:
AI technologies have become increasingly important in healthcare for their potential to transform patient care, diagnostics, and operational efficiencies. As we progress through 2023, significant advancements are noted alongside various challenges that need addressing.
2. Developments in AI Technologies:
Recent years have seen AI significantly impact medical imaging, precision medicine, drug discovery, and robotic surgery. AI algorithms, such as neural networks and machine learning models, provide breakthroughs in analyzing large datasets to identify disease patterns, optimize treatment plans, and predict outcomes. In 2023, AI continues to be integrated within electronic health records, telemedicine platforms, and virtual health assistants, expanding its access and utility.
3. Challenges:
- **Data Quality and Availability:** AI models require accurate, comprehensive data. However, healthcare data often remains fragmented and inconsistent, limiting AI's efficacy. High-quality data collection and management are crucial.
- **Regulatory Frameworks:** Establishing clear regulations is imperative to ensure AI is used safely in clinical environments. Policymakers need to develop standards for AI research, implementation, and continuous monitoring.
- **Clinical Validation:** Before deploying AI models in healthcare applications, they must undergo rigorous clinical validation to confirm their safety and effectiveness.
- **Privacy and Consent:** Patient data privacy concerns persist. AI systems need robust mechanisms for data protection and maintaining patient consent when using personal health information.
4. Future Potentials:
AI holds the potential to democratize access to healthcare services by making diagnostic tools more accessible and improving personalized treatment plans. Future research and investments are expected to focus on enhancing AI models to process and generate insights from electronic health records, predict patient admissions, and improve monitoring systems in real time.
5. Conclusion:
In 2023, AI in healthcare continues to grow, supported by technological advancements and increased investment, despite ongoing challenges. Addressing these issues could allow AI to revolutionize healthcare, improving patient outcomes, and streamlining the efficiency of healthcare systems worldwide.
```
"""
result = parser.parse(text)
assert isinstance(result, AgentFinish)
assert result.text.strip() == text.strip()
def test_ensure_max_iteration_reached_and_agent_hallucinates_observation_and_final_answer(parser):
text = """Let's continue our effort to gather comprehensive, well-rounded information about AI in healthcare in 2023 to compile a detailed research report effectively.
Action: Web Search
Action Input: {"search_query": "innovations in AI for healthcare 2023 latest updates and challenges"}
Observation: The search is yielding repeated and abundant information on the fragmented, redundant regulatory frameworks, clinical validation importance, and varied insights about AIs ongoing integration challenges in healthcare. To ensure a rich mix of insights, let's compile, structure, and organize these insights into a coherent report.
Thought: I now have ample information to construct a research report detailing innovations, challenges, and opportunities of AI in healthcare in 2023.
Final Answer: The finalized detailed research report on AI in Healthcare, 2023:
Title: Current Innovations, Challenges, and Potential of AI in Healthcare - 2023 Overview
Introduction:
The integration of Artificial Intelligence (AI) in healthcare is heralding a new era of modern medicine. In 2023, substantial technological advancements have brought about transformative changes in healthcare delivery. This report explores the latest AI innovations, identifies prevalent challenges, and discusses the potential opportunities in healthcare.
Conclusion:
AI continues to reshape the healthcare landscape in 2023. Facing challenges head-on with robust solutions will unlock unparalleled benefits, positioning AI as a cornerstone for future medical and healthcare advancements. With ongoing improvements in regulations, data quality, and validation processes, the full potential of AI in healthcare stands to be realized.
"""
parser.reached_max_iterations()
result = parser.parse(text)
expected_text = """
The finalized detailed research report on AI in Healthcare, 2023:
Title: Current Innovations, Challenges, and Potential of AI in Healthcare - 2023 Overview
Introduction:
The integration of Artificial Intelligence (AI) in healthcare is heralding a new era of modern medicine. In 2023, substantial technological advancements have brought about transformative changes in healthcare delivery. This report explores the latest AI innovations, identifies prevalent challenges, and discusses the potential opportunities in healthcare.
Conclusion:
AI continues to reshape the healthcare landscape in 2023. Facing challenges head-on with robust solutions will unlock unparalleled benefits, positioning AI as a cornerstone for future medical and healthcare advancements. With ongoing improvements in regulations, data quality, and validation processes, the full potential of AI in healthcare stands to be realized.
"""
assert isinstance(result, AgentFinish)
assert result.output.strip() == expected_text.strip()

View File

@@ -0,0 +1,123 @@
interactions:
- request:
body: '{"messages": [{"role": "system", "content": "You are Test Agent. An agent
created for testing purposes\nYour personal goal is: Complete test tasks successfully\nTo
give my best complete final answer to the task respond using the exact following
format:\n\nThought: I now can give a great answer\nFinal Answer: Your final
answer must be the great and the most complete as possible, it must be outcome
described.\n\nI MUST use these formats, my job depends on it!"}, {"role": "user",
"content": "\nCurrent Task: Test task description\n\nThis is the expected criteria
for your final answer: Expected test output\nyou MUST return the actual complete
content as the final answer, not a summary.\n\nBegin! This is VERY important
to you, use the tools available and give your best Final Answer, your job depends
on it!\n\nThought:"}], "model": "gpt-4o-mini", "stop": ["\nObservation:"]}'
headers:
accept:
- application/json
accept-encoding:
- gzip, deflate, zstd
connection:
- keep-alive
content-length:
- '879'
content-type:
- application/json
host:
- api.openai.com
user-agent:
- OpenAI/Python 1.93.0
x-stainless-arch:
- arm64
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- MacOS
x-stainless-package-version:
- 1.93.0
x-stainless-raw-response:
- 'true'
x-stainless-read-timeout:
- '600.0'
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.11.12
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFTBbhtHDL3rK4g5rwRbtaNYt9RoEaNoUaBODm0DgZnh7jKe5WyHXDmO
4X8vZiRLcupDLwvsPPLxPQ45jzMAx8GtwfkezQ9jnP9oeLv98N5+vfl9+4v89Mf76+XV7XDz8Yc/
r39T15SM9PkLeXvOWvg0jJGMk+xgnwmNCuv56nJ5+XZ1tbqswJACxZLWjTa/SPOBhefLs+XF/Gw1
P3+7z+4Te1K3hr9mAACP9Vt0SqCvbg1nzfPJQKrYkVsfggBcTrGcOFRlNRRzzRH0SYykSr8BSffg
UaDjLQFCV2QDit5TBvhbfmbBCO/q/xpue1ZgBesJ6OtI3iiAkRqkycbJGrjv2ffgk5S6CqkFhECG
HClAIPWZx9Kkgtz3aJVq37vChXoH2qcpBogp3UHkO1rAbU/QViW7Os8hLD5OgQBjBCFfOpEfgKVN
ecBSpoFAQxK1jMbSgY+Y2R6aWjJTT6K8JSHVBlACYOgpk3gCS4DyADqS55YpQDdxoMhCuoCbgwKf
tpSB0PeAJdaKseKpOsn0z8SZBhJrgESnXERY8S0JRsxWulkoilkKkDJ0JJQx8jcKi13DX3pWyuWm
FPDQN8jU7mW3KRfdSaj2r5ZLMEmgXOYg7K5OlcQYI1Cs4vSFavSVmLWnsDgdnEztpFiGV6YYTwAU
SVYbXkf20x55OgxpTN2Y02f9LtW1LKz9JhNqkjKQaml0FX2aAXyqyzC9mG835jSMtrF0R7Xc+Zvz
HZ877uARvXqzBy0ZxuP58nLVvMK32Q2rnqyT8+h7CsfU4+7hFDidALMT1/9V8xr3zjlL93/oj4D3
NBqFzZgpsH/p+BiW6Utd0dfDDl2ugl2ZK/a0MaZcbiJQi1PcPRxOH9Ro2LQsHeUxc309yk3Onmb/
AgAA//8DAAbYfvVABQAA
headers:
CF-RAY:
- 95f9c7ffa8331b11-GRU
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 15 Jul 2025 13:59:38 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=J_xe1AP.B5P6D2GVMCesyioeS5E9DnYT34rbwQUefFc-1752587978-1.0.1.1-5Dflk5cAj6YCsOSVbCFWWSpXpw_mXsczIdzWzs2h2OwDL01HQbduE5LAToy67sfjFjHeeO4xRrqPLUQpySy2QqyHXbI_fzX4UAt3.UdwHxU;
path=/; expires=Tue, 15-Jul-25 14:29:38 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=0rTD8RMpxBQQy42jzmum16_eoRtWNfaZMG_TJkhGS7I-1752587978437-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- nosniff
access-control-expose-headers:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- crewai-iuxna1
openai-processing-ms:
- '2623'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-envoy-upstream-service-time:
- '2626'
x-ratelimit-limit-requests:
- '30000'
x-ratelimit-limit-tokens:
- '150000000'
x-ratelimit-remaining-requests:
- '29999'
x-ratelimit-remaining-tokens:
- '149999813'
x-ratelimit-reset-requests:
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_ccc347e91010713379c920aa0efd1f4f
status:
code: 200
message: OK
version: 1

View File

@@ -11,9 +11,13 @@ from crewai.experimental.evaluation import (
ToolSelectionEvaluator,
ParameterExtractionEvaluator,
ToolInvocationEvaluator,
- ReasoningEfficiencyEvaluator
+ ReasoningEfficiencyEvaluator,
+ MetricCategory,
+ EvaluationScore
)
+ from crewai.utilities.events.agent_events import AgentEvaluationStartedEvent, AgentEvaluationCompletedEvent, AgentEvaluationFailedEvent
+ from crewai.utilities.events.crewai_event_bus import crewai_event_bus
from crewai.experimental.evaluation import create_default_evaluator
class TestAgentEvaluator:
@@ -102,28 +106,57 @@ class TestAgentEvaluator:
goal="Complete test tasks successfully",
backstory="An agent created for testing purposes",
)
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
agent.kickoff(messages="Complete this task successfully")
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
results = agent_evaluator.get_evaluation_results()
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
assert isinstance(results, dict)
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
result, = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
assert result.agent_id == str(agent.id)
assert result.task_id == "lite_task"
agent.kickoff(messages="Complete this task successfully")
goal_alignment, = result.metrics.values()
assert goal_alignment.score == 2.0
assert events.keys() == {"started", "completed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id is None
assert events["started"].iteration == 1
expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
assert expected_feedback in goal_alignment.feedback
assert events["completed"].agent_id == str(agent.id)
assert events["completed"].agent_role == agent.role
assert events["completed"].task_id is None
assert events["completed"].iteration == 1
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
assert isinstance(events["completed"].score, EvaluationScore)
assert events["completed"].score.score == 2.0
assert goal_alignment.raw_response is not None
assert '"score": 2' in goal_alignment.raw_response
results = agent_evaluator.get_evaluation_results()
assert isinstance(results, dict)
result, = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == "lite_task"
goal_alignment, = result.metrics.values()
assert goal_alignment.score == 2.0
expected_feedback = "The agent did not demonstrate a clear understanding of the task goal, which is to complete test tasks successfully"
assert expected_feedback in goal_alignment.feedback
assert goal_alignment.raw_response is not None
assert '"score": 2' in goal_alignment.raw_response
@pytest.mark.vcr(filter_headers=["authorization"])
def test_eval_specific_agents_from_crew(self, mock_crew):
@@ -140,25 +173,106 @@ class TestAgentEvaluator:
mock_crew.agents.append(agent)
mock_crew.tasks.append(task)
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
mock_crew.kickoff()
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
results = agent_evaluator.get_evaluation_results()
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
assert isinstance(results, dict)
assert len(results.keys()) == 1
result, = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[GoalAlignmentEvaluator()])
mock_crew.kickoff()
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
assert events.keys() == {"started", "completed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id == str(task.id)
assert events["started"].iteration == 1
goal_alignment, = result.metrics.values()
assert goal_alignment.score == 5.0
assert events["completed"].agent_id == str(agent.id)
assert events["completed"].agent_role == agent.role
assert events["completed"].task_id == str(task.id)
assert events["completed"].iteration == 1
assert events["completed"].metric_category == MetricCategory.GOAL_ALIGNMENT
assert isinstance(events["completed"].score, EvaluationScore)
assert events["completed"].score.score == 5.0
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
assert expected_feedback in goal_alignment.feedback
results = agent_evaluator.get_evaluation_results()
assert goal_alignment.raw_response is not None
assert '"score": 5' in goal_alignment.raw_response
assert isinstance(results, dict)
assert len(results.keys()) == 1
result, = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
goal_alignment, = result.metrics.values()
assert goal_alignment.score == 5.0
expected_feedback = "The agent provided a thorough guide on how to conduct a test task but failed to produce specific expected output"
assert expected_feedback in goal_alignment.feedback
assert goal_alignment.raw_response is not None
assert '"score": 5' in goal_alignment.raw_response
@pytest.mark.vcr(filter_headers=["authorization"])
def test_failed_evaluation(self, mock_crew):
agent, = mock_crew.agents
task, = mock_crew.tasks
with crewai_event_bus.scoped_handlers():
events = {}
@crewai_event_bus.on(AgentEvaluationStartedEvent)
def capture_started(source, event):
events["started"] = event
@crewai_event_bus.on(AgentEvaluationCompletedEvent)
def capture_completed(source, event):
events["completed"] = event
@crewai_event_bus.on(AgentEvaluationFailedEvent)
def capture_failed(source, event):
events["failed"] = event
# Create a mock evaluator that will raise an exception
from crewai.experimental.evaluation.base_evaluator import BaseEvaluator
from crewai.experimental.evaluation import MetricCategory
class FailingEvaluator(BaseEvaluator):
metric_category = MetricCategory.GOAL_ALIGNMENT
def evaluate(self, agent, task, execution_trace, final_output):
raise ValueError("Forced evaluation failure")
agent_evaluator = AgentEvaluator(agents=[agent], evaluators=[FailingEvaluator()])
mock_crew.kickoff()
assert events.keys() == {"started", "failed"}
assert events["started"].agent_id == str(agent.id)
assert events["started"].agent_role == agent.role
assert events["started"].task_id == str(task.id)
assert events["started"].iteration == 1
assert events["failed"].agent_id == str(agent.id)
assert events["failed"].agent_role == agent.role
assert events["failed"].task_id == str(task.id)
assert events["failed"].iteration == 1
assert events["failed"].error == "Forced evaluation failure"
results = agent_evaluator.get_evaluation_results()
result, = results[agent.role]
assert isinstance(result, AgentEvaluationResult)
assert result.agent_id == str(agent.id)
assert result.task_id == str(task.id)
assert result.metrics == {}

View File

@@ -1,42 +0,0 @@
[
{
"timestamp": "2025-07-15T21:34:08.253410+00:00",
"metadata": {},
"results": [
{
"identifier": "72239c22b0cdde98ad5c588074ef6325",
"inputs": {
"company": "Apple Inc. (AAPL)"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 9.0,
"tool_selection": 6.0,
"parameter_extraction": 5.0,
"tool_invocation": 10.0,
"reasoning_efficiency": 7.300000000000001
},
"expected_score": {
"goal_alignment": 8
},
"passed": true
},
{
"identifier": "test_2",
"inputs": {
"company": "Microsoft Corporation (MSFT)"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 7.333333333333333,
"tool_selection": 6.25,
"parameter_extraction": 9.5,
"tool_invocation": 10.0,
"reasoning_efficiency": 6.0
},
"expected_score": 8,
"passed": true
}
]
}
]

View File

@@ -1,24 +0,0 @@
[
{
"timestamp": "2025-07-15T21:31:05.916161+00:00",
"metadata": {},
"results": [
{
"identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
"inputs": {
"messages": "How was the Battle of Waterloo?"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 10.0,
"tool_selection": 10.0,
"parameter_extraction": 10.0,
"tool_invocation": 10.0,
"reasoning_efficiency": 5.5
},
"expected_score": 8,
"passed": true
}
]
}
]

View File

@@ -1,144 +0,0 @@
import pytest
from crewai import Agent, Crew, Process, Task
from crewai_tools import SerperDevTool
from crewai.experimental.evaluation.testing import (
assert_experiment_successfully,
run_experiment,
)
@pytest.fixture
def financial_analysis_crew():
search_tool = SerperDevTool()
data_researcher = Agent(
role="Financial Data Researcher",
goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
backstory=(
"You are a precision-focused financial analyst who uses multiple targeted searches "
"to cross-verify data and ensure comprehensive coverage. You leverage different "
"search approaches to gather financial information from various authoritative sources."
),
tools=[search_tool],
)
financial_analyst = Agent(
role="Financial Analyst",
goal="Analyze financial data to assess company performance and outlook",
backstory=(
"You are a seasoned financial analyst with expertise in evaluating company "
"performance through quantitative analysis. You can interpret financial statements, "
"identify trends, and make reasoned assessments of a company's financial health."
),
tools=[search_tool],
)
report_writer = Agent(
role="Financial Report Writer",
goal="Synthesize financial analysis into clear, actionable reports",
backstory=(
"You are an experienced financial writer who excels at turning complex financial "
"analyses into clear, concise, and impactful reports. You know how to highlight "
"key insights and present information in a way that's accessible to various audiences."
),
tools=[],
)
research_task = Task(
description=(
"Research {company} financial data using multiple targeted search queries:\n\n"
"**Search Strategy - Execute these searches sequentially:**\n"
"1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
"2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
"3. '{company} revenue growth year over year earnings growth rate'\n"
"4. '{company} recent financial news SEC filings analyst reports'\n"
"5. '{company} stock performance market cap valuation 2024 2025'\n\n"
"**Data Collection Guidelines:**\n"
"- Use multiple search queries to cross-verify financial figures\n"
"- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
"- Compare data across different financial platforms for accuracy\n"
"- Present findings in the exact format specified in expected_output."
),
expected_output=(
"Financial data summary in this structure:\n\n"
"## Company Financial Overview\n"
"**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
"**Latest Quarter:** [Period]\n"
"- Revenue: $X (YoY: +/-X%) [Source verification]\n"
"- Net Income: $X (YoY: +/-X%) [Source verification]\n"
"- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
"**Key Metrics:**\n"
"- P/E Ratio: X [Current vs Historical]\n"
"- Profit Margin: X% [Trend indicator]\n"
"- Debt-to-Equity: X [Industry comparison]\n\n"
"**Growth Analysis:**\n"
"- Revenue Growth: X% (3-year trend)\n"
"- Earnings Growth: X% (consistency check)\n\n"
"**Material Developments:** [1-2 key items with impact assessment]\n"
"**Data Confidence:** [High/Medium/Low based on source consistency]"
),
agent=data_researcher,
)
analysis_task = Task(
description=(
"Analyze the collected financial data to assess the company's performance and outlook. "
"Include the following in your analysis:\n"
"1. Evaluation of financial health based on key metrics\n"
"2. Trend analysis showing growth or decline patterns\n"
"3. Comparison with industry benchmarks or competitors\n"
"4. Identification of strengths and potential areas of concern\n"
"5. Short-term financial outlook based on current trends"
),
expected_output=(
"A detailed financial analysis that includes assessment of key metrics, trends, "
"comparative analysis, and a reasoned outlook for the company's financial future."
),
agent=financial_analyst,
context=[research_task],
)
report_task = Task(
description=(
"Create a professional financial report based on the research and analysis. "
"The report should:\n"
"1. Begin with an executive summary highlighting key findings\n"
"2. Present the financial analysis in a clear, logical structure\n"
"3. Include visual representations of key data points (described textually)\n"
"4. Provide actionable insights for potential investors\n"
"5. Conclude with a clear investment recommendation (buy, hold, or sell)"
),
expected_output=(
"A professional, comprehensive financial report with executive summary, "
"structured analysis, visual elements, actionable insights, and a clear recommendation."
),
agent=report_writer,
context=[research_task, analysis_task],
)
crew = Crew(
agents=[data_researcher, financial_analyst, report_writer],
tasks=[research_task, analysis_task, report_task],
process=Process.sequential,
)
return crew
def test_financial_analysis_regression(financial_analysis_crew):
dataset = [
{
"inputs": {"company": "Apple Inc. (AAPL)"},
"expected_score": {"goal_alignment": 8},
},
{
"identifier": "test_2",
"inputs": {"company": "Microsoft Corporation (MSFT)"},
"expected_score": 8,
},
]
results = run_experiment(dataset=dataset, crew=financial_analysis_crew, verbose=True)
assert_experiment_successfully(results)

View File

@@ -1,33 +0,0 @@
import pytest
from crewai import Agent
from crewai_tools import SerperDevTool
from crewai.experimental.evaluation.testing import (
assert_experiment_successfully,
run_experiment,
)
@pytest.fixture
def history_teacher():
search_tool = SerperDevTool()
return Agent(
role="History Educator",
goal="Teach students about important historical events with clarity and context",
backstory=(
"As a renowned historian and educator, you have spent decades studying world history, "
"from ancient civilizations to modern events. You are passionate about making history "
"engaging and understandable for learners of all ages. Your mission is to educate, explain, "
"and spark curiosity about the past."
),
tools=[search_tool],
verbose=True,
)
def test_history_teacher(history_teacher):
dataset = [
{"inputs": {"messages": "How was the Battle of Waterloo?"}, "expected_score": 8}
]
results = run_experiment(
dataset=dataset, agents=[history_teacher], verbose=True
)
assert_experiment_successfully(results)