Compare commits

...

1 Commits

Author SHA1 Message Date
Lucas Gomide
cbe570088e feat: add regression tests and configure its workflow 2025-07-15 18:45:26 -03:00
7 changed files with 343 additions and 3 deletions

75
.github/workflows/regression-tests.yml vendored Normal file
View File

@@ -0,0 +1,75 @@
# Manually triggered workflow that runs the regression suite (tests/regression)
# against a chosen branch.
name: Regression Tests

on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'Branch to run tests on'
        required: true
        default: 'main'
        type: string

# Least privilege: the job only checks out code and runs tests; no step pushes
# commits, tags or releases, so read access to repository contents is enough.
permissions:
  contents: read

env:
  # Placeholder key only; pytest runs with --block-network, so no real API calls are made.
  OPENAI_API_KEY: fake-api-key
  PYTHONUNBUFFERED: 1

jobs:
  regression-tests:
    name: Regression - ${{ github.event.inputs.branch }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.branch }}
          fetch-depth: 0

      - name: Display execution info
        env:
          # Pass the user-supplied input through an environment variable instead of
          # expanding it inside the script text, so a crafted branch name cannot
          # inject shell commands.
          TARGET_BRANCH: ${{ github.event.inputs.branch }}
        run: |
          echo "🚀 Running Regression Tests"
          echo "📂 Branch: $TARGET_BRANCH"
          echo "📊 Current commit: $(git rev-parse --short HEAD)"

      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
          cache-dependency-glob: |
            **/pyproject.toml
            **/uv.lock

      - name: Set up Python 3.13
        run: uv python install 3.13

      - name: Install the project
        run: uv sync --dev --all-extras

      - name: Install SQLite with FTS5 support
        run: |
          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
          # module is compiled with FTS5 support by default.
          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
          # Install pysqlite3-binary which has FTS5 support
          uv pip install pysqlite3-binary
          # Create a sitecustomize.py to override sqlite3 with pysqlite3
          mkdir -p .pytest_sqlite_override
          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
          # Test FTS5 availability
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"

      - name: Run Regression Tests
        run: |
          PYTHONPATH=.pytest_sqlite_override uv run pytest \
            --block-network \
            --timeout=30 \
            -vv \
            --durations=10 \
            -n auto \
            --maxfail=5 \
            tests/regression

View File

@@ -137,3 +137,6 @@ exclude = [
"docs/**",
"docs/",
]
# Keep the regression suite out of pytest's normal directory recursion; the
# "Regression Tests" workflow runs it on demand by passing tests/regression explicitly.
# NOTE(review): setting norecursedirs replaces pytest's built-in default patterns
# (e.g. ".*", "build", "dist", "venv") rather than extending them — confirm that is intended.
[tool.pytest.ini_options]
norecursedirs = ["tests/regression"]

View File

@@ -1,4 +1,5 @@
import inspect
from pathlib import Path
from typing_extensions import Any
import warnings
@@ -41,12 +42,30 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
return runner.run(agents=agents, crew=crew, print_summary=verbose)
def _get_baseline_filepath_fallback() -> str:
    """Derive a default baseline-results path from the code that called our caller.

    The frame two levels above this function (the caller of whichever function
    invoked this helper) supplies both the file name and the directory:
    ``<directory of that frame's source file>/results/<that function's name>.json``.
    The ``results`` directory is created if it does not exist yet.

    Returns:
        The full path described above as a string. If the ``results`` directory
        cannot be created, only the bare ``<function name>.json`` is returned.
        If the calling frame cannot be resolved at all, the constant
        ``"experiment_fallback.json"`` is returned.
    """
    filename = "experiment_fallback.json"

    code = None
    current_frame = inspect.currentframe()
    try:
        # Walk up explicitly instead of relying on a broad ``except`` to absorb
        # the AttributeError raised when a frame in the chain is missing.
        caller_frame = current_frame.f_back if current_frame is not None else None
        origin_frame = caller_frame.f_back if caller_frame is not None else None
        if origin_frame is not None:
            code = origin_frame.f_code
    finally:
        # A frame stored in its own locals forms a reference cycle; drop it promptly.
        del current_frame

    if code is None:
        return filename

    filename = f"{code.co_name}.json"
    calling_file = code.co_filename
    if not calling_file:
        return filename

    baseline_dir = Path(calling_file).parent / "results"
    try:
        baseline_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, ValueError):
        # Best-effort helper: an unwritable or invalid location must not abort
        # the caller, so degrade to a path relative to the working directory.
        return filename

    return str(baseline_dir / filename)

View File

@@ -0,0 +1,42 @@
[
{
"timestamp": "2025-07-15T21:34:08.253410+00:00",
"metadata": {},
"results": [
{
"identifier": "72239c22b0cdde98ad5c588074ef6325",
"inputs": {
"company": "Apple Inc. (AAPL)"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 9.0,
"tool_selection": 6.0,
"parameter_extraction": 5.0,
"tool_invocation": 10.0,
"reasoning_efficiency": 7.300000000000001
},
"expected_score": {
"goal_alignment": 8
},
"passed": true
},
{
"identifier": "test_2",
"inputs": {
"company": "Microsoft Corporation (MSFT)"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 7.333333333333333,
"tool_selection": 6.25,
"parameter_extraction": 9.5,
"tool_invocation": 10.0,
"reasoning_efficiency": 6.0
},
"expected_score": 8,
"passed": true
}
]
}
]

View File

@@ -0,0 +1,24 @@
[
{
"timestamp": "2025-07-15T21:31:05.916161+00:00",
"metadata": {},
"results": [
{
"identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
"inputs": {
"messages": "How was the Battle of Waterloo?"
},
"score": {
"goal_alignment": 10.0,
"semantic_quality": 10.0,
"tool_selection": 10.0,
"parameter_extraction": 10.0,
"tool_invocation": 10.0,
"reasoning_efficiency": 5.5
},
"expected_score": 8,
"passed": true
}
]
}
]

View File

@@ -0,0 +1,144 @@
import pytest
from crewai import Agent, Crew, Process, Task
from crewai_tools import SerperDevTool
from crewai.experimental.evaluation.testing import (
assert_experiment_successfully,
run_experiment,
)
@pytest.fixture
def financial_analysis_crew():
    """Assemble the sequential three-agent crew used by the financial regression test.

    The crew consists of a researcher and an analyst (each given a Serper search
    tool) plus a report writer with no tools. Its three tasks run in order, and
    each later task receives the earlier task(s) as context.
    """
    serper = SerperDevTool()

    # ---- Agents -----------------------------------------------------------
    researcher_backstory = (
        "You are a precision-focused financial analyst who uses multiple targeted searches "
        "to cross-verify data and ensure comprehensive coverage. You leverage different "
        "search approaches to gather financial information from various authoritative sources."
    )
    researcher = Agent(
        role="Financial Data Researcher",
        goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
        backstory=researcher_backstory,
        tools=[serper],
    )

    analyst_backstory = (
        "You are a seasoned financial analyst with expertise in evaluating company "
        "performance through quantitative analysis. You can interpret financial statements, "
        "identify trends, and make reasoned assessments of a company's financial health."
    )
    analyst = Agent(
        role="Financial Analyst",
        goal="Analyze financial data to assess company performance and outlook",
        backstory=analyst_backstory,
        tools=[serper],
    )

    writer_backstory = (
        "You are an experienced financial writer who excels at turning complex financial "
        "analyses into clear, concise, and impactful reports. You know how to highlight "
        "key insights and present information in a way that's accessible to various audiences."
    )
    writer = Agent(
        role="Financial Report Writer",
        goal="Synthesize financial analysis into clear, actionable reports",
        backstory=writer_backstory,
        tools=[],
    )

    # ---- Tasks ------------------------------------------------------------
    # "{company}" is a placeholder filled from each dataset entry's inputs.
    research_brief = (
        "Research {company} financial data using multiple targeted search queries:\n\n"
        "**Search Strategy - Execute these searches sequentially:**\n"
        "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
        "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
        "3. '{company} revenue growth year over year earnings growth rate'\n"
        "4. '{company} recent financial news SEC filings analyst reports'\n"
        "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
        "**Data Collection Guidelines:**\n"
        "- Use multiple search queries to cross-verify financial figures\n"
        "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
        "- Compare data across different financial platforms for accuracy\n"
        "- Present findings in the exact format specified in expected_output."
    )
    research_deliverable = (
        "Financial data summary in this structure:\n\n"
        "## Company Financial Overview\n"
        "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
        "**Latest Quarter:** [Period]\n"
        "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
        "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
        "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
        "**Key Metrics:**\n"
        "- P/E Ratio: X [Current vs Historical]\n"
        "- Profit Margin: X% [Trend indicator]\n"
        "- Debt-to-Equity: X [Industry comparison]\n\n"
        "**Growth Analysis:**\n"
        "- Revenue Growth: X% (3-year trend)\n"
        "- Earnings Growth: X% (consistency check)\n\n"
        "**Material Developments:** [1-2 key items with impact assessment]\n"
        "**Data Confidence:** [High/Medium/Low based on source consistency]"
    )
    research = Task(
        description=research_brief,
        expected_output=research_deliverable,
        agent=researcher,
    )

    analysis_brief = (
        "Analyze the collected financial data to assess the company's performance and outlook. "
        "Include the following in your analysis:\n"
        "1. Evaluation of financial health based on key metrics\n"
        "2. Trend analysis showing growth or decline patterns\n"
        "3. Comparison with industry benchmarks or competitors\n"
        "4. Identification of strengths and potential areas of concern\n"
        "5. Short-term financial outlook based on current trends"
    )
    analysis_deliverable = (
        "A detailed financial analysis that includes assessment of key metrics, trends, "
        "comparative analysis, and a reasoned outlook for the company's financial future."
    )
    analysis = Task(
        description=analysis_brief,
        expected_output=analysis_deliverable,
        agent=analyst,
        context=[research],
    )

    report_brief = (
        "Create a professional financial report based on the research and analysis. "
        "The report should:\n"
        "1. Begin with an executive summary highlighting key findings\n"
        "2. Present the financial analysis in a clear, logical structure\n"
        "3. Include visual representations of key data points (described textually)\n"
        "4. Provide actionable insights for potential investors\n"
        "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
    )
    report_deliverable = (
        "A professional, comprehensive financial report with executive summary, "
        "structured analysis, visual elements, actionable insights, and a clear recommendation."
    )
    report = Task(
        description=report_brief,
        expected_output=report_deliverable,
        agent=writer,
        context=[research, analysis],
    )

    return Crew(
        agents=[researcher, analyst, writer],
        tasks=[research, analysis, report],
        process=Process.sequential,
    )
def test_financial_analysis_regression(financial_analysis_crew):
    """Run the financial crew over two companies and require the experiment to succeed.

    The first case supplies a per-metric expected score and no identifier; the
    second supplies an explicit identifier and a single scalar expected score.
    """
    apple_case = {
        "inputs": {"company": "Apple Inc. (AAPL)"},
        "expected_score": {"goal_alignment": 8},
    }
    microsoft_case = {
        "identifier": "test_2",
        "inputs": {"company": "Microsoft Corporation (MSFT)"},
        "expected_score": 8,
    }

    outcome = run_experiment(
        dataset=[apple_case, microsoft_case],
        crew=financial_analysis_crew,
        verbose=True,
    )
    # NOTE(review): keep this call directly in the test body — the baseline lookup
    # appears to be keyed on the calling function's name and file; confirm before
    # wrapping it in a helper.
    assert_experiment_successfully(outcome)

View File

@@ -0,0 +1,33 @@
import pytest
from crewai import Agent
from crewai_tools import SerperDevTool
from crewai.experimental.evaluation.testing import (
assert_experiment_successfully,
run_experiment,
)
@pytest.fixture
def history_teacher():
    """Provide a verbose "History Educator" agent equipped with a Serper search tool."""
    educator_backstory = (
        "As a renowned historian and educator, you have spent decades studying world history, "
        "from ancient civilizations to modern events. You are passionate about making history "
        "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
        "and spark curiosity about the past."
    )
    educator = Agent(
        role="History Educator",
        goal="Teach students about important historical events with clarity and context",
        backstory=educator_backstory,
        tools=[SerperDevTool()],
        verbose=True,
    )
    return educator
def test_history_teacher(history_teacher):
    """Ask the history agent a single question and require the experiment to succeed."""
    waterloo_case = {
        "inputs": {"messages": "How was the Battle of Waterloo?"},
        "expected_score": 8,
    }
    outcome = run_experiment(
        dataset=[waterloo_case],
        agents=[history_teacher],
        verbose=True,
    )
    # NOTE(review): keep this call directly in the test body — the baseline lookup
    # appears to be keyed on the calling function's name and file; confirm before
    # wrapping it in a helper.
    assert_experiment_successfully(outcome)