feat: add regression tests and configure their workflow

2026-01-28 17:48:13 +00:00 · 2025-07-15 18:45:26 -03:00
7 changed files with 343 additions and 3 deletions
--- a/.github/workflows/regression-tests.yml
+++ b/.github/workflows/regression-tests.yml
@@ -0,0 +1,75 @@
 # Manually dispatched workflow that runs the suite under tests/regression
 # against a branch chosen at dispatch time.
 name: Regression Tests

 on:
   workflow_dispatch:
     inputs:
       branch:
         description: 'Branch to run tests on'
         required: true
         default: 'main'
         type: string

 # No step below pushes or otherwise writes to the repository, so the token is
 # limited to read access (least privilege).
 permissions:
   contents: read

 env:
   # Placeholder credential; pytest is invoked with --block-network below.
   OPENAI_API_KEY: fake-api-key
   PYTHONUNBUFFERED: 1

 jobs:
   regression-tests:
     name: Regression - ${{ github.event.inputs.branch }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
           ref: ${{ github.event.inputs.branch }}
           fetch-depth: 0

       - name: Display execution info
         # The branch name is user input. It is handed to the shell through an
         # environment variable rather than being interpolated into the script
         # text, where a crafted value would be executed as shell code.
         env:
           TARGET_BRANCH: ${{ github.event.inputs.branch }}
         run: |
           echo "🚀 Running Regression Tests"
           echo "📂 Branch: ${TARGET_BRANCH}"
           echo "📊 Current commit: $(git rev-parse --short HEAD)"

       - name: Install uv
         uses: astral-sh/setup-uv@v3
         with:
           enable-cache: true
           cache-dependency-glob: |
             **/pyproject.toml
             **/uv.lock

       - name: Set up Python 3.13
         run: uv python install 3.13

       - name: Install the project
         run: uv sync --dev --all-extras

       - name: Install SQLite with FTS5 support
         run: |
           # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
           # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
           # module is compiled with FTS5 support by default.
           # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
           # Install pysqlite3-binary which has FTS5 support
           uv pip install pysqlite3-binary
           # Create a sitecustomize.py to override sqlite3 with pysqlite3
           mkdir -p .pytest_sqlite_override
           echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
           # Test FTS5 availability
           PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
           PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"

       - name: Run Regression Tests
         # PYTHONPATH puts the sitecustomize.py written in the previous step on
         # the import path so the sqlite3 override is active for the test run.
         run: |
           PYTHONPATH=.pytest_sqlite_override uv run pytest \
             --block-network \
             --timeout=30 \
             -vv \
             --durations=10 \
             -n auto \
             --maxfail=5 \
             tests/regression
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -137,3 +137,6 @@ exclude = [
 "docs/**",
 "docs/",
 ]
 [tool.pytest.ini_options]
 # Keep the regression suite out of default test collection; it is run on its
 # own by passing the directory explicitly (`pytest tests/regression`).
 # Setting norecursedirs replaces pytest's built-in patterns rather than
 # extending them, so those defaults are restated here ahead of the new entry.
 norecursedirs = [
     "*.egg",
     ".*",
     "_darcs",
     "build",
     "CVS",
     "dist",
     "node_modules",
     "venv",
     "{arch}",
     "tests/regression",
 ]
--- a/src/crewai/experimental/evaluation/testing.py
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -1,4 +1,5 @@
 import inspect
 from pathlib import Path
 from typing_extensions import Any
 import warnings
@@ -41,12 +42,30 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
    return runner.run(agents=agents, crew=crew, print_summary=verbose)
 def _get_baseline_filepath_fallback() -> str:
     """Derive a default baseline-results path from the call stack.

     The frame two levels above this helper (the caller's caller, presumably
     the test function that invoked the public entry point) supplies both the
     file name and the location:

     * file name: ``<that function's name>.json``
     * directory: ``results/`` next to the source file defining that function;
       the directory is created if it does not exist yet.

     Returns:
         The full path ``<caller dir>/results/<function>.json`` when it can be
         determined and the directory is usable. Otherwise a bare file name:
         ``<function>.json`` when the frame was found but the directory could
         not be prepared, or ``experiment_fallback.json`` when the frame itself
         is unavailable.

     This helper is best-effort and never raises for stack-inspection or
     directory-creation problems.
     """
     filename = "experiment_fallback.json"

     current_frame = inspect.currentframe()
     try:
         # f_back is whoever called this helper; f_back.f_back is that caller's
         # caller. Any missing link (including currentframe() returning None)
         # shows up as AttributeError on None.
         caller_code = current_frame.f_back.f_back.f_code  # type: ignore[union-attr]
     except AttributeError:
         return filename
     finally:
         # Release the frame promptly so it cannot form a reference cycle.
         del current_frame

     filename = f"{caller_code.co_name}.json"
     calling_file = caller_code.co_filename
     if not calling_file:
         return filename

     baseline_dir = Path(calling_file).parent / "results"
     try:
         baseline_dir.mkdir(parents=True, exist_ok=True)
     except (OSError, ValueError):
         # Read-only tree, "results" existing as a regular file, an invalid
         # path, etc.: fall back to a bare file name instead of failing.
         return filename
     return str(baseline_dir / filename)
--- a/tests/regression/results/test_financial_analysis_regression.json
+++ b/tests/regression/results/test_financial_analysis_regression.json
@@ -0,0 +1,42 @@
 [
  {
    "timestamp": "2025-07-15T21:34:08.253410+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "72239c22b0cdde98ad5c588074ef6325",
        "inputs": {
          "company": "Apple Inc. (AAPL)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 9.0,
          "tool_selection": 6.0,
          "parameter_extraction": 5.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 7.300000000000001
        },
        "expected_score": {
          "goal_alignment": 8
        },
        "passed": true
      },
      {
        "identifier": "test_2",
        "inputs": {
          "company": "Microsoft Corporation (MSFT)"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 7.333333333333333,
          "tool_selection": 6.25,
          "parameter_extraction": 9.5,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 6.0
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
 ]
--- a/tests/regression/results/test_history_teacher.json
+++ b/tests/regression/results/test_history_teacher.json
@@ -0,0 +1,24 @@
 [
  {
    "timestamp": "2025-07-15T21:31:05.916161+00:00",
    "metadata": {},
    "results": [
      {
        "identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
        "inputs": {
          "messages": "How was the Battle of Waterloo?"
        },
        "score": {
          "goal_alignment": 10.0,
          "semantic_quality": 10.0,
          "tool_selection": 10.0,
          "parameter_extraction": 10.0,
          "tool_invocation": 10.0,
          "reasoning_efficiency": 5.5
        },
        "expected_score": 8,
        "passed": true
      }
    ]
  }
 ]
--- a/tests/regression/test_financial_analysis.py
+++ b/tests/regression/test_financial_analysis.py
@@ -0,0 +1,144 @@
 import pytest
 from crewai import Agent, Crew, Process, Task
 from crewai_tools import SerperDevTool
 from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
 )
@pytest.fixture
def financial_analysis_crew():
    """Assemble the sequential research -> analysis -> report crew.

    Three agents are created (researcher and analyst each get a
    ``SerperDevTool``; the writer has no tools) along with one task per agent.
    The analysis task takes the research task as context, and the report task
    takes both earlier tasks as context.
    """
    serper = SerperDevTool()

    researcher_backstory = (
        "You are a precision-focused financial analyst who uses multiple targeted searches "
        "to cross-verify data and ensure comprehensive coverage. You leverage different "
        "search approaches to gather financial information from various authoritative sources."
    )
    researcher = Agent(
        role="Financial Data Researcher",
        goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
        backstory=researcher_backstory,
        tools=[serper],
    )

    analyst_backstory = (
        "You are a seasoned financial analyst with expertise in evaluating company "
        "performance through quantitative analysis. You can interpret financial statements, "
        "identify trends, and make reasoned assessments of a company's financial health."
    )
    analyst = Agent(
        role="Financial Analyst",
        goal="Analyze financial data to assess company performance and outlook",
        backstory=analyst_backstory,
        tools=[serper],
    )

    writer_backstory = (
        "You are an experienced financial writer who excels at turning complex financial "
        "analyses into clear, concise, and impactful reports. You know how to highlight "
        "key insights and present information in a way that's accessible to various audiences."
    )
    writer = Agent(
        role="Financial Report Writer",
        goal="Synthesize financial analysis into clear, actionable reports",
        backstory=writer_backstory,
        tools=[],
    )

    # --- Task 1: gather the raw figures -------------------------------------
    research_brief = (
        "Research {company} financial data using multiple targeted search queries:\n\n"
        "**Search Strategy - Execute these searches sequentially:**\n"
        "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
        "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
        "3. '{company} revenue growth year over year earnings growth rate'\n"
        "4. '{company} recent financial news SEC filings analyst reports'\n"
        "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
        "**Data Collection Guidelines:**\n"
        "- Use multiple search queries to cross-verify financial figures\n"
        "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
        "- Compare data across different financial platforms for accuracy\n"
        "- Present findings in the exact format specified in expected_output."
    )
    research_deliverable = (
        "Financial data summary in this structure:\n\n"
        "## Company Financial Overview\n"
        "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
        "**Latest Quarter:** [Period]\n"
        "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
        "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
        "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
        "**Key Metrics:**\n"
        "- P/E Ratio: X [Current vs Historical]\n"
        "- Profit Margin: X% [Trend indicator]\n"
        "- Debt-to-Equity: X [Industry comparison]\n\n"
        "**Growth Analysis:**\n"
        "- Revenue Growth: X% (3-year trend)\n"
        "- Earnings Growth: X% (consistency check)\n\n"
        "**Material Developments:** [1-2 key items with impact assessment]\n"
        "**Data Confidence:** [High/Medium/Low based on source consistency]"
    )
    gather_data = Task(
        description=research_brief,
        expected_output=research_deliverable,
        agent=researcher,
    )

    # --- Task 2: interpret the figures --------------------------------------
    analysis_brief = (
        "Analyze the collected financial data to assess the company's performance and outlook. "
        "Include the following in your analysis:\n"
        "1. Evaluation of financial health based on key metrics\n"
        "2. Trend analysis showing growth or decline patterns\n"
        "3. Comparison with industry benchmarks or competitors\n"
        "4. Identification of strengths and potential areas of concern\n"
        "5. Short-term financial outlook based on current trends"
    )
    analysis_deliverable = (
        "A detailed financial analysis that includes assessment of key metrics, trends, "
        "comparative analysis, and a reasoned outlook for the company's financial future."
    )
    analyze_data = Task(
        description=analysis_brief,
        expected_output=analysis_deliverable,
        agent=analyst,
        context=[gather_data],
    )

    # --- Task 3: write it up ------------------------------------------------
    report_brief = (
        "Create a professional financial report based on the research and analysis. "
        "The report should:\n"
        "1. Begin with an executive summary highlighting key findings\n"
        "2. Present the financial analysis in a clear, logical structure\n"
        "3. Include visual representations of key data points (described textually)\n"
        "4. Provide actionable insights for potential investors\n"
        "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
    )
    report_deliverable = (
        "A professional, comprehensive financial report with executive summary, "
        "structured analysis, visual elements, actionable insights, and a clear recommendation."
    )
    write_report = Task(
        description=report_brief,
        expected_output=report_deliverable,
        agent=writer,
        context=[gather_data, analyze_data],
    )

    return Crew(
        agents=[researcher, analyst, writer],
        tasks=[gather_data, analyze_data, write_report],
        process=Process.sequential,
    )
 def test_financial_analysis_regression(financial_analysis_crew):
     """Run the financial-analysis crew over two companies and check the result.

     The first case sets a per-metric expectation (``goal_alignment``); the
     second carries an explicit identifier and a single overall expected score.
     """
     apple_case = {
         "inputs": {"company": "Apple Inc. (AAPL)"},
         "expected_score": {"goal_alignment": 8},
     }
     microsoft_case = {
         "identifier": "test_2",
         "inputs": {"company": "Microsoft Corporation (MSFT)"},
         "expected_score": 8,
     }

     experiment_results = run_experiment(
         dataset=[apple_case, microsoft_case],
         crew=financial_analysis_crew,
         verbose=True,
     )

     assert_experiment_successfully(experiment_results)
--- a/tests/regression/test_history_teacher.py
+++ b/tests/regression/test_history_teacher.py
@@ -0,0 +1,33 @@
 import pytest
 from crewai import Agent
 from crewai_tools import SerperDevTool
 from crewai.experimental.evaluation.testing import (
    assert_experiment_successfully,
    run_experiment,
 )
@pytest.fixture
def history_teacher():
    """Provide a verbose "History Educator" agent equipped with a search tool."""
    educator_backstory = (
        "As a renowned historian and educator, you have spent decades studying world history, "
        "from ancient civilizations to modern events. You are passionate about making history "
        "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
        "and spark curiosity about the past."
    )
    educator = Agent(
        role="History Educator",
        goal="Teach students about important historical events with clarity and context",
        backstory=educator_backstory,
        tools=[SerperDevTool()],
        verbose=True,
    )
    return educator
 def test_history_teacher(history_teacher):
     """Run the history-teacher agent on a single question and check the result."""
     waterloo_case = {
         "inputs": {"messages": "How was the Battle of Waterloo?"},
         "expected_score": 8,
     }

     experiment_results = run_experiment(
         dataset=[waterloo_case],
         agents=[history_teacher],
         verbose=True,
     )

     assert_experiment_successfully(experiment_results)