feat: add regression tests and configure their workflow

2025-12-16 12:28:30 +00:00 · 2025-07-15 18:45:26 -03:00
7 changed files with 343 additions and 3 deletions
--- a/.github/workflows/regression-tests.yml
+++ b/.github/workflows/regression-tests.yml
@@ -0,0 +1,75 @@
+# Manually triggered workflow that runs the suite under tests/regression
+# against a branch chosen at dispatch time.
+name: Regression Tests
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch to run tests on'
+        required: true
+        default: 'main'
+        type: string
+
+# Least privilege: the job only checks out code and runs tests. No step pushes
+# commits, tags or releases, so read access to the repository is sufficient.
+permissions:
+  contents: read
+
+env:
+  # Placeholder value; the pytest invocation below runs with --block-network.
+  OPENAI_API_KEY: fake-api-key
+  PYTHONUNBUFFERED: 1
+
+jobs:
+  regression-tests:
+    name: Regression - ${{ github.event.inputs.branch }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch }}
+          fetch-depth: 0
+
+      - name: Display execution info
+        env:
+          # Hand the user-supplied input to the script through the environment
+          # instead of interpolating it into the shell source, so a crafted
+          # branch name cannot inject commands.
+          TARGET_BRANCH: ${{ github.event.inputs.branch }}
+        run: |
+          echo "🚀 Running Regression Tests"
+          echo "📂 Branch: ${TARGET_BRANCH}"
+          echo "📊 Current commit: $(git rev-parse --short HEAD)"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            **/pyproject.toml
+            **/uv.lock
+
+      - name: Set up Python 3.13
+        run: uv python install 3.13
+
+      - name: Install the project
+        run: uv sync --dev --all-extras
+
+      - name: Install SQLite with FTS5 support
+        run: |
+          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
+          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
+          # module is compiled with FTS5 support by default.
+          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
+
+          # Install pysqlite3-binary which has FTS5 support
+          uv pip install pysqlite3-binary
+          # Create a sitecustomize.py to override sqlite3 with pysqlite3
+          mkdir -p .pytest_sqlite_override
+          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
+          # Test FTS5 availability
+          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
+          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"
+
+      - name: Run Regression Tests
+        # PYTHONPATH puts the sitecustomize.py created above on the import path
+        # so the test process uses the FTS5-enabled sqlite3 replacement.
+        run: |
+          PYTHONPATH=.pytest_sqlite_override uv run pytest \
+            --block-network \
+            --timeout=30 \
+            -vv \
+            --durations=10 \
+            -n auto \
+            --maxfail=5 \
+            tests/regression
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -137,3 +137,6 @@ exclude = [
 "docs/**",
 "docs/",
 ]
+
+[tool.pytest.ini_options]
+# Setting norecursedirs replaces pytest's built-in default patterns, so they are
+# restated here ahead of the project-specific entry. The regression suite is
+# excluded from normal discovery and runs only when its path is passed
+# explicitly (e.g. `pytest tests/regression`).
+norecursedirs = [
+    "*.egg",
+    ".*",
+    "_darcs",
+    "build",
+    "CVS",
+    "dist",
+    "node_modules",
+    "venv",
+    "{arch}",
+    "tests/regression",
+]
--- a/src/crewai/experimental/evaluation/testing.py
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -1,4 +1,5 @@
 import inspect
+from pathlib import Path

 from typing_extensions import Any
 import warnings
@@ -41,12 +42,30 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
    return runner.run(agents=agents, crew=crew, print_summary=verbose)

 def _get_baseline_filepath_fallback() -> str:
-    test_func_name = "experiment_fallback"
+    """Return a default baseline file path derived from the call stack.
+
+    The function two frames above this one (the caller of this function's
+    caller) supplies both the file name, ``<function name>.json``, and the
+    location: a ``results`` directory next to the source file that defines
+    that function. The directory is created if it does not exist.
+
+    Returns:
+        The baseline path as a string. If the stack cannot be inspected or the
+        ``results`` directory cannot be created, only the bare file name is
+        returned (``experiment_fallback.json`` when no function name could be
+        determined).
+    """
+    filename = "experiment_fallback.json"
+    calling_file = None

    try:
        current_frame = inspect.currentframe()
        if current_frame is not None:
+            # Two frames up: the caller of whichever helper invoked this function.
            test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
+            filename = f"{test_func_name}.json"
+            calling_file = current_frame.f_back.f_back.f_code.co_filename # type: ignore[union-attr]
    except Exception:
-        ...
-    return f"{test_func_name}_results.json"
+        return filename
+
+    if not calling_file:
+        return filename
+
+    # Baselines live in a "results" directory next to the calling module.
+    baseline_dir = Path(calling_file).parent / "results"
+    try:
+        baseline_dir.mkdir(parents=True, exist_ok=True)
+    except (OSError, ValueError):
+        # The directory cannot be created (read-only checkout, a regular file
+        # named "results" already exists, malformed path, ...). This helper is
+        # a best-effort fallback, so degrade to the bare file name instead of
+        # letting the error escape.
+        return filename
+
+    return str(baseline_dir / filename)
--- a/tests/regression/results/test_financial_analysis_regression.json
+++ b/tests/regression/results/test_financial_analysis_regression.json
@@ -0,0 +1,42 @@
+[
+  {
+    "timestamp": "2025-07-15T21:34:08.253410+00:00",
+    "metadata": {},
+    "results": [
+      {
+        "identifier": "72239c22b0cdde98ad5c588074ef6325",
+        "inputs": {
+          "company": "Apple Inc. (AAPL)"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 9.0,
+          "tool_selection": 6.0,
+          "parameter_extraction": 5.0,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 7.300000000000001
+        },
+        "expected_score": {
+          "goal_alignment": 8
+        },
+        "passed": true
+      },
+      {
+        "identifier": "test_2",
+        "inputs": {
+          "company": "Microsoft Corporation (MSFT)"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 7.333333333333333,
+          "tool_selection": 6.25,
+          "parameter_extraction": 9.5,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 6.0
+        },
+        "expected_score": 8,
+        "passed": true
+      }
+    ]
+  }
+]
--- a/tests/regression/results/test_history_teacher.json
+++ b/tests/regression/results/test_history_teacher.json
@@ -0,0 +1,24 @@
+[
+  {
+    "timestamp": "2025-07-15T21:31:05.916161+00:00",
+    "metadata": {},
+    "results": [
+      {
+        "identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
+        "inputs": {
+          "messages": "How was the Battle of Waterloo?"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 10.0,
+          "tool_selection": 10.0,
+          "parameter_extraction": 10.0,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 5.5
+        },
+        "expected_score": 8,
+        "passed": true
+      }
+    ]
+  }
+]
--- a/tests/regression/test_financial_analysis.py
+++ b/tests/regression/test_financial_analysis.py
@@ -0,0 +1,144 @@
+import pytest
+from crewai import Agent, Crew, Process, Task
+from crewai_tools import SerperDevTool
+
+from crewai.experimental.evaluation.testing import (
+    assert_experiment_successfully,
+    run_experiment,
+)
+
+
+@pytest.fixture
+def financial_analysis_crew():
+    """Assemble the three-agent crew exercised by the financial regression test.
+
+    The crew runs sequentially: a researcher gathers data with a
+    ``SerperDevTool``, an analyst interprets that research, and a writer turns
+    both earlier outputs into a report. Task descriptions contain a
+    ``{company}`` placeholder.
+    """
+    serper_search = SerperDevTool()
+
+    # --- Agents -----------------------------------------------------------
+    researcher = Agent(
+        role="Financial Data Researcher",
+        goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
+        backstory=(
+            "You are a precision-focused financial analyst who uses multiple targeted searches "
+            "to cross-verify data and ensure comprehensive coverage. You leverage different "
+            "search approaches to gather financial information from various authoritative sources."
+        ),
+        tools=[serper_search],
+    )
+
+    analyst = Agent(
+        role="Financial Analyst",
+        goal="Analyze financial data to assess company performance and outlook",
+        backstory=(
+            "You are a seasoned financial analyst with expertise in evaluating company "
+            "performance through quantitative analysis. You can interpret financial statements, "
+            "identify trends, and make reasoned assessments of a company's financial health."
+        ),
+        tools=[serper_search],
+    )
+
+    writer = Agent(
+        role="Financial Report Writer",
+        goal="Synthesize financial analysis into clear, actionable reports",
+        backstory=(
+            "You are an experienced financial writer who excels at turning complex financial "
+            "analyses into clear, concise, and impactful reports. You know how to highlight "
+            "key insights and present information in a way that's accessible to various audiences."
+        ),
+        tools=[],
+    )
+
+    # --- Tasks ------------------------------------------------------------
+    gather_data = Task(
+        description=(
+            "Research {company} financial data using multiple targeted search queries:\n\n"
+            "**Search Strategy - Execute these searches sequentially:**\n"
+            "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
+            "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
+            "3. '{company} revenue growth year over year earnings growth rate'\n"
+            "4. '{company} recent financial news SEC filings analyst reports'\n"
+            "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
+            "**Data Collection Guidelines:**\n"
+            "- Use multiple search queries to cross-verify financial figures\n"
+            "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
+            "- Compare data across different financial platforms for accuracy\n"
+            "- Present findings in the exact format specified in expected_output."
+        ),
+        expected_output=(
+            "Financial data summary in this structure:\n\n"
+            "## Company Financial Overview\n"
+            "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
+            "**Latest Quarter:** [Period]\n"
+            "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
+            "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
+            "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
+            "**Key Metrics:**\n"
+            "- P/E Ratio: X [Current vs Historical]\n"
+            "- Profit Margin: X% [Trend indicator]\n"
+            "- Debt-to-Equity: X [Industry comparison]\n\n"
+            "**Growth Analysis:**\n"
+            "- Revenue Growth: X% (3-year trend)\n"
+            "- Earnings Growth: X% (consistency check)\n\n"
+            "**Material Developments:** [1-2 key items with impact assessment]\n"
+            "**Data Confidence:** [High/Medium/Low based on source consistency]"
+        ),
+        agent=researcher,
+    )
+
+    # The analysis receives the research output as context.
+    analyze_data = Task(
+        description=(
+            "Analyze the collected financial data to assess the company's performance and outlook. "
+            "Include the following in your analysis:\n"
+            "1. Evaluation of financial health based on key metrics\n"
+            "2. Trend analysis showing growth or decline patterns\n"
+            "3. Comparison with industry benchmarks or competitors\n"
+            "4. Identification of strengths and potential areas of concern\n"
+            "5. Short-term financial outlook based on current trends"
+        ),
+        expected_output=(
+            "A detailed financial analysis that includes assessment of key metrics, trends, "
+            "comparative analysis, and a reasoned outlook for the company's financial future."
+        ),
+        agent=analyst,
+        context=[gather_data],
+    )
+
+    # The report receives both the research and the analysis as context.
+    write_report = Task(
+        description=(
+            "Create a professional financial report based on the research and analysis. "
+            "The report should:\n"
+            "1. Begin with an executive summary highlighting key findings\n"
+            "2. Present the financial analysis in a clear, logical structure\n"
+            "3. Include visual representations of key data points (described textually)\n"
+            "4. Provide actionable insights for potential investors\n"
+            "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
+        ),
+        expected_output=(
+            "A professional, comprehensive financial report with executive summary, "
+            "structured analysis, visual elements, actionable insights, and a clear recommendation."
+        ),
+        agent=writer,
+        context=[gather_data, analyze_data],
+    )
+
+    return Crew(
+        agents=[researcher, analyst, writer],
+        tasks=[gather_data, analyze_data, write_report],
+        process=Process.sequential,
+    )
+
+
+def test_financial_analysis_regression(financial_analysis_crew):
+    """Run the financial crew on two companies and assert the experiment succeeds.
+
+    The first case sets an expected score for ``goal_alignment`` only and has
+    no explicit identifier; the second is identified as ``test_2`` and uses a
+    single numeric expected score.
+    """
+    apple_case = {
+        "inputs": {"company": "Apple Inc. (AAPL)"},
+        "expected_score": {"goal_alignment": 8},
+    }
+    microsoft_case = {
+        "identifier": "test_2",
+        "inputs": {"company": "Microsoft Corporation (MSFT)"},
+        "expected_score": 8,
+    }
+
+    experiment_results = run_experiment(
+        dataset=[apple_case, microsoft_case],
+        crew=financial_analysis_crew,
+        verbose=True,
+    )
+
+    # NOTE(review): the stored baseline is named after this test function
+    # (results/test_financial_analysis_regression.json) and the default path is
+    # derived from the call stack, so presumably this call must stay directly
+    # in the test body rather than behind a helper - confirm before refactoring.
+    assert_experiment_successfully(experiment_results)
--- a/tests/regression/test_history_teacher.py
+++ b/tests/regression/test_history_teacher.py
@@ -0,0 +1,33 @@
+import pytest
+from crewai import Agent
+from crewai_tools import SerperDevTool
+
+from crewai.experimental.evaluation.testing import (
+    assert_experiment_successfully,
+    run_experiment,
+)
+
+@pytest.fixture
+def history_teacher():
+    """Build the history-educator agent used by the regression test.
+
+    The agent is given a ``SerperDevTool`` search tool and runs in verbose mode.
+    """
+    web_search = SerperDevTool()
+    educator_backstory = (
+        "As a renowned historian and educator, you have spent decades studying world history, "
+        "from ancient civilizations to modern events. You are passionate about making history "
+        "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
+        "and spark curiosity about the past."
+    )
+    return Agent(
+        role="History Educator",
+        goal="Teach students about important historical events with clarity and context",
+        backstory=educator_backstory,
+        tools=[web_search],
+        verbose=True,
+    )
+
+
+def test_history_teacher(history_teacher):
+    """Ask the history agent one question and assert the experiment succeeds."""
+    waterloo_case = {
+        "inputs": {"messages": "How was the Battle of Waterloo?"},
+        "expected_score": 8,
+    }
+    experiment_results = run_experiment(
+        dataset=[waterloo_case],
+        agents=[history_teacher],
+        verbose=True,
+    )
+
+    # NOTE(review): the stored baseline is named after this test function
+    # (results/test_history_teacher.json) and the default path is derived from
+    # the call stack, so presumably this call must stay directly in the test
+    # body rather than behind a helper - confirm before refactoring.
+    assert_experiment_successfully(experiment_results)