diff --git a/.github/workflows/regression-tests.yml b/.github/workflows/regression-tests.yml
new file mode 100644
index 000000000..736e3485f
--- /dev/null
+++ b/.github/workflows/regression-tests.yml
@@ -0,0 +1,80 @@
+name: Regression Tests
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch to run tests on'
+        required: true
+        default: 'main'
+        type: string
+
+permissions:
+  # Read-only is sufficient: the job only checks out the code and runs tests.
+  contents: read
+
+env:
+  OPENAI_API_KEY: fake-api-key
+  PYTHONUNBUFFERED: 1
+
+jobs:
+  regression-tests:
+    name: Regression - ${{ github.event.inputs.branch }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.inputs.branch }}
+          fetch-depth: 0
+
+      - name: Display execution info
+        env:
+          # Pass the input via the environment rather than interpolating it into
+          # the script, so a crafted branch name cannot inject shell commands.
+          BRANCH_NAME: ${{ github.event.inputs.branch }}
+        run: |
+          echo "🚀 Running Regression Tests"
+          echo "📂 Branch: ${BRANCH_NAME}"
+          echo "📊 Current commit: $(git rev-parse --short HEAD)"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            **/pyproject.toml
+            **/uv.lock
+
+      - name: Set up Python 3.13
+        run: uv python install 3.13
+
+      - name: Install the project
+        run: uv sync --dev --all-extras
+
+      - name: Install SQLite with FTS5 support
+        run: |
+          # WORKAROUND: GitHub Actions' Ubuntu runner uses SQLite without FTS5 support compiled in.
+          # This is a temporary fix until the runner includes SQLite with FTS5 or Python's sqlite3
+          # module is compiled with FTS5 support by default.
+          # TODO: Remove this workaround once GitHub Actions runners include SQLite FTS5 support
+
+          # Install pysqlite3-binary which has FTS5 support
+          uv pip install pysqlite3-binary
+          # Create a sitecustomize.py to override sqlite3 with pysqlite3
+          mkdir -p .pytest_sqlite_override
+          echo "import sys; import pysqlite3; sys.modules['sqlite3'] = pysqlite3" > .pytest_sqlite_override/sitecustomize.py
+          # Test FTS5 availability
+          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; print(f'SQLite version: {sqlite3.sqlite_version}')"
+          PYTHONPATH=.pytest_sqlite_override uv run python -c "import sqlite3; conn = sqlite3.connect(':memory:'); conn.execute('CREATE VIRTUAL TABLE test USING fts5(content)'); print('FTS5 module available')"
+
+      - name: Run Regression Tests
+        run: |
+          PYTHONPATH=.pytest_sqlite_override uv run pytest \
+            --block-network \
+            --timeout=30 \
+            -vv \
+            --durations=10 \
+            -n auto \
+            --maxfail=5 \
+            tests/regression
diff --git a/pyproject.toml b/pyproject.toml
index c849aacc6..a3f9f66dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -137,3 +137,19 @@ exclude = [
     "docs/**",
     "docs/",
 ]
+
+[tool.pytest.ini_options]
+# Setting norecursedirs replaces pytest's built-in defaults, so they are repeated
+# here next to the regression suite, which the regression workflow runs explicitly.
+norecursedirs = [
+    "*.egg",
+    ".*",
+    "_darcs",
+    "build",
+    "CVS",
+    "dist",
+    "node_modules",
+    "venv",
+    "{arch}",
+    "tests/regression",
+]
diff --git a/src/crewai/experimental/evaluation/testing.py b/src/crewai/experimental/evaluation/testing.py
index 1cd9331a2..0990589e1 100644
--- a/src/crewai/experimental/evaluation/testing.py
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -1,4 +1,5 @@
 import inspect
+from pathlib import Path
 
 from typing_extensions import Any
 import warnings
@@ -41,12 +42,34 @@ def run_experiment(dataset: list[dict[str, Any]], crew: Crew | None = None, agen
     return runner.run(agents=agents, crew=crew, print_summary=verbose)
 
 def _get_baseline_filepath_fallback() -> str:
-    test_func_name = "experiment_fallback"
+    """Build a default baseline path from the function two frames up the stack.
+
+    The file is named ``<function name>.json`` and placed in a ``results``
+    directory next to the file defining that function; the directory is created
+    when missing. If the frame cannot be inspected or the directory cannot be
+    created, only the bare filename is returned.
+    """
+    filename = "experiment_fallback.json"
+    calling_file = None
 
     try:
         current_frame = inspect.currentframe()
         if current_frame is not None:
             test_func_name = current_frame.f_back.f_back.f_code.co_name # type: ignore[union-attr]
+            filename = f"{test_func_name}.json"
+            calling_file = current_frame.f_back.f_back.f_code.co_filename # type: ignore[union-attr]
     except Exception:
-        ...
-    return f"{test_func_name}_results.json"
\ No newline at end of file
+        return filename
+
+    if not calling_file:
+        return filename
+
+    baseline_dir = Path(calling_file).parent / "results"
+    try:
+        baseline_dir.mkdir(parents=True, exist_ok=True)
+    except OSError:
+        # mkdir raises OSError (not ValueError/IndexError) when the location is
+        # not writable; fall back to the bare filename instead of crashing.
+        return filename
+
+    return str(baseline_dir / filename)
diff --git a/tests/regression/results/test_financial_analysis_regression.json b/tests/regression/results/test_financial_analysis_regression.json
new file mode 100644
index 000000000..b6a117cf0
--- /dev/null
+++ b/tests/regression/results/test_financial_analysis_regression.json
@@ -0,0 +1,42 @@
+[
+  {
+    "timestamp": "2025-07-15T21:34:08.253410+00:00",
+    "metadata": {},
+    "results": [
+      {
+        "identifier": "72239c22b0cdde98ad5c588074ef6325",
+        "inputs": {
+          "company": "Apple Inc. (AAPL)"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 9.0,
+          "tool_selection": 6.0,
+          "parameter_extraction": 5.0,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 7.300000000000001
+        },
+        "expected_score": {
+          "goal_alignment": 8
+        },
+        "passed": true
+      },
+      {
+        "identifier": "test_2",
+        "inputs": {
+          "company": "Microsoft Corporation (MSFT)"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 7.333333333333333,
+          "tool_selection": 6.25,
+          "parameter_extraction": 9.5,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 6.0
+        },
+        "expected_score": 8,
+        "passed": true
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/tests/regression/results/test_history_teacher.json b/tests/regression/results/test_history_teacher.json
new file mode 100644
index 000000000..362e01524
--- /dev/null
+++ b/tests/regression/results/test_history_teacher.json
@@ -0,0 +1,24 @@
+[
+  {
+    "timestamp": "2025-07-15T21:31:05.916161+00:00",
+    "metadata": {},
+    "results": [
+      {
+        "identifier": "df0ea31ac4a7fb4a908b8319ec7b3719",
+        "inputs": {
+          "messages": "How was the Battle of Waterloo?"
+        },
+        "score": {
+          "goal_alignment": 10.0,
+          "semantic_quality": 10.0,
+          "tool_selection": 10.0,
+          "parameter_extraction": 10.0,
+          "tool_invocation": 10.0,
+          "reasoning_efficiency": 5.5
+        },
+        "expected_score": 8,
+        "passed": true
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/tests/regression/test_financial_analysis.py b/tests/regression/test_financial_analysis.py
new file mode 100644
index 000000000..cb4d38b70
--- /dev/null
+++ b/tests/regression/test_financial_analysis.py
@@ -0,0 +1,144 @@
+import pytest
+from crewai import Agent, Crew, Process, Task
+from crewai_tools import SerperDevTool
+
+from crewai.experimental.evaluation.testing import (
+    assert_experiment_successfully,
+    run_experiment,
+)
+
+
+@pytest.fixture
+def financial_analysis_crew():
+    search_tool = SerperDevTool()
+
+    data_researcher = Agent(
+        role="Financial Data Researcher",
+        goal="Efficiently collect and structure key financial metrics using multiple search strategies. Using only the search tool.",
+        backstory=(
+            "You are a precision-focused financial analyst who uses multiple targeted searches "
+            "to cross-verify data and ensure comprehensive coverage. You leverage different "
+            "search approaches to gather financial information from various authoritative sources."
+        ),
+        tools=[search_tool],
+    )
+
+    financial_analyst = Agent(
+        role="Financial Analyst",
+        goal="Analyze financial data to assess company performance and outlook",
+        backstory=(
+            "You are a seasoned financial analyst with expertise in evaluating company "
+            "performance through quantitative analysis. You can interpret financial statements, "
+            "identify trends, and make reasoned assessments of a company's financial health."
+        ),
+        tools=[search_tool],
+    )
+
+    report_writer = Agent(
+        role="Financial Report Writer",
+        goal="Synthesize financial analysis into clear, actionable reports",
+        backstory=(
+            "You are an experienced financial writer who excels at turning complex financial "
+            "analyses into clear, concise, and impactful reports. You know how to highlight "
+            "key insights and present information in a way that's accessible to various audiences."
+        ),
+        tools=[],
+    )
+
+    research_task = Task(
+        description=(
+            "Research {company} financial data using multiple targeted search queries:\n\n"
+            "**Search Strategy - Execute these searches sequentially:**\n"
+            "1. '{company} quarterly earnings Q4 2024 Q1 2025 financial results'\n"
+            "2. '{company} financial metrics P/E ratio profit margin debt equity'\n"
+            "3. '{company} revenue growth year over year earnings growth rate'\n"
+            "4. '{company} recent financial news SEC filings analyst reports'\n"
+            "5. '{company} stock performance market cap valuation 2024 2025'\n\n"
+            "**Data Collection Guidelines:**\n"
+            "- Use multiple search queries to cross-verify financial figures\n"
+            "- Prioritize official sources (SEC filings, earnings calls, company reports)\n"
+            "- Compare data across different financial platforms for accuracy\n"
+            "- Present findings in the exact format specified in expected_output."
+        ),
+        expected_output=(
+            "Financial data summary in this structure:\n\n"
+            "## Company Financial Overview\n"
+            "**Data Sources Used:** [List 3-5 sources from multiple searches]\n\n"
+            "**Latest Quarter:** [Period]\n"
+            "- Revenue: $X (YoY: +/-X%) [Source verification]\n"
+            "- Net Income: $X (YoY: +/-X%) [Source verification]\n"
+            "- EPS: $X (YoY: +/-X%) [Source verification]\n\n"
+            "**Key Metrics:**\n"
+            "- P/E Ratio: X [Current vs Historical]\n"
+            "- Profit Margin: X% [Trend indicator]\n"
+            "- Debt-to-Equity: X [Industry comparison]\n\n"
+            "**Growth Analysis:**\n"
+            "- Revenue Growth: X% (3-year trend)\n"
+            "- Earnings Growth: X% (consistency check)\n\n"
+            "**Material Developments:** [1-2 key items with impact assessment]\n"
+            "**Data Confidence:** [High/Medium/Low based on source consistency]"
+        ),
+        agent=data_researcher,
+    )
+
+    analysis_task = Task(
+        description=(
+            "Analyze the collected financial data to assess the company's performance and outlook. "
+            "Include the following in your analysis:\n"
+            "1. Evaluation of financial health based on key metrics\n"
+            "2. Trend analysis showing growth or decline patterns\n"
+            "3. Comparison with industry benchmarks or competitors\n"
+            "4. Identification of strengths and potential areas of concern\n"
+            "5. Short-term financial outlook based on current trends"
+        ),
+        expected_output=(
+            "A detailed financial analysis that includes assessment of key metrics, trends, "
+            "comparative analysis, and a reasoned outlook for the company's financial future."
+        ),
+        agent=financial_analyst,
+        context=[research_task],
+    )
+
+    report_task = Task(
+        description=(
+            "Create a professional financial report based on the research and analysis. "
+            "The report should:\n"
+            "1. Begin with an executive summary highlighting key findings\n"
+            "2. Present the financial analysis in a clear, logical structure\n"
+            "3. Include visual representations of key data points (described textually)\n"
+            "4. Provide actionable insights for potential investors\n"
+            "5. Conclude with a clear investment recommendation (buy, hold, or sell)"
+        ),
+        expected_output=(
+            "A professional, comprehensive financial report with executive summary, "
+            "structured analysis, visual elements, actionable insights, and a clear recommendation."
+        ),
+        agent=report_writer,
+        context=[research_task, analysis_task],
+    )
+
+    crew = Crew(
+        agents=[data_researcher, financial_analyst, report_writer],
+        tasks=[research_task, analysis_task, report_task],
+        process=Process.sequential,
+    )
+
+    return crew
+
+
+def test_financial_analysis_regression(financial_analysis_crew):
+    dataset = [
+        {
+            "inputs": {"company": "Apple Inc. (AAPL)"},
+            "expected_score": {"goal_alignment": 8},
+        },
+        {
+            "identifier": "test_2",
+            "inputs": {"company": "Microsoft Corporation (MSFT)"},
+            "expected_score": 8,
+        },
+    ]
+
+    results = run_experiment(dataset=dataset, crew=financial_analysis_crew, verbose=True)
+
+    assert_experiment_successfully(results)
diff --git a/tests/regression/test_history_teacher.py b/tests/regression/test_history_teacher.py
new file mode 100644
index 000000000..43ae86633
--- /dev/null
+++ b/tests/regression/test_history_teacher.py
@@ -0,0 +1,36 @@
+import pytest
+from crewai import Agent
+from crewai_tools import SerperDevTool
+
+from crewai.experimental.evaluation.testing import (
+    assert_experiment_successfully,
+    run_experiment,
+)
+
+
+@pytest.fixture
+def history_teacher():
+    search_tool = SerperDevTool()
+    return Agent(
+        role="History Educator",
+        goal="Teach students about important historical events with clarity and context",
+        backstory=(
+            "As a renowned historian and educator, you have spent decades studying world history, "
+            "from ancient civilizations to modern events. You are passionate about making history "
+            "engaging and understandable for learners of all ages. Your mission is to educate, explain, "
+            "and spark curiosity about the past."
+        ),
+        tools=[search_tool],
+        verbose=True,
+    )
+
+
+def test_history_teacher(history_teacher):
+    dataset = [
+        {"inputs": {"messages": "How was the Battle of Waterloo?"}, "expected_score": 8}
+    ]
+    results = run_experiment(
+        dataset=dataset, agents=[history_teacher], verbose=True
+    )
+
+    assert_experiment_successfully(results)