diff --git a/src/crewai/evaluation/experiment/testing.py b/src/crewai/evaluation/experiment/testing.py
new file mode 100644
index 000000000..11de59a80
--- /dev/null
+++ b/src/crewai/evaluation/experiment/testing.py
@@ -0,0 +1,49 @@
+import warnings
+from crewai.experimental.evaluation import ExperimentResults
+
+def assert_experiment_successfully(experiment_results: ExperimentResults) -> None:
+    """
+    Assert that all experiment results passed successfully.
+
+    Args:
+        experiment_results: The experiment results to check
+
+    Raises:
+        AssertionError: If any test case failed
+    """
+    failed_tests = [result for result in experiment_results.results if not result.passed]
+
+    if failed_tests:
+        detailed_failures: list[str] = []
+
+        for result in failed_tests:
+            expected = result.expected_score
+            actual = result.score
+            detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}")
+
+        failure_details = "\n".join(detailed_failures)
+        raise AssertionError(f"The following test cases failed:\n{failure_details}")
+
+def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
+    """
+    Assert that there are no regressions in the experiment results compared to baseline.
+    Also warns if there are missing tests.
+
+    Args:
+        comparison_result: The result from compare_with_baseline()
+
+    Raises:
+        AssertionError: If there are regressions
+    """
+    # Check for regressions
+    regressed = comparison_result.get("regressed", [])
+    if regressed:
+        raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")
+
+    # Check for missing tests and warn
+    missing_tests = comparison_result.get("missing_tests", [])
+    if missing_tests:
+        warnings.warn(
+            f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
+            UserWarning
+        )
\ No newline at end of file
diff --git a/src/crewai/experimental/evaluation/testing.py b/src/crewai/experimental/evaluation/testing.py
new file mode 100644
index 000000000..7a235d3f1
--- /dev/null
+++ b/src/crewai/experimental/evaluation/testing.py
@@ -0,0 +1,52 @@
+import inspect
+
+from typing_extensions import Any
+import warnings
+from crewai.experimental.evaluation.experiment import ExperimentResults, ExperimentRunner
+from crewai import Crew
+
+def assert_experiment_successfully(experiment_results: ExperimentResults, baseline_filepath: str | None = None) -> None:
+    failed_tests = [result for result in experiment_results.results if not result.passed]
+
+    if failed_tests:
+        detailed_failures: list[str] = []
+
+        for result in failed_tests:
+            expected = result.expected_score
+            actual = result.score
+            detailed_failures.append(f"- {result.identifier}: expected {expected}, got {actual}")
+
+        failure_details = "\n".join(detailed_failures)
+        raise AssertionError(f"The following test cases failed:\n{failure_details}")
+
+    baseline_filepath = baseline_filepath or _get_baseline_filepath_fallback()
+    comparison = experiment_results.compare_with_baseline(baseline_filepath=baseline_filepath)
+    assert_experiment_no_regression(comparison)
+
+def assert_experiment_no_regression(comparison_result: dict[str, list[str]]) -> None:
+    regressed = comparison_result.get("regressed", [])
+    if regressed:
+        raise AssertionError(f"Regression detected! The following tests that previously passed now fail: {regressed}")
+
+    missing_tests = comparison_result.get("missing_tests", [])
+    if missing_tests:
+        warnings.warn(
+            f"Warning: {len(missing_tests)} tests from the baseline are missing in the current run: {missing_tests}",
+            UserWarning
+        )
+
+def run_experiment(dataset: list[dict[str, Any]], crew: Crew, verbose: bool = False) -> ExperimentResults:
+    runner = ExperimentRunner(dataset=dataset)
+
+    return runner.run(crew=crew, print_summary=verbose)
+
+def _get_baseline_filepath_fallback() -> str:
+    test_func_name = "experiment_fallback"
+
+    try:
+        current_frame = inspect.currentframe()
+        if current_frame is not None:
+            test_func_name = current_frame.f_back.f_back.f_code.co_name  # type: ignore[union-attr]
+    except Exception:
+        ...
+    return f"{test_func_name}_results.json"
\ No newline at end of file
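Example usage (a minimal sketch, not part of the diff): a pytest-style test that drives the helpers added above. The "research_crew" fixture and the dataset schema (identifier, inputs, expected_score keys, inferred from the result fields used in the assertions) are assumptions, not APIs confirmed by this change.

# test_research_experiment.py -- hypothetical usage of the helpers added in this PR
from crewai.experimental.evaluation.testing import assert_experiment_successfully, run_experiment


def test_research_crew_experiment(research_crew):  # research_crew: assumed pytest fixture returning a Crew
    # Dataset schema is an assumption; adjust to whatever ExperimentRunner expects.
    dataset = [
        {
            "identifier": "summarize_report",
            "inputs": {"topic": "LLM evaluation"},
            "expected_score": 8,
        },
    ]

    # Runs the experiment and prints the summary when verbose=True.
    results = run_experiment(dataset=dataset, crew=research_crew, verbose=True)

    # Fails the test on any failed case; with no explicit baseline_filepath, the
    # frame-based fallback compares against test_research_crew_experiment_results.json
    # and raises on regressions relative to that baseline.
    assert_experiment_successfully(results)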