fix(tracing): reduce retry attempts and skip retries on network exceptions

`_initialize_backend_batch` executes while holding `_batch_ready_cv`,
  blocking any concurrent thread waiting for batch readiness (e.g. event
  handlers). The previous retry logic could hold the lock for an
  unacceptable amount of time:

    Previous worst case (max_retries=2, all attempts timing out):
      3 attempts × 30s HTTP timeout + 2 × 0.2s sleep = ~90.4s
      Plus auth-fallback recursive call: up to ~180s total

    New worst case (max_retries=1, all attempts returning a retryable
    5xx just under the 30s HTTP timeout):
      2 attempts × ~30s + 1 × 0.2s sleep = ~60.2s

    Exceptions (timeout, connection error) now abort immediately:
      1 attempt × 30s HTTP timeout = ~30s (no sleep, no retry)

  Changes:
  - Reduce max_retries from 2 to 1 (5xx responses still retry once)
  - Move try/except outside the retry loop so any exception
    (timeout, connection error) aborts the whole loop immediately,
    without retrying or sleeping — a raised exception adds at most one
    HTTP timeout (~30s) on top of any earlier 5xx attempt

  Addresses review comment on PR #4947 about retry sleeps holding the
  batch initialization lock and adding unexpected latency.
This commit is contained in:
Tiago Freire
2026-03-24 01:19:48 -03:00
parent 46bb7aa79e
commit 5a812a4cf3
2 changed files with 13 additions and 27 deletions

View File

@@ -144,12 +144,11 @@ class TraceBatchManager:
payload["ephemeral_trace_id"] = self.current_batch.batch_id
payload["user_identifier"] = get_user_id()
max_retries = 2
max_retries = 1
response = None
last_exception = None
for attempt in range(max_retries + 1):
try:
try:
for attempt in range(max_retries + 1):
response = (
self.plus_api.initialize_ephemeral_trace_batch(payload)
if use_ephemeral
@@ -163,17 +162,9 @@ class TraceBatchManager:
f"(status={response.status_code if response else 'None'}), retrying..."
)
time.sleep(0.2)
except Exception as e:
last_exception = e
if attempt < max_retries:
logger.debug(
f"Trace batch init attempt {attempt + 1} raised {type(e).__name__}, retrying..."
)
time.sleep(0.2)
if last_exception and response is None:
except Exception as e:
logger.warning(
f"Error initializing trace batch: {last_exception}. Continuing without tracing."
f"Error initializing trace batch: {e}. Continuing without tracing."
)
self.trace_batch_id = None
return

View File

@@ -1133,15 +1133,9 @@ class TestInitializeBackendBatchRetry:
assert bm.trace_batch_id == server_id
assert mock_init.call_count == 2
def test_retries_on_exception_then_succeeds(self):
"""Retries on ConnectionError, succeeds on second attempt."""
def test_no_retry_on_exception(self):
"""Exceptions (e.g. timeout, connection error) abort immediately without retry."""
bm = self._make_batch_manager()
server_id = "server-id-after-exception"
success_response = MagicMock(
status_code=201,
json=MagicMock(return_value={"ephemeral_trace_id": server_id}),
)
with (
patch(
@@ -1151,9 +1145,9 @@ class TestInitializeBackendBatchRetry:
patch.object(
bm.plus_api,
"initialize_ephemeral_trace_batch",
side_effect=[ConnectionError("network down"), success_response],
side_effect=ConnectionError("network down"),
) as mock_init,
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep"),
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep") as mock_sleep,
):
bm._initialize_backend_batch(
user_context={"privacy_level": "standard"},
@@ -1161,8 +1155,9 @@ class TestInitializeBackendBatchRetry:
use_ephemeral=True,
)
assert bm.trace_batch_id == server_id
assert mock_init.call_count == 2
assert bm.trace_batch_id is None
assert mock_init.call_count == 1
mock_sleep.assert_not_called()
def test_no_retry_on_4xx(self):
"""Does NOT retry on 422 — client error is not transient."""
@@ -1215,7 +1210,7 @@ class TestInitializeBackendBatchRetry:
)
assert bm.trace_batch_id is None
assert mock_init.call_count == 3 # initial + 2 retries
assert mock_init.call_count == 2 # initial + 1 retry
class TestFirstTimeHandlerBackendInitGuard: