From 5a812a4cf397ddd319e6ba00cfd1c335353c7441 Mon Sep 17 00:00:00 2001 From: Tiago Freire Date: Tue, 24 Mar 2026 01:19:48 -0300 Subject: [PATCH] fix(tracing): reduce retry attempts and skip retries on network exceptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_initialize_backend_batch` executes while holding `_batch_ready_cv`, blocking any concurrent thread waiting for batch readiness (e.g. event handlers). The previous retry logic could hold the lock for an unacceptable amount of time: Previous worst case (max_retries=2, all attempts timing out): 3 attempts × 30s HTTP timeout + 2 × 0.2s sleep = ~90.4s Plus auth-fallback recursive call: up to ~180s total New worst case (max_retries=1, all attempts timing out): 2 attempts × 30s HTTP timeout + 1 × 0.2s sleep = ~60.2s Exceptions (timeout, connection error) now abort immediately: 1 attempt × 30s HTTP timeout = ~30s (no sleep, no retry) Changes: - Reduce max_retries from 2 to 1 (5xx responses still retry once) - Move try/except outside the retry loop so any exception (timeout, connection error) aborts immediately without retrying or sleeping, capping the exception path at a single HTTP timeout Addresses review comment on PR #4947 about retry sleeps holding the batch initialization lock and adding unexpected latency. --- .../listeners/tracing/trace_batch_manager.py | 19 +++++------------ lib/crewai/tests/tracing/test_tracing.py | 21 +++++++------------ 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/lib/crewai/src/crewai/events/listeners/tracing/trace_batch_manager.py b/lib/crewai/src/crewai/events/listeners/tracing/trace_batch_manager.py index 7ce51fa9c..3e5fe0868 100644 --- a/lib/crewai/src/crewai/events/listeners/tracing/trace_batch_manager.py +++ b/lib/crewai/src/crewai/events/listeners/tracing/trace_batch_manager.py @@ -144,12 +144,11 @@ class TraceBatchManager: payload["ephemeral_trace_id"] = self.current_batch.batch_id payload["user_identifier"] = get_user_id() - max_retries = 2 + max_retries = 1 response = None - last_exception = None - for attempt in range(max_retries + 1): - try: + try: + for attempt in range(max_retries + 1): response = ( self.plus_api.initialize_ephemeral_trace_batch(payload) if use_ephemeral @@ -163,17 +162,9 @@ class TraceBatchManager: f"(status={response.status_code if response else 'None'}), retrying..." ) time.sleep(0.2) - except Exception as e: - last_exception = e - if attempt < max_retries: - logger.debug( - f"Trace batch init attempt {attempt + 1} raised {type(e).__name__}, retrying..." - ) - time.sleep(0.2) - - if last_exception and response is None: + except Exception as e: logger.warning( - f"Error initializing trace batch: {last_exception}. Continuing without tracing." + f"Error initializing trace batch: {e}. Continuing without tracing." ) self.trace_batch_id = None return diff --git a/lib/crewai/tests/tracing/test_tracing.py b/lib/crewai/tests/tracing/test_tracing.py index 171808628..154f341d8 100644 --- a/lib/crewai/tests/tracing/test_tracing.py +++ b/lib/crewai/tests/tracing/test_tracing.py @@ -1133,15 +1133,9 @@ class TestInitializeBackendBatchRetry: assert bm.trace_batch_id == server_id assert mock_init.call_count == 2 - def test_retries_on_exception_then_succeeds(self): - """Retries on ConnectionError, succeeds on second attempt.""" + def test_no_retry_on_exception(self): + """Exceptions (e.g. timeout, connection error) abort immediately without retry.""" bm = self._make_batch_manager() - server_id = "server-id-after-exception" - - success_response = MagicMock( - status_code=201, - json=MagicMock(return_value={"ephemeral_trace_id": server_id}), - ) with ( patch( @@ -1151,9 +1145,9 @@ class TestInitializeBackendBatchRetry: patch.object( bm.plus_api, "initialize_ephemeral_trace_batch", - side_effect=[ConnectionError("network down"), success_response], + side_effect=ConnectionError("network down"), ) as mock_init, - patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep"), + patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep") as mock_sleep, ): bm._initialize_backend_batch( user_context={"privacy_level": "standard"}, @@ -1161,8 +1155,9 @@ class TestInitializeBackendBatchRetry: use_ephemeral=True, ) - assert bm.trace_batch_id == server_id - assert mock_init.call_count == 2 + assert bm.trace_batch_id is None + assert mock_init.call_count == 1 + mock_sleep.assert_not_called() def test_no_retry_on_4xx(self): """Does NOT retry on 422 — client error is not transient.""" @@ -1215,7 +1210,7 @@ class TestInitializeBackendBatchRetry: ) assert bm.trace_batch_id is None - assert mock_init.call_count == 3 # initial + 2 retries + assert mock_init.call_count == 2 # initial + 1 retry class TestFirstTimeHandlerBackendInitGuard: