fix(tracing): reduce retry attempts and skip retries on network exceptions

`_initialize_backend_batch` executes while holding `_batch_ready_cv`,
  blocking any concurrent thread waiting for batch readiness (e.g. event
  handlers). The previous retry logic could hold the lock for an
  unacceptable amount of time:

    Previous worst case (max_retries=2, all attempts timing out):
      3 attempts × 30s HTTP timeout + 2 × 0.2s sleep = ~90.4s
      Plus auth-fallback recursive call: up to ~180s total

    New worst case (max_retries=1, all attempts returning a retryable
    5xx just under the 30s HTTP timeout):
      2 attempts × ~30s + 1 × 0.2s sleep = ~60.2s

    Exceptions (timeout, connection error) now abort immediately:
      1 attempt × 30s HTTP timeout = ~30s (no sleep, no retry)

  Changes:
  - Reduce max_retries from 2 to 1 (5xx responses still retry once)
  - Move try/except outside the retry loop so any exception
    (timeout, connection error) aborts the whole loop immediately,
    without retrying or sleeping — a raised exception adds at most one
    HTTP timeout (~30s) on top of any earlier 5xx attempt

  Addresses review comment on PR #4947 about retry sleeps holding the
  batch initialization lock and adding unexpected latency.
This commit is contained in:
Tiago Freire
2026-03-24 01:19:48 -03:00
parent 46bb7aa79e
commit 5a812a4cf3
2 changed files with 13 additions and 27 deletions

View File

@@ -144,12 +144,11 @@ class TraceBatchManager:
payload["ephemeral_trace_id"] = self.current_batch.batch_id
payload["user_identifier"] = get_user_id()
max_retries = 2
max_retries = 1
response = None
last_exception = None
for attempt in range(max_retries + 1):
try:
try:
for attempt in range(max_retries + 1):
response = (
self.plus_api.initialize_ephemeral_trace_batch(payload)
if use_ephemeral
@@ -163,17 +162,9 @@ class TraceBatchManager:
f"(status={response.status_code if response else 'None'}), retrying..."
)
time.sleep(0.2)
except Exception as e:
last_exception = e
if attempt < max_retries:
logger.debug(
f"Trace batch init attempt {attempt + 1} raised {type(e).__name__}, retrying..."
)
time.sleep(0.2)
if last_exception and response is None:
except Exception as e:
logger.warning(
f"Error initializing trace batch: {last_exception}. Continuing without tracing."
f"Error initializing trace batch: {e}. Continuing without tracing."
)
self.trace_batch_id = None
return

View File

@@ -1133,15 +1133,9 @@ class TestInitializeBackendBatchRetry:
assert bm.trace_batch_id == server_id
assert mock_init.call_count == 2
def test_retries_on_exception_then_succeeds(self):
"""Retries on ConnectionError, succeeds on second attempt."""
def test_no_retry_on_exception(self):
"""Exceptions (e.g. timeout, connection error) abort immediately without retry."""
bm = self._make_batch_manager()
server_id = "server-id-after-exception"
success_response = MagicMock(
status_code=201,
json=MagicMock(return_value={"ephemeral_trace_id": server_id}),
)
with (
patch(
@@ -1151,9 +1145,9 @@ class TestInitializeBackendBatchRetry:
patch.object(
bm.plus_api,
"initialize_ephemeral_trace_batch",
side_effect=[ConnectionError("network down"), success_response],
side_effect=ConnectionError("network down"),
) as mock_init,
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep"),
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep") as mock_sleep,
):
bm._initialize_backend_batch(
user_context={"privacy_level": "standard"},
@@ -1161,8 +1155,9 @@ class TestInitializeBackendBatchRetry:
use_ephemeral=True,
)
assert bm.trace_batch_id == server_id
assert mock_init.call_count == 2
assert bm.trace_batch_id is None
assert mock_init.call_count == 1
mock_sleep.assert_not_called()
def test_no_retry_on_4xx(self):
"""Does NOT retry on 422 — client error is not transient."""
@@ -1215,7 +1210,7 @@ class TestInitializeBackendBatchRetry:
)
assert bm.trace_batch_id is None
assert mock_init.call_count == 3 # initial + 2 retries
assert mock_init.call_count == 2 # initial + 1 retry
class TestFirstTimeHandlerBackendInitGuard: