mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-12 22:12:37 +00:00
fix(tracing): reduce retry attempts and skip retries on network exceptions
`_initialize_backend_batch` executes while holding `_batch_ready_cv`,
blocking any concurrent thread waiting for batch readiness (e.g. event
handlers). The previous retry logic could hold the lock for an
unacceptable amount of time:
Previous worst case (max_retries=2, all attempts timing out):
3 attempts × 30s HTTP timeout + 2 × 0.2s sleep = ~90.4s
Plus auth-fallback recursive call: up to ~180s total
New worst case (max_retries=1, all attempts timing out):
2 attempts × 30s HTTP timeout + 1 × 0.2s sleep = ~60.2s
Exceptions (timeout, connection error) now abort immediately:
1 attempt × 30s HTTP timeout = ~30s (no sleep, no retry)
Changes:
- Reduce max_retries from 2 to 1 (5xx responses still retry once)
- Move try/except outside the retry loop so any exception
(timeout, connection error) aborts immediately without retrying
or sleeping, capping the exception path at a single HTTP timeout
Addresses review comment on PR #4947 about retry sleeps holding the
batch initialization lock and adding unexpected latency.
This commit is contained in:
@@ -144,12 +144,11 @@ class TraceBatchManager:
|
||||
payload["ephemeral_trace_id"] = self.current_batch.batch_id
|
||||
payload["user_identifier"] = get_user_id()
|
||||
|
||||
max_retries = 2
|
||||
max_retries = 1
|
||||
response = None
|
||||
last_exception = None
|
||||
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
try:
|
||||
for attempt in range(max_retries + 1):
|
||||
response = (
|
||||
self.plus_api.initialize_ephemeral_trace_batch(payload)
|
||||
if use_ephemeral
|
||||
@@ -163,17 +162,9 @@ class TraceBatchManager:
|
||||
f"(status={response.status_code if response else 'None'}), retrying..."
|
||||
)
|
||||
time.sleep(0.2)
|
||||
except Exception as e:
|
||||
last_exception = e
|
||||
if attempt < max_retries:
|
||||
logger.debug(
|
||||
f"Trace batch init attempt {attempt + 1} raised {type(e).__name__}, retrying..."
|
||||
)
|
||||
time.sleep(0.2)
|
||||
|
||||
if last_exception and response is None:
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Error initializing trace batch: {last_exception}. Continuing without tracing."
|
||||
f"Error initializing trace batch: {e}. Continuing without tracing."
|
||||
)
|
||||
self.trace_batch_id = None
|
||||
return
|
||||
|
||||
@@ -1133,15 +1133,9 @@ class TestInitializeBackendBatchRetry:
|
||||
assert bm.trace_batch_id == server_id
|
||||
assert mock_init.call_count == 2
|
||||
|
||||
def test_retries_on_exception_then_succeeds(self):
|
||||
"""Retries on ConnectionError, succeeds on second attempt."""
|
||||
def test_no_retry_on_exception(self):
|
||||
"""Exceptions (e.g. timeout, connection error) abort immediately without retry."""
|
||||
bm = self._make_batch_manager()
|
||||
server_id = "server-id-after-exception"
|
||||
|
||||
success_response = MagicMock(
|
||||
status_code=201,
|
||||
json=MagicMock(return_value={"ephemeral_trace_id": server_id}),
|
||||
)
|
||||
|
||||
with (
|
||||
patch(
|
||||
@@ -1151,9 +1145,9 @@ class TestInitializeBackendBatchRetry:
|
||||
patch.object(
|
||||
bm.plus_api,
|
||||
"initialize_ephemeral_trace_batch",
|
||||
side_effect=[ConnectionError("network down"), success_response],
|
||||
side_effect=ConnectionError("network down"),
|
||||
) as mock_init,
|
||||
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep"),
|
||||
patch("crewai.events.listeners.tracing.trace_batch_manager.time.sleep") as mock_sleep,
|
||||
):
|
||||
bm._initialize_backend_batch(
|
||||
user_context={"privacy_level": "standard"},
|
||||
@@ -1161,8 +1155,9 @@ class TestInitializeBackendBatchRetry:
|
||||
use_ephemeral=True,
|
||||
)
|
||||
|
||||
assert bm.trace_batch_id == server_id
|
||||
assert mock_init.call_count == 2
|
||||
assert bm.trace_batch_id is None
|
||||
assert mock_init.call_count == 1
|
||||
mock_sleep.assert_not_called()
|
||||
|
||||
def test_no_retry_on_4xx(self):
|
||||
"""Does NOT retry on 422 — client error is not transient."""
|
||||
@@ -1215,7 +1210,7 @@ class TestInitializeBackendBatchRetry:
|
||||
)
|
||||
|
||||
assert bm.trace_batch_id is None
|
||||
assert mock_init.call_count == 3 # initial + 2 retries
|
||||
assert mock_init.call_count == 2 # initial + 1 retry
|
||||
|
||||
|
||||
class TestFirstTimeHandlerBackendInitGuard:
|
||||
|
||||
Reference in New Issue
Block a user