From 000dd41fc3566f251c4e3e08a8398b82f4c7c245 Mon Sep 17 00:00:00 2001 From: Matt Aitchison Date: Thu, 4 Jun 2026 19:54:49 -0500 Subject: [PATCH] fix: dedupe artifact storage on repeated cached results The tool-result cache stores the raw FileArtifact and hands back the same instance on every cache hit, where store_if_artifact ran again and minted a fresh handle plus another byte copy. Repeated identical cached calls therefore stacked duplicate copies of the same payload until scope/TTL cleanup. The store now reuses an existing handle when the same FileArtifact instance is stored again under the same scope, keying off object identity (the cache keeps the object alive, so the id is stable within a run). --- lib/crewai/src/crewai/tools/file_artifact.py | 40 +++++++++++++++++--- lib/crewai/tests/tools/test_file_artifact.py | 9 +++++ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/lib/crewai/src/crewai/tools/file_artifact.py b/lib/crewai/src/crewai/tools/file_artifact.py index 83a8b5b18..21d641f2d 100644 --- a/lib/crewai/src/crewai/tools/file_artifact.py +++ b/lib/crewai/src/crewai/tools/file_artifact.py @@ -98,6 +98,7 @@ class _Entry: artifact: FileArtifact scope_id: str | None expires_at: float | None + obj_id: int class _ArtifactStore: @@ -106,11 +107,17 @@ class _ArtifactStore: Entries are keyed by an opaque uuid (never by user-supplied content), so concurrent crews cannot collide. Cleanup is per-scope -- clearing one crew's artifacts never touches another's -- with a TTL prune as a backstop. + + Storing the same :class:`FileArtifact` instance again under the same scope + reuses its handle rather than minting a duplicate. The tool-result cache + hands back the same object on every cache hit, so this keeps repeated cached + calls from stacking identical byte copies in memory. """ def __init__(self) -> None: self._lock = threading.Lock() self._entries: dict[str, _Entry] = {} + self._handle_by_obj: dict[int, str] = {} def store( self, @@ -118,14 +125,29 @@ class _ArtifactStore: scope_id: str | None = None, ttl: int = DEFAULT_ARTIFACT_TTL, ) -> str: - handle_id = str(uuid4()) + norm_scope = str(scope_id) if scope_id is not None else None + obj_id = id(artifact) + expires_at = (time.monotonic() + ttl) if ttl > 0 else None with self._lock: self._prune_locked() + existing = self._handle_by_obj.get(obj_id) + if existing is not None: + entry = self._entries.get(existing) + if ( + entry is not None + and entry.artifact is artifact + and entry.scope_id == norm_scope + ): + entry.expires_at = expires_at + return f"{_HANDLE_SCHEME}://{existing}" + handle_id = str(uuid4()) self._entries[handle_id] = _Entry( artifact=artifact, - scope_id=str(scope_id) if scope_id is not None else None, - expires_at=(time.monotonic() + ttl) if ttl > 0 else None, + scope_id=norm_scope, + expires_at=expires_at, + obj_id=obj_id, ) + self._handle_by_obj[obj_id] = handle_id return f"{_HANDLE_SCHEME}://{handle_id}" def resolve(self, handle_id: str) -> FileArtifact | None: @@ -134,7 +156,7 @@ class _ArtifactStore: if entry is None: return None if entry.expires_at is not None and entry.expires_at <= time.monotonic(): - del self._entries[handle_id] + self._delete_locked(handle_id) return None return entry.artifact @@ -144,7 +166,7 @@ class _ArtifactStore: for handle_id in [ hid for hid, entry in self._entries.items() if entry.scope_id == scope ]: - del self._entries[handle_id] + self._delete_locked(handle_id) def _prune_locked(self) -> None: """Drop entries whose per-entry TTL has elapsed. Caller holds the lock.""" @@ -154,7 +176,13 @@ class _ArtifactStore: for hid, entry in self._entries.items() if entry.expires_at is not None and entry.expires_at <= now ]: - del self._entries[handle_id] + self._delete_locked(handle_id) + + def _delete_locked(self, handle_id: str) -> None: + """Remove an entry and its object-identity mapping. Caller holds lock.""" + entry = self._entries.pop(handle_id, None) + if entry is not None and self._handle_by_obj.get(entry.obj_id) == handle_id: + del self._handle_by_obj[entry.obj_id] _store: Final[_ArtifactStore] = _ArtifactStore() diff --git a/lib/crewai/tests/tools/test_file_artifact.py b/lib/crewai/tests/tools/test_file_artifact.py index acb8a74ce..0a1b9efc8 100644 --- a/lib/crewai/tests/tools/test_file_artifact.py +++ b/lib/crewai/tests/tools/test_file_artifact.py @@ -67,6 +67,15 @@ class TestStoreArtifact: h2 = _handle_in(store_artifact(FileArtifact(data=b"a"))) assert h1 != h2 + def test_restoring_same_instance_reuses_handle(self) -> None: + # The tool-result cache hands back the same FileArtifact on every cache + # hit; re-storing it must reuse the handle, not stack duplicate copies. + artifact = FileArtifact(data=b"payload" * 1000) + h1 = _handle_in(store_artifact(artifact, scope_id="s")) + h2 = _handle_in(store_artifact(artifact, scope_id="s")) + assert h1 == h2 + assert len(_store._entries) == 1 + def test_placeholder_escapes_quotes_in_metadata(self) -> None: artifact = FileArtifact(data=b"x", filename='a".pptx', mime_type='m"/x') placeholder = store_artifact(artifact)