fix: dedupe artifact storage on repeated cached results

The tool-result cache stores the raw FileArtifact and hands back the same
instance on every cache hit, where store_if_artifact ran again and minted a
fresh handle plus another byte copy. Repeated identical cached calls therefore
stacked duplicate copies of the same payload until scope/TTL cleanup. The store
now reuses an existing handle when the same FileArtifact instance is stored
again under the same scope, keying off object identity (the cache keeps the
object alive, so the id is stable within a run).
This commit is contained in:
Matt Aitchison
2026-06-04 19:54:49 -05:00
parent b827f7ee11
commit 000dd41fc3
2 changed files with 43 additions and 6 deletions

View File

@@ -98,6 +98,7 @@ class _Entry:
artifact: FileArtifact
scope_id: str | None
expires_at: float | None
obj_id: int
class _ArtifactStore:
@@ -106,11 +107,17 @@ class _ArtifactStore:
Entries are keyed by an opaque uuid (never by user-supplied content), so
concurrent crews cannot collide. Cleanup is per-scope -- clearing one crew's
artifacts never touches another's -- with a TTL prune as a backstop.
Storing the same :class:`FileArtifact` instance again under the same scope
reuses its handle rather than minting a duplicate. The tool-result cache
hands back the same object on every cache hit, so this keeps repeated cached
calls from stacking identical byte copies in memory.
"""
def __init__(self) -> None:
self._lock = threading.Lock()
self._entries: dict[str, _Entry] = {}
self._handle_by_obj: dict[int, str] = {}
def store(
self,
@@ -118,14 +125,29 @@ class _ArtifactStore:
scope_id: str | None = None,
ttl: int = DEFAULT_ARTIFACT_TTL,
) -> str:
handle_id = str(uuid4())
norm_scope = str(scope_id) if scope_id is not None else None
obj_id = id(artifact)
expires_at = (time.monotonic() + ttl) if ttl > 0 else None
with self._lock:
self._prune_locked()
existing = self._handle_by_obj.get(obj_id)
if existing is not None:
entry = self._entries.get(existing)
if (
entry is not None
and entry.artifact is artifact
and entry.scope_id == norm_scope
):
entry.expires_at = expires_at
return f"{_HANDLE_SCHEME}://{existing}"
handle_id = str(uuid4())
self._entries[handle_id] = _Entry(
artifact=artifact,
scope_id=str(scope_id) if scope_id is not None else None,
expires_at=(time.monotonic() + ttl) if ttl > 0 else None,
scope_id=norm_scope,
expires_at=expires_at,
obj_id=obj_id,
)
self._handle_by_obj[obj_id] = handle_id
return f"{_HANDLE_SCHEME}://{handle_id}"
def resolve(self, handle_id: str) -> FileArtifact | None:
@@ -134,7 +156,7 @@ class _ArtifactStore:
if entry is None:
return None
if entry.expires_at is not None and entry.expires_at <= time.monotonic():
del self._entries[handle_id]
self._delete_locked(handle_id)
return None
return entry.artifact
@@ -144,7 +166,7 @@ class _ArtifactStore:
for handle_id in [
hid for hid, entry in self._entries.items() if entry.scope_id == scope
]:
del self._entries[handle_id]
self._delete_locked(handle_id)
def _prune_locked(self) -> None:
"""Drop entries whose per-entry TTL has elapsed. Caller holds the lock."""
@@ -154,7 +176,13 @@ class _ArtifactStore:
for hid, entry in self._entries.items()
if entry.expires_at is not None and entry.expires_at <= now
]:
del self._entries[handle_id]
self._delete_locked(handle_id)
def _delete_locked(self, handle_id: str) -> None:
"""Remove an entry and its object-identity mapping. Caller holds lock."""
entry = self._entries.pop(handle_id, None)
if entry is not None and self._handle_by_obj.get(entry.obj_id) == handle_id:
del self._handle_by_obj[entry.obj_id]
_store: Final[_ArtifactStore] = _ArtifactStore()

View File

@@ -67,6 +67,15 @@ class TestStoreArtifact:
h2 = _handle_in(store_artifact(FileArtifact(data=b"a")))
assert h1 != h2
def test_restoring_same_instance_reuses_handle(self) -> None:
# The tool-result cache hands back the same FileArtifact on every cache
# hit; re-storing it must reuse the handle, not stack duplicate copies.
artifact = FileArtifact(data=b"payload" * 1000)
h1 = _handle_in(store_artifact(artifact, scope_id="s"))
h2 = _handle_in(store_artifact(artifact, scope_id="s"))
assert h1 == h2
assert len(_store._entries) == 1
def test_placeholder_escapes_quotes_in_metadata(self) -> None:
artifact = FileArtifact(data=b"x", filename='a".pptx', mime_type='m"/x')
placeholder = store_artifact(artifact)