Files
crewAI/lib/crewai/src/crewai/experimental/skills/cache.py
Rip&Tear fac3e3579b Fix symlink path traversal in skill archive extraction (#6235)
* Fix symlink path traversal in skill archive extraction

`_safe_extractall` (the Python < 3.12 fallback used by `crewai skills`
archive unpacking) validated each member's *name* against the destination
but never validated symlink/hardlink *targets*. A malicious skill tarball
could plant a symlink escaping the destination (e.g. `link -> /home/user/.ssh`)
followed by a regular member written through it (`link/authorized_keys`),
escaping `dest` even though every member name resolves inside it — the
classic symlink-extraction traversal.

The 3.12+ path (`extractall(..., filter="data")`) already blocks this; the
fallback now mirrors it by rejecting absolute link targets and any link
target that resolves outside the destination directory.

Adds regression tests covering absolute and relative escaping symlinks plus
benign in-tree symlinks and ordinary archives.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

* Harden skill cache archive extraction

* Reject special skill archive members

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 08:50:41 -07:00

172 lines
6.0 KiB
Python

"""Cache manager for registry-downloaded skills.
Manages ~/.crewai/skills/{org}/{name}/ as the global skill cache.
One version is stored per skill (last install wins).
"""
from __future__ import annotations
from datetime import datetime, timezone
import json
import logging
import os
from pathlib import Path
import tarfile
from typing import TypedDict
import zipfile
_logger = logging.getLogger(__name__)
_CACHE_ROOT = Path.home() / ".crewai" / "skills"
_META_FILENAME = ".crewai_meta.json"
class SkillMetadata(TypedDict):
    """Shape of the JSON metadata stored alongside each cached skill.

    ``SkillCacheManager.store`` serialises one of these into the skill's
    ``.crewai_meta.json`` file and ``SkillCacheManager.list_cached`` reads
    them back.
    """

    # Organisation slug the skill belongs to.
    org: str
    # Skill name.
    name: str
    # Semantic version string, or None if the version is unknown.
    version: str | None
    # ISO-8601 timestamp (UTC, timezone-aware) of when the skill was stored.
    installed_at: str
class SkillCacheManager:
    """Manages the global skill cache at ~/.crewai/skills/.

    Every skill is stored in ``<cache_root>/<org>/<name>/`` together with a
    metadata file (``_META_FILENAME``). A directory only counts as a cached
    skill when that metadata file is present.
    """

    def __init__(self, cache_root: Path | None = None) -> None:
        """Create a manager.

        Args:
            cache_root: Directory to use as the cache root. Falls back to the
                module default (``_CACHE_ROOT``) when omitted.
        """
        self._root = cache_root or _CACHE_ROOT

    def _skill_dir(self, org: str, name: str) -> Path:
        """Return the cache directory for ``org``/``name``.

        Both values must be a single, plain path component. The resulting
        path is handed to ``shutil.rmtree`` by ``store`` and ``invalidate``,
        so values such as ``".."``, ``"a/b"`` or an absolute path must never
        be allowed to point outside the cache root.

        Raises:
            ValueError: If ``org`` or ``name`` is empty, ``"."``, ``".."`` or
                contains a path separator / drive / root.
        """
        for label, part in (("org", org), ("name", name)):
            # ``Path(part).name != part`` catches separators, trailing
            # slashes, absolute paths and (on Windows) drive prefixes.
            if part in ("", ".", "..") or Path(part).name != part:
                raise ValueError(f"Invalid skill {label}: {part!r}")
        return self._root / org / name

    def get_cached_path(self, org: str, name: str) -> Path | None:
        """Return the cached skill directory path if it exists, else None.

        Raises:
            ValueError: If ``org`` or ``name`` is not a plain path component.
        """
        skill_dir = self._skill_dir(org, name)
        meta_file = skill_dir / _META_FILENAME
        if skill_dir.is_dir() and meta_file.exists():
            return skill_dir
        return None

    def store(
        self, org: str, name: str, version: str | None, archive_bytes: bytes
    ) -> Path:
        """Unpack an archive into the cache and write metadata.

        Any previously cached copy of the skill is removed first (last
        install wins). The archive is tried as a gzip-compressed tarball and,
        if it cannot be opened as one, as a ZIP file. If unpacking or writing
        the metadata fails, the partially populated skill directory is
        removed again before the error is re-raised.

        Args:
            org: Organisation slug.
            name: Skill name.
            version: Semantic version string, or None if unknown.
            archive_bytes: Raw bytes of a .tar.gz (or .zip) archive.

        Returns:
            Path to the stored skill directory.

        Raises:
            ValueError: If ``org``/``name`` is not a plain path component, or
                if the fallback extractors reject an archive member.
            tarfile.TarError: If the tarball is rejected by the ``data``
                extraction filter or is corrupt.
            zipfile.BadZipFile: If the bytes are neither a gzip tarball nor a
                valid ZIP archive.
        """
        import shutil

        skill_dir = self._skill_dir(org, name)
        if skill_dir.exists():
            shutil.rmtree(skill_dir)
        skill_dir.mkdir(parents=True, exist_ok=True)

        try:
            self._unpack(archive_bytes, skill_dir)
            meta: SkillMetadata = {
                "org": org,
                "name": name,
                "version": version,
                "installed_at": datetime.now(tz=timezone.utc).isoformat(),
            }
            (skill_dir / _META_FILENAME).write_text(
                json.dumps(meta, indent=2), encoding="utf-8"
            )
        except Exception:
            # Do not leave a half-extracted (possibly hostile) tree behind.
            shutil.rmtree(skill_dir, ignore_errors=True)
            raise
        return skill_dir

    @staticmethod
    def _unpack(archive_bytes: bytes, skill_dir: Path) -> None:
        """Extract ``archive_bytes`` (tar.gz, else ZIP) into ``skill_dir``."""
        import io

        try:
            tar = tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:gz")
        except tarfile.TarError:
            # Not a gzip tarball at all -> fall back to ZIP. Only the *open*
            # is guarded: errors raised while extracting (including
            # extraction-filter rejections, which are TarError subclasses)
            # must propagate instead of being masked by a BadZipFile.
            tar = None

        if tar is None:
            with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
                _safe_extract_zip(zf, skill_dir)
            return

        with tar:
            # Documented feature probe for extraction filters; avoids
            # swallowing unrelated TypeErrors raised during extraction.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(skill_dir, filter="data")
            else:
                _safe_extractall(tar, skill_dir)

    def list_cached(self) -> list[SkillMetadata]:
        """Return metadata for every cached skill.

        Entries whose metadata file cannot be read, is not valid JSON, or
        does not contain a JSON object are skipped (and logged at debug
        level) rather than aborting the listing.
        """
        results: list[SkillMetadata] = []
        if not self._root.exists():
            return results
        for org_dir in sorted(self._root.iterdir()):
            if not org_dir.is_dir():
                continue
            for skill_dir in sorted(org_dir.iterdir()):
                meta_file = skill_dir / _META_FILENAME
                if not meta_file.exists():
                    continue
                try:
                    entry = json.loads(meta_file.read_text(encoding="utf-8"))
                except (OSError, ValueError):
                    # ValueError covers JSONDecodeError and UnicodeDecodeError.
                    _logger.debug(
                        "Skipping malformed cache entry: %s",
                        meta_file,
                        exc_info=True,
                    )
                    continue
                if not isinstance(entry, dict):
                    _logger.debug("Skipping malformed cache entry: %s", meta_file)
                    continue
                results.append(entry)
        return results

    def invalidate(self, org: str, name: str) -> bool:
        """Remove a cached skill.

        Returns:
            True if the cache entry existed and was removed, False otherwise.

        Raises:
            ValueError: If ``org`` or ``name`` is not a plain path component.
        """
        import shutil

        skill_dir = self._skill_dir(org, name)
        if skill_dir.exists():
            shutil.rmtree(skill_dir)
            return True
        return False
def _safe_extractall(tf: tarfile.TarFile, dest: Path) -> None:
"""Path-traversal-safe extraction for Python versions without tar filters.
Validates both the member's own path and, for symlink/hardlink members,
the link target. Without the link-target check a malicious archive can
plant a symlink that escapes ``dest`` followed by a regular member written
through that link, escaping ``dest`` even though every member name resolves
inside it. This mirrors the protection that
``tarfile.extractall(..., filter="data")`` provides when available.
"""
dest_resolved = dest.resolve()
for member in tf.getmembers():
member_path = (dest / member.name).resolve()
if not member_path.is_relative_to(dest_resolved):
raise ValueError(f"Blocked path traversal attempt: {member.name!r}")
if not (member.isfile() or member.isdir() or member.issym() or member.islnk()):
raise ValueError(f"Blocked unsupported tar member: {member.name!r}")
if member.issym() or member.islnk():
link_target = member.linkname
if os.path.isabs(link_target):
raise ValueError(
f"Blocked link target escaping destination: "
f"{member.name!r} -> {link_target!r}"
)
anchor = dest if member.islnk() else (dest / member.name).parent
resolved_target = (anchor / link_target).resolve()
if not resolved_target.is_relative_to(dest_resolved):
raise ValueError(
f"Blocked link target escaping destination: "
f"{member.name!r} -> {link_target!r}"
)
tf.extractall(dest) # noqa: S202
def _safe_extract_zip(zf: zipfile.ZipFile, dest: Path) -> None:
"""Path-traversal-safe ZIP extraction."""
dest_resolved = dest.resolve()
for member in zf.namelist():
member_path = (dest / member).resolve()
if not member_path.is_relative_to(dest_resolved):
raise ValueError(f"Blocked path traversal attempt: {member!r}")
zf.extractall(dest) # noqa: S202