mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-15 15:32:40 +00:00
feat: enhance GrepTool with sensitive file exclusion and file size limit
- Added MAX_CONTEXT_LINES to define the upper limit for context lines shown in search results.
- Introduced MAX_FILE_SIZE_BYTES to skip files larger than 10 MB during searches.
- Implemented logic to exclude sensitive files (e.g., .env, .netrc) from search results to prevent accidental leakage of credentials.
- Updated tests to validate sensitive file exclusion and file size limits, ensuring robust handling of sensitive content.
This commit is contained in:
@@ -22,6 +22,8 @@ MAX_LINE_LENGTH = 500
|
||||
BINARY_CHECK_SIZE = 8192
|
||||
MAX_REGEX_LENGTH = 1_000
|
||||
REGEX_MATCH_TIMEOUT_SECONDS = 5
|
||||
MAX_CONTEXT_LINES = 10
|
||||
MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
|
||||
|
||||
SKIP_DIRS = frozenset(
|
||||
{
|
||||
@@ -36,6 +38,44 @@ SKIP_DIRS = frozenset(
|
||||
}
|
||||
)
|
||||
|
||||
# File names, or trailing path fragments such as ".aws/credentials", that may contain
|
||||
# secrets or credentials — always excluded from search results to prevent accidental leakage.
|
||||
SENSITIVE_FILE_NAMES = frozenset(
|
||||
{
|
||||
".env",
|
||||
".env.local",
|
||||
".env.development",
|
||||
".env.production",
|
||||
".env.staging",
|
||||
".env.test",
|
||||
".netrc",
|
||||
".npmrc",
|
||||
".pypirc",
|
||||
".docker/config.json",
|
||||
".aws/credentials",
|
||||
".ssh/id_rsa",
|
||||
".ssh/id_ed25519",
|
||||
".ssh/id_ecdsa",
|
||||
".ssh/id_dsa",
|
||||
"credentials.json",
|
||||
"service-account.json",
|
||||
"secrets.yaml",
|
||||
"secrets.yml",
|
||||
"secrets.json",
|
||||
}
|
||||
)
|
||||
|
||||
# File-name suffixes that indicate key or certificate material (compared with
|
||||
# str.endswith against the file name only, e.g. "server.pem" or "tls.key").
|
||||
SENSITIVE_FILE_PATTERNS = (
|
||||
".pem",
|
||||
".key",
|
||||
".p12",
|
||||
".pfx",
|
||||
".jks",
|
||||
".keystore",
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MatchLine:
|
||||
@@ -79,7 +119,9 @@ class GrepToolSchema(BaseModel):
|
||||
)
|
||||
context_lines: int = Field(
|
||||
default=0,
|
||||
description="Number of lines to show before and after each match",
|
||||
ge=0,
|
||||
le=MAX_CONTEXT_LINES,
|
||||
description=f"Number of lines to show before and after each match (0-{MAX_CONTEXT_LINES})",
|
||||
)
|
||||
include_line_numbers: bool = Field(
|
||||
default=True,
|
||||
@@ -121,6 +163,13 @@ class GrepTool(BaseTool):
|
||||
"directory. Set to True to allow searching any path on the filesystem."
|
||||
),
|
||||
)
|
||||
max_file_size_bytes: int = Field(
|
||||
default=MAX_FILE_SIZE_BYTES,
|
||||
description=(
|
||||
"Maximum file size in bytes to search. Files larger than this are "
|
||||
"skipped. Defaults to 10 MB."
|
||||
),
|
||||
)
|
||||
|
||||
def _run(
|
||||
self,
|
||||
@@ -233,6 +282,10 @@ class GrepTool(BaseTool):
|
||||
def _collect_files(self, search_path: Path, glob_pattern: str | None) -> list[Path]:
|
||||
"""Collect files to search.
|
||||
|
||||
Sensitive files (e.g. ``.env``, ``.netrc``, key material) are
|
||||
automatically excluded even when searched by explicit path so that
|
||||
credentials cannot leak into tool output.
|
||||
|
||||
Args:
|
||||
search_path: File or directory to search.
|
||||
glob_pattern: Optional glob pattern to filter files.
|
||||
@@ -241,6 +294,8 @@ class GrepTool(BaseTool):
|
||||
List of file paths to search.
|
||||
"""
|
||||
if search_path.is_file():
|
||||
if self._is_sensitive_file(search_path):
|
||||
return []
|
||||
return [search_path]
|
||||
|
||||
patterns = self._expand_brace_pattern(glob_pattern) if glob_pattern else ["*"]
|
||||
@@ -255,6 +310,8 @@ class GrepTool(BaseTool):
|
||||
# Skip hidden/build directories
|
||||
if any(part in SKIP_DIRS for part in p.relative_to(search_path).parts):
|
||||
continue
|
||||
if self._is_sensitive_file(p):
|
||||
continue
|
||||
files.append(p)
|
||||
if len(files) >= MAX_FILES:
|
||||
break
|
||||
@@ -294,6 +351,46 @@ class GrepTool(BaseTool):
|
||||
signal.alarm(0)
|
||||
signal.signal(signal.SIGALRM, old_handler)
|
||||
|
||||
@staticmethod
def _is_sensitive_file(file_path: Path) -> bool:
    """Check whether a file is likely to contain secrets or credentials.

    The check is deliberately conservative and purely name-based; the file
    itself is never opened. A path is treated as sensitive when any of the
    following holds:

    * its file name is listed in ``SENSITIVE_FILE_NAMES`` (e.g. ``.netrc``);
    * its file name starts with ``.env`` (``.env``, ``.env.local``,
      ``.env.backup``, ...);
    * its file name ends with one of ``SENSITIVE_FILE_PATTERNS``
      (e.g. ``.pem``, ``.key``);
    * its trailing path components, joined with ``/``, are listed in
      ``SENSITIVE_FILE_NAMES`` (e.g. ``.aws/credentials``, ``.ssh/id_rsa``).

    All comparisons ignore case: on case-insensitive filesystems ``.ENV``
    or ``server.PEM`` refer to the same credential files, so a
    case-sensitive check would let them through.

    Args:
        file_path: Path to the file.

    Returns:
        True if the file should be skipped.
    """
    # Fold case once up front. The entries in SENSITIVE_FILE_NAMES and
    # SENSITIVE_FILE_PATTERNS are written in lower case, so folding the
    # candidate side is enough to make every comparison case-insensitive.
    name = file_path.name.casefold()
    parts = [part.casefold() for part in file_path.parts]

    # Exact-name match (e.g. ".env", ".netrc", "secrets.json")
    if name in SENSITIVE_FILE_NAMES:
        return True

    # Any .env variant (.env.backup, .env.staging.old, ...)
    if name.startswith(".env"):
        return True

    # Extension-based match for key/cert material. str.endswith accepts a
    # tuple of candidates, so one call covers every pattern.
    if name.endswith(tuple(SENSITIVE_FILE_PATTERNS)):
        return True

    # Check every trailing run of path components against the multi-part
    # entries, e.g. ".aws/credentials" or ".ssh/id_rsa". Joining with "/"
    # keeps the comparison independent of the platform path separator.
    return any(
        "/".join(parts[start:]) in SENSITIVE_FILE_NAMES
        for start in range(len(parts))
    )
|
||||
|
||||
def _is_binary_file(self, file_path: Path) -> bool:
|
||||
"""Check if a file is binary by looking for null bytes.
|
||||
|
||||
@@ -326,9 +423,20 @@ class GrepTool(BaseTool):
|
||||
Returns:
|
||||
FileSearchResult if matches found, None otherwise.
|
||||
"""
|
||||
if self._is_sensitive_file(file_path):
|
||||
return None
|
||||
|
||||
if self._is_binary_file(file_path):
|
||||
return None
|
||||
|
||||
# Skip files that are too large to safely read into memory
|
||||
try:
|
||||
file_size = file_path.stat().st_size
|
||||
except OSError:
|
||||
return None
|
||||
if file_size > self.max_file_size_bytes:
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(file_path, encoding="utf-8", errors="replace") as f:
|
||||
lines = f.readlines()
|
||||
|
||||
@@ -5,9 +5,14 @@ from __future__ import annotations
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from crewai_tools import GrepTool
|
||||
from crewai_tools.tools.grep_tool.grep_tool import MAX_REGEX_LENGTH
|
||||
from crewai_tools.tools.grep_tool.grep_tool import (
|
||||
MAX_CONTEXT_LINES,
|
||||
MAX_REGEX_LENGTH,
|
||||
GrepToolSchema,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
Reference in New Issue
Block a user