feat(parallel): add ParallelSearchTool (Search API v1beta) (#445)

* docs: add BUILDING_TOOLS.md
* feat(parallel): add ParallelSearchTool (Search API v1beta), tests, README; register exports; regenerate tool.specs.json
* test(parallel): replace URL substring assertion with hostname allowlist (CodeQL)
2026-01-09 16:18:30 +00:00 · 2025-09-08 10:53:06 -04:00
parent 47b64d3507
commit cb8a1da730
6 changed files with 330 additions and 0 deletions
--- a/src/crewai_tools/__init__.py
+++ b/src/crewai_tools/__init__.py
@@ -93,4 +93,5 @@ from .tools import (
    YoutubeChannelSearchTool,
    YoutubeVideoSearchTool,
    ZapierActionTools,
+    ParallelSearchTool,
 )
--- a/src/crewai_tools/tools/__init__.py
+++ b/src/crewai_tools/tools/__init__.py
@@ -121,3 +121,6 @@ from .youtube_channel_search_tool.youtube_channel_search_tool import (
 )
 from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
 from .zapier_action_tool.zapier_action_tool import ZapierActionTools
+from .parallel_tools import (
+    ParallelSearchTool,
+)
--- a/src/crewai_tools/tools/parallel_tools/README.md
+++ b/src/crewai_tools/tools/parallel_tools/README.md
@@ -0,0 +1,153 @@
+# ParallelSearchTool
+
+Unified Parallel web search tool using the Parallel Search API (v1beta). Returns ranked results with compressed excerpts optimized for LLMs.
+
+- **Quickstart**: see the official docs: [Search API Quickstart](https://docs.parallel.ai/search-api/search-quickstart)
+- **Processors**: guidance on `base` vs `pro`: [Processors](https://docs.parallel.ai/search-api/processors)
+
+## Why this tool
+
+- **Single-call pipeline**: Replaces search → scrape → extract with a single, low‑latency API call.
+- **LLM‑ready**: Returns compressed excerpts that feed directly into LLM prompts (fewer tokens, less pre/post‑processing).
+- **Flexible**: Control result count and excerpt length; optionally restrict sources via `source_policy`.
+
+## Environment
+
+- `PARALLEL_API_KEY` (required)
+
+Optional (for the agent example):
+- `GEMINI_API_KEY` (used in the agent example below), `OPENAI_API_KEY`, or another LLM provider key supported by CrewAI
+
+## Parameters
+
+- `objective` (str, optional): Natural‑language research goal (≤ 5000 chars)
+- `search_queries` (list[str], optional): Up to 5 keyword queries (each ≤ 200 chars)
+- `processor` (str, default `base`): `base` (fast/low cost) or `pro` (freshness/quality)
+- `max_results` (int, default 10): between 1 and 40 (subject to processor limits)
+- `max_chars_per_result` (int, default 6000): ≥ 100; values > 30000 not guaranteed
+- `source_policy` (dict, optional): Source policy for domain inclusion/exclusion
+
+Notes:
+- API is in beta; default rate limit is 600 RPM. Contact support for production capacity.
+
+## Direct usage (when published)
+
+```python
+from crewai_tools import ParallelSearchTool
+
+tool = ParallelSearchTool()
+resp_json = tool.run(
+  objective="When was the United Nations established? Prefer UN's websites.",
+  search_queries=["Founding year UN", "Year of founding United Nations"],
+  processor="base",
+  max_results=5,
+  max_chars_per_result=1500,
+)
+print(resp_json)  # => {"search_id": ..., "results": [{"url", "title", "excerpts": [...]}, ...]}
+```
+
+### Parameters you can pass
+
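+Call `run(...)` with any of the following keyword arguments (at least one of `objective` or `search_queries` is required). The block below is a signature summary showing each argument's type and default, not runnable code: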
+
+```python
+tool.run(
+  objective: str | None = None,                 # ≤ 5000 chars
+  search_queries: list[str] | None = None,      # up to 5 items, each ≤ 200 chars
+  processor: str = "base",                      # "base" (fast) or "pro" (freshness/quality)
+  max_results: int = 10,                        # ≤ 40 (processor limits apply)
+  max_chars_per_result: int = 6000,             # ≥ 100 (values > 30000 not guaranteed)
+  source_policy: dict | None = None,            # optional SourcePolicy config
+)
+```
+
+Example with `source_policy`:
+
+```python
+source_policy = {
+  "allow": {"domains": ["un.org"]},
+  # "deny": {"domains": ["example.com"]},  # optional
+}
+
+resp_json = tool.run(
+  objective="When was the United Nations established?",
+  processor="base",
+  max_results=5,
+  max_chars_per_result=1500,
+  source_policy=source_policy,
+)
+```
+
+## Example with agents
+
+Here’s a minimal example that calls `ParallelSearchTool` to fetch sources and has an LLM produce a short, cited answer.
+
+```python
+import os
+from crewai import Agent, Task, Crew, LLM, Process
+from crewai_tools import ParallelSearchTool
+
+# LLM
+llm = LLM(
+  model="gemini/gemini-2.0-flash",
+  temperature=0.5,
+  api_key=os.getenv("GEMINI_API_KEY")
+)
+
+# Parallel Search
+search = ParallelSearchTool()
+
+# User query
+query = "find all the recent concerns about AI evals? please cite the sources"
+
+# Researcher agent 
+researcher = Agent(
+  role="Web Researcher",
+  backstory="You are an expert web researcher",
+  goal="Find cited, high-quality sources and provide a brief answer.",
+  tools=[search],
+  llm=llm,
+  verbose=True,
+)
+
+# Research task
+task = Task(
+  description=f"Research the {query} and produce a short, cited answer.",
+  expected_output="A concise, sourced answer to the question. The answer should be in this format: [query]: [answer] - [source]",
+  agent=researcher,
+  output_file="answer.mdx",
+)
+
+# Crew
+crew = Crew(
+    agents=[researcher], 
+    tasks=[task], 
+    verbose=True,
+    process=Process.sequential,
+)
+
+# Run the crew
+result = crew.kickoff(inputs={'query': query})
+print(result)
+```
+
+Output from the agent above:
+
+```md
+Recent concerns about AI evaluations include: the rise of AI-related incidents alongside a lack of standardized Responsible AI (RAI) evaluations among major industrial model developers - [https://hai.stanford.edu/ai-index/2025-ai-index-report]; flawed benchmark datasets that fail to account for critical factors, leading to unrealistic estimates of AI model abilities - [https://www.nature.com/articles/d41586-025-02462-5]; the need for multi-metric, context-aware evaluations in medical imaging AI to ensure reliability and clinical relevance - [https://www.sciencedirect.com/science/article/pii/S3050577125000283]; challenges related to data sets (insufficient, imbalanced, or poor quality), communication gaps, and misaligned expectations in AI model training - [https://www.oracle.com/artificial-intelligence/ai-model-training-challenges/]; the argument that LLM agents should be evaluated primarily on their riskiness, not just performance, due to unreliability, hallucinations, and brittleness - [https://www.technologyreview.com/2025/06/24/1119187/fix-ai-evaluation-crisis/]; the fact that the AI industry's embraced benchmarks may be close to meaningless, with top makers of AI models picking and choosing different responsible AI benchmarks, complicating efforts to systematically compare risks and limitations - [https://themarkup.org/artificial-intelligence/2024/07/17/everyone-is-judging-ai-by-these-tests-but-experts-say-theyre-close-to-meaningless]; and the difficulty of building robust and reliable model evaluations, as many existing evaluation suites are limited in their ability to serve as accurate indicators of model capabilities or safety - [https://www.anthropic.com/research/evaluating-ai-systems].
+```
+
+Tips:
+- Ensure your LLM provider keys are set (e.g., `GEMINI_API_KEY`) and CrewAI model config is in place.
+- For longer analyses, raise `max_chars_per_result` or use `processor="pro"` (higher quality, higher latency).
+
+## Behavior
+
+- Single‑request web research; no scraping/post‑processing required.
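+- Returns a JSON string containing `search_id` and ranked `results` with compressed `excerpts`.
+- Errors are returned as plain strings rather than raised: a missing `PARALLEL_API_KEY`, neither `objective` nor `search_queries` provided, an HTTP status ≥ 300, a timeout, or any other unexpected failure each produce a descriptive message.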
+
+## References
+
+- Search API Quickstart: https://docs.parallel.ai/search-api/search-quickstart
+- Processors: https://docs.parallel.ai/search-api/processors
--- a/src/crewai_tools/tools/parallel_tools/__init__.py
+++ b/src/crewai_tools/tools/parallel_tools/__init__.py
@@ -0,0 +1,7 @@
+from .parallel_search_tool import ParallelSearchTool
+
+__all__ = [
+    "ParallelSearchTool",
+]
+
+
--- a/src/crewai_tools/tools/parallel_tools/parallel_search_tool.py
+++ b/src/crewai_tools/tools/parallel_tools/parallel_search_tool.py
@@ -0,0 +1,119 @@
+import os
+from typing import Any, Dict, List, Optional, Type, Annotated
+
+import requests
+from crewai.tools import BaseTool, EnvVar
+from pydantic import BaseModel, Field
+
+
+class ParallelSearchInput(BaseModel):
+    """Input schema for ParallelSearchTool using the Search API (v1beta).
+
+    At least one of objective or search_queries is required.
+    """
+
+    objective: Optional[str] = Field(
+        None,
+        description="Natural-language goal for the web research (<=5000 chars)",
+        max_length=5000,
+    )
+    search_queries: Optional[List[Annotated[str, Field(max_length=200)]]] = Field(
+        default=None,
+        description="Optional list of keyword queries (<=5 items, each <=200 chars)",
+        min_length=1,
+        max_length=5,
+    )
+    processor: str = Field(
+        default="base",
+        description="Search processor: 'base' (fast/low cost) or 'pro' (higher quality/freshness)",
+        pattern=r"^(base|pro)$",
+    )
+    max_results: int = Field(
+        default=10,
+        ge=1,
+        le=40,
+        description="Maximum number of search results to return (processor limits apply)",
+    )
+    max_chars_per_result: int = Field(
+        default=6000,
+        ge=100,
+        description="Maximum characters per result excerpt (values >30000 not guaranteed)",
+    )
+    source_policy: Optional[Dict[str, Any]] = Field(
+        default=None, description="Optional source policy configuration"
+    )
+
+
+class ParallelSearchTool(BaseTool):
+    name: str = "Parallel Web Search Tool"
+    description: str = (
+        "Search the web using Parallel's Search API (v1beta). Returns ranked results with "
+        "compressed excerpts optimized for LLMs."
+    )
+    args_schema: Type[BaseModel] = ParallelSearchInput
+
+    env_vars: List[EnvVar] = [
+        EnvVar(
+            name="PARALLEL_API_KEY",
+            description="API key for Parallel",
+            required=True,
+        ),
+    ]
+    package_dependencies: List[str] = ["requests"]
+
+    search_url: str = "https://api.parallel.ai/v1beta/search"
+
+    def _run(
+        self,
+        objective: Optional[str] = None,
+        search_queries: Optional[List[str]] = None,
+        processor: str = "base",
+        max_results: int = 10,
+        max_chars_per_result: int = 6000,
+        source_policy: Optional[Dict[str, Any]] = None,
+        **_: Any,
+    ) -> str:
+        api_key = os.environ.get("PARALLEL_API_KEY")
+        if not api_key:
+            return "Error: PARALLEL_API_KEY environment variable is required"
+
+        if not objective and not search_queries:
+            return "Error: Provide at least one of 'objective' or 'search_queries'"
+
+        headers = {
+            "x-api-key": api_key,
+            "Content-Type": "application/json",
+        }
+
+        try:
+            payload: Dict[str, Any] = {
+                "processor": processor,
+                "max_results": max_results,
+                "max_chars_per_result": max_chars_per_result,
+            }
+            if objective is not None:
+                payload["objective"] = objective
+            if search_queries is not None:
+                payload["search_queries"] = search_queries
+            if source_policy is not None:
+                payload["source_policy"] = source_policy
+
+            request_timeout = 90 if processor == "pro" else 30
+            resp = requests.post(self.search_url, json=payload, headers=headers, timeout=request_timeout)
+            if resp.status_code >= 300:
+                return f"Parallel Search API error: {resp.status_code} {resp.text[:200]}"
+            data = resp.json()
+            return self._format_output(data)
+        except requests.Timeout:
+            return "Parallel Search API timeout. Please try again later."
+        except Exception as exc:  # noqa: BLE001
+            return f"Unexpected error calling Parallel Search API: {exc}"
+
+    def _format_output(self, result: Dict[str, Any]) -> str:
+        # Return the full JSON payload (search_id + results) as a compact JSON string
+        try:
+            import json
+
+            return json.dumps(result or {}, ensure_ascii=False)
+        except Exception:
+            return str(result or {})
--- a/tests/tools/parallel_search_tool_test.py
+++ b/tests/tools/parallel_search_tool_test.py
@@ -0,0 +1,47 @@
+import os
+import json
+from urllib.parse import urlparse
+from unittest.mock import patch
+
+import pytest
+
+from crewai_tools.tools.parallel_tools.parallel_search_tool import (
+    ParallelSearchTool,
+)
+
+
+def test_requires_env_var(monkeypatch):
+    monkeypatch.delenv("PARALLEL_API_KEY", raising=False)
+    tool = ParallelSearchTool()
+    result = tool.run(objective="test")
+    assert "PARALLEL_API_KEY" in result
+
+
+@patch("crewai_tools.tools.parallel_tools.parallel_search_tool.requests.post")
+def test_happy_path(mock_post, monkeypatch):
+    monkeypatch.setenv("PARALLEL_API_KEY", "test")
+
+    mock_post.return_value.status_code = 200
+    mock_post.return_value.json.return_value = {
+        "search_id": "search_123",
+        "results": [
+            {
+                "url": "https://www.un.org/en/about-us/history-of-the-un",
+                "title": "History of the United Nations",
+                "excerpts": [
+                    "Four months after the San Francisco Conference ended, the United Nations officially began, on 24 October 1945..."
+                ],
+            }
+        ],
+    }
+
+    tool = ParallelSearchTool()
+    result = tool.run(objective="When was the UN established?", search_queries=["Founding year UN"]) 
+    data = json.loads(result)
+    assert "search_id" in data
+    urls = [r.get("url", "") for r in data.get("results", [])]
+    # Validate host against allowed set instead of substring matching
+    allowed_hosts = {"www.un.org", "un.org"}
+    assert any(urlparse(u).netloc in allowed_hosts for u in urls)
+
+