Mirror of https://github.com/crewAIInc/crewAI.git (synced 2026-01-26 00:28:13 +00:00)
Squashed 'packages/tools/' changes from 78317b9c..0b3f00e6
0b3f00e6 chore: update project version to 0.73.0 and revise uv.lock dependencies (#455)
ad19b074 feat: replace embedchain with native crewai adapter (#451)

git-subtree-dir: packages/tools
git-subtree-split: 0b3f00e67c0dae24d188c292dc99759fd1c841f7
crewai_tools/rag/loaders/docs_site_loader.py (new file, 98 lines)
@@ -0,0 +1,98 @@
"""Documentation site loader."""

from typing import Any
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent


class DocsSiteLoader(BaseLoader):
    """Loader for documentation websites."""

    def load(self, source: SourceContent, **kwargs) -> LoaderResult:
        """Load content from a documentation site.

        Args:
            source: Documentation site URL
            **kwargs: Additional arguments

        Returns:
            LoaderResult with documentation content
        """
        docs_url = source.source

        try:
            response = requests.get(docs_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            raise ValueError(f"Unable to fetch documentation from {docs_url}: {e}")

        soup = BeautifulSoup(response.text, "html.parser")

        for script in soup(["script", "style"]):
            script.decompose()

        title = soup.find("title")
        title_text = title.get_text(strip=True) if title else "Documentation"

        main_content = None
        for selector in ["main", "article", '[role="main"]', ".content", "#content", ".documentation"]:
            main_content = soup.select_one(selector)
            if main_content:
                break

        if not main_content:
            main_content = soup.find("body")

        if not main_content:
            raise ValueError(f"Unable to extract content from documentation site: {docs_url}")

        text_parts = [f"Title: {title_text}", ""]

        headings = main_content.find_all(["h1", "h2", "h3"])
        if headings:
            text_parts.append("Table of Contents:")
            for heading in headings[:15]:
                level = int(heading.name[1])
                indent = " " * (level - 1)
                text_parts.append(f"{indent}- {heading.get_text(strip=True)}")
            text_parts.append("")

        text = main_content.get_text(separator="\n", strip=True)
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        text_parts.extend(lines)

        nav_links = []
        for nav_selector in ["nav", ".sidebar", ".toc", ".navigation"]:
            nav = soup.select_one(nav_selector)
            if nav:
                links = nav.find_all("a", href=True)
                for link in links[:20]:
                    href = link["href"]
                    if not href.startswith(("http://", "https://", "mailto:", "#")):
                        full_url = urljoin(docs_url, href)
                        nav_links.append(f"- {link.get_text(strip=True)}: {full_url}")

        if nav_links:
            text_parts.append("")
            text_parts.append("Related documentation pages:")
            text_parts.extend(nav_links[:10])

        content = "\n".join(text_parts)

        if len(content) > 100000:
            content = content[:100000] + "\n\n[Content truncated...]"

        return LoaderResult(
            content=content,
            metadata={
                "source": docs_url,
                "title": title_text,
                "domain": urlparse(docs_url).netloc
            },
            doc_id=self.generate_doc_id(source_ref=docs_url, content=content)
        )
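
For orientation, here is a minimal sketch of how the new loader might be driven end to end. It assumes SourceContent accepts the URL via a source keyword argument and that DocsSiteLoader can be instantiated without arguments; neither signature is confirmed by this diff, which only shows source.source being read inside load(). The docs URL is likewise only a placeholder.

# Minimal usage sketch; constructor signatures below are assumptions,
# only load() and the LoaderResult fields appear in the file above.
from crewai_tools.rag.loaders.docs_site_loader import DocsSiteLoader
from crewai_tools.rag.source_content import SourceContent

loader = DocsSiteLoader()
result = loader.load(SourceContent(source="https://docs.example.com"))  # placeholder URL

print(result.metadata["title"])    # page <title>, or "Documentation" if absent
print(result.metadata["domain"])   # netloc of the fetched URL
print(result.content[:500])        # "Title: ..." header, optional TOC, then page text
print(result.doc_id)               # derived from the URL and the extracted content

Note that load() prepends a "Title:" line and, when headings are present, a short table of contents before the page text, appends up to ten navigation links found in the site's nav or sidebar, and truncates the final string at 100,000 characters.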