feat: add crewai-tools workspace and fix tests/dependencies

* feat: add crewai-tools workspace structure * Squashed 'temp-crewai-tools/' content from commit 9bae5633 git-subtree-dir: temp-crewai-tools git-subtree-split: 9bae56339096cb70f03873e600192bd2cd207ac9 * feat: configure crewai-tools workspace package with dependencies * fix: apply ruff auto-formatting to crewai-tools code * chore: update lockfile * fix: don't allow tool tests yet * fix: comment out extra pytest flags for now * fix: remove conflicting conftest.py from crewai-tools tests * fix: resolve dependency conflicts and test issues - Pin vcrpy to 7.0.0 to fix pytest-recording compatibility - Comment out types-requests to resolve urllib3 conflict - Update requests requirement in crewai-tools to >=2.32.0
2026-05-03 08:12:39 +00:00 · 2025-09-28 00:05:42 -04:00
parent c591c1ac87
commit 289b90f00a
304 changed files with 46489 additions and 376 deletions
--- a/lib/crewai-tools/src/crewai_tools/rag/loaders/mdx_loader.py
+++ b/lib/crewai-tools/src/crewai_tools/rag/loaders/mdx_loader.py
@@ -0,0 +1,67 @@
+import re
+
+from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
+from crewai_tools.rag.source_content import SourceContent
+
+
+class MDXLoader(BaseLoader):
+    def load(self, source_content: SourceContent, **kwargs) -> LoaderResult:
+        source_ref = source_content.source_ref
+        content = source_content.source
+
+        if source_content.is_url():
+            content = self._load_from_url(source_ref, kwargs)
+        elif source_content.path_exists():
+            content = self._load_from_file(source_ref)
+
+        return self._parse_mdx(content, source_ref)
+
+    def _load_from_url(self, url: str, kwargs: dict) -> str:
+        import requests
+
+        headers = kwargs.get(
+            "headers",
+            {
+                "Accept": "text/markdown, text/x-markdown, text/plain",
+                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools MDXLoader)",
+            },
+        )
+
+        try:
+            response = requests.get(url, headers=headers, timeout=30)
+            response.raise_for_status()
+            return response.text
+        except Exception as e:
+            raise ValueError(f"Error fetching MDX from URL {url}: {e!s}")
+
+    def _load_from_file(self, path: str) -> str:
+        with open(path, "r", encoding="utf-8") as file:
+            return file.read()
+
+    def _parse_mdx(self, content: str, source_ref: str) -> LoaderResult:
+        cleaned_content = content
+
+        # Remove import statements
+        cleaned_content = re.sub(
+            r"^import\s+.*?\n", "", cleaned_content, flags=re.MULTILINE
+        )
+
+        # Remove export statements
+        cleaned_content = re.sub(
+            r"^export\s+.*?(?:\n|$)", "", cleaned_content, flags=re.MULTILINE
+        )
+
+        # Remove JSX tags (simple approach)
+        cleaned_content = re.sub(r"<[^>]+>", "", cleaned_content)
+
+        # Clean up extra whitespace
+        cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content)
+        cleaned_content = cleaned_content.strip()
+
+        metadata = {"format": "mdx"}
+        return LoaderResult(
+            content=cleaned_content,
+            source=source_ref,
+            metadata=metadata,
+            doc_id=self.generate_doc_id(source_ref=source_ref, content=cleaned_content),
+        )