mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-05-03 08:12:39 +00:00
feat: add crewai-tools workspace and fix tests/dependencies
* feat: add crewai-tools workspace structure
* Squashed 'temp-crewai-tools/' content from commit 9bae5633
  git-subtree-dir: temp-crewai-tools
  git-subtree-split: 9bae56339096cb70f03873e600192bd2cd207ac9
* feat: configure crewai-tools workspace package with dependencies
* fix: apply ruff auto-formatting to crewai-tools code
* chore: update lockfile
* fix: don't allow tool tests yet
* fix: comment out extra pytest flags for now
* fix: remove conflicting conftest.py from crewai-tools tests
* fix: resolve dependency conflicts and test issues
  - Pin vcrpy to 7.0.0 to fix pytest-recording compatibility
  - Comment out types-requests to resolve urllib3 conflict
  - Update requests requirement in crewai-tools to >=2.32.0
This commit is contained in:
67
lib/crewai-tools/src/crewai_tools/rag/loaders/mdx_loader.py
Normal file
67
lib/crewai-tools/src/crewai_tools/rag/loaders/mdx_loader.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import re
|
||||
|
||||
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
||||
from crewai_tools.rag.source_content import SourceContent
|
||||
|
||||
|
||||
class MDXLoader(BaseLoader):
    """Loader that reduces MDX content to plain text.

    The content is taken from a URL, a local file, or the raw source value,
    then stripped of single-line ``import``/``export`` statements and
    JSX/HTML-style tags before being wrapped in a ``LoaderResult``.
    """

    def load(self, source_content: SourceContent, **kwargs) -> LoaderResult:
        """Load MDX from ``source_content`` and return the cleaned result.

        Args:
            source_content: Descriptor of the source. If it reports itself as
                a URL the content is fetched over HTTP; if its path exists
                the file is read; otherwise ``source_content.source`` is
                parsed as-is.
            **kwargs: Optional settings. Only ``headers`` is consulted, and
                only when fetching from a URL.

        Returns:
            A ``LoaderResult`` holding the cleaned text and its metadata.

        Raises:
            ValueError: If fetching the URL fails.
        """
        source_ref = source_content.source_ref
        content = source_content.source

        if source_content.is_url():
            content = self._load_from_url(source_ref, kwargs)
        elif source_content.path_exists():
            content = self._load_from_file(source_ref)

        return self._parse_mdx(content, source_ref)

    def _load_from_url(self, url: str, kwargs: dict) -> str:
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request.
            kwargs: Loader options; a ``headers`` entry, when present,
                replaces the default request headers entirely.

        Returns:
            The decoded response text.

        Raises:
            ValueError: On any failure while requesting the URL, including
                HTTP error statuses. The original exception is chained as
                the cause.
        """
        # Imported lazily, as in the original, so the module can be imported
        # without ``requests`` being loaded up front.
        import requests

        headers = kwargs.get(
            "headers",
            {
                "Accept": "text/markdown, text/x-markdown, text/plain",
                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools MDXLoader)",
            },
        )

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except Exception as e:
            # Chain the cause so the underlying network/HTTP error stays
            # visible in tracebacks.
            raise ValueError(f"Error fetching MDX from URL {url}: {e!s}") from e

    def _load_from_file(self, path: str) -> str:
        """Read the file at ``path`` as UTF-8 text and return its contents."""
        with open(path, "r", encoding="utf-8") as file:
            return file.read()

    def _parse_mdx(self, content: str, source_ref: str) -> LoaderResult:
        """Strip MDX-specific syntax from ``content`` and build the result.

        Args:
            content: Raw MDX text.
            source_ref: Reference recorded as the result's source and passed
                to ``generate_doc_id``.

        Returns:
            A ``LoaderResult`` with the cleaned text, ``source_ref``,
            ``{"format": "mdx"}`` metadata and a generated document id.
        """
        cleaned_content = content

        # Remove import statements. ``(?:\n|$)`` also matches an import on
        # the final line when the content has no trailing newline, mirroring
        # the export pattern below.
        cleaned_content = re.sub(
            r"^import\s+.*?(?:\n|$)", "", cleaned_content, flags=re.MULTILINE
        )

        # Remove export statements
        cleaned_content = re.sub(
            r"^export\s+.*?(?:\n|$)", "", cleaned_content, flags=re.MULTILINE
        )

        # Remove JSX tags (simple approach): anything between "<" and ">".
        cleaned_content = re.sub(r"<[^>]+>", "", cleaned_content)

        # Collapse runs of three or more (possibly whitespace-only) line
        # breaks into a single blank line, then trim the ends.
        cleaned_content = re.sub(r"\n\s*\n\s*\n", "\n\n", cleaned_content)
        cleaned_content = cleaned_content.strip()

        metadata = {"format": "mdx"}
        return LoaderResult(
            content=cleaned_content,
            source=source_ref,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=source_ref, content=cleaned_content),
        )
|
||||
Reference in New Issue
Block a user