Squashed 'packages/tools/' changes from 78317b9c..0b3f00e6

0b3f00e6 chore: update project version to 0.73.0 and revise uv.lock dependencies (#455)
ad19b074 feat: replace embedchain with native crewai adapter (#451)

git-subtree-dir: packages/tools
git-subtree-split: 0b3f00e67c0dae24d188c292dc99759fd1c841f7
Greyson LaLonde
2025-09-18 23:38:08 -04:00
parent e16606672a
commit c960f26601
35 changed files with 4897 additions and 3951 deletions

View File

@@ -112,7 +112,10 @@ class RecursiveCharacterTextSplitter:
if separator == "":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if self._keep_separator and separator == " ":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if doc:
docs.append(doc)
@@ -133,7 +136,10 @@ class RecursiveCharacterTextSplitter:
if separator == "":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if self._keep_separator and separator == " ":
doc = "".join(current_doc)
else:
doc = separator.join(current_doc)
if doc:
docs.append(doc)
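
Note: a minimal sketch of the join behavior this hunk changes, under the assumption that in keep-separator mode each split piece already carries its trailing separator, so joining those pieces with another space would double it. Names and sample strings below are purely illustrative.

pieces = ["The quick ", "brown fox"]   # hypothetical current_doc contents, separator kept on each piece

old_doc = " ".join(pieces)   # "The quick  brown fox"  (separator repeated)
new_doc = "".join(pieces)    # "The quick brown fox"   (separator appears once)

assert "  " in old_doc and "  " not in new_doc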

View File

@@ -25,6 +25,8 @@ class DataType(str, Enum):
# Web types
WEBSITE = "website"
DOCS_SITE = "docs_site"
YOUTUBE_VIDEO = "youtube_video"
YOUTUBE_CHANNEL = "youtube_channel"
# Raw types
TEXT = "text"
@@ -34,6 +36,7 @@ class DataType(str, Enum):
from importlib import import_module
chunkers = {
DataType.PDF_FILE: ("text_chunker", "TextChunker"),
DataType.TEXT_FILE: ("text_chunker", "TextChunker"),
DataType.TEXT: ("text_chunker", "TextChunker"),
DataType.DOCX: ("text_chunker", "DocxChunker"),
@@ -45,9 +48,18 @@ class DataType(str, Enum):
DataType.XML: ("structured_chunker", "XmlChunker"),
DataType.WEBSITE: ("web_chunker", "WebsiteChunker"),
DataType.DIRECTORY: ("text_chunker", "TextChunker"),
DataType.YOUTUBE_VIDEO: ("text_chunker", "TextChunker"),
DataType.YOUTUBE_CHANNEL: ("text_chunker", "TextChunker"),
DataType.GITHUB: ("text_chunker", "TextChunker"),
DataType.DOCS_SITE: ("text_chunker", "TextChunker"),
DataType.MYSQL: ("text_chunker", "TextChunker"),
DataType.POSTGRES: ("text_chunker", "TextChunker"),
}
module_name, class_name = chunkers.get(self, ("default_chunker", "DefaultChunker"))
if self not in chunkers:
raise ValueError(f"No chunker defined for {self}")
module_name, class_name = chunkers[self]
module_path = f"crewai_tools.rag.chunkers.{module_name}"
try:
@@ -60,6 +72,7 @@ class DataType(str, Enum):
from importlib import import_module
loaders = {
DataType.PDF_FILE: ("pdf_loader", "PDFLoader"),
DataType.TEXT_FILE: ("text_loader", "TextFileLoader"),
DataType.TEXT: ("text_loader", "TextLoader"),
DataType.XML: ("xml_loader", "XMLLoader"),
@@ -69,9 +82,17 @@ class DataType(str, Enum):
DataType.DOCX: ("docx_loader", "DOCXLoader"),
DataType.CSV: ("csv_loader", "CSVLoader"),
DataType.DIRECTORY: ("directory_loader", "DirectoryLoader"),
DataType.YOUTUBE_VIDEO: ("youtube_video_loader", "YoutubeVideoLoader"),
DataType.YOUTUBE_CHANNEL: ("youtube_channel_loader", "YoutubeChannelLoader"),
DataType.GITHUB: ("github_loader", "GithubLoader"),
DataType.DOCS_SITE: ("docs_site_loader", "DocsSiteLoader"),
DataType.MYSQL: ("mysql_loader", "MySQLLoader"),
DataType.POSTGRES: ("postgres_loader", "PostgresLoader"),
}
module_name, class_name = loaders.get(self, ("text_loader", "TextLoader"))
if self not in loaders:
raise ValueError(f"No loader defined for {self}")
module_name, class_name = loaders[self]
module_path = f"crewai_tools.rag.loaders.{module_name}"
try:
module = import_module(module_path)
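
Note: the chunker/loader resolution above is a table lookup followed by a dynamic import. A self-contained sketch of that pattern (the registry and function names here are hypothetical, not part of the diff):

from importlib import import_module

# Hypothetical registry mirroring the structure of the tables above:
# each key maps to (module_name, class_name) under a fixed package prefix.
REGISTRY = {
    "pdf_file": ("pdf_loader", "PDFLoader"),
    "youtube_video": ("youtube_video_loader", "YoutubeVideoLoader"),
}

def resolve(kind: str, package: str = "crewai_tools.rag.loaders"):
    if kind not in REGISTRY:
        # Mirrors the new behavior: unknown kinds fail loudly instead of
        # silently falling back to a default class.
        raise ValueError(f"No loader defined for {kind}")
    module_name, class_name = REGISTRY[kind]
    module = import_module(f"{package}.{module_name}")
    return getattr(module, class_name)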

View File

@@ -6,6 +6,9 @@ from crewai_tools.rag.loaders.json_loader import JSONLoader
from crewai_tools.rag.loaders.docx_loader import DOCXLoader
from crewai_tools.rag.loaders.csv_loader import CSVLoader
from crewai_tools.rag.loaders.directory_loader import DirectoryLoader
from crewai_tools.rag.loaders.pdf_loader import PDFLoader
from crewai_tools.rag.loaders.youtube_video_loader import YoutubeVideoLoader
from crewai_tools.rag.loaders.youtube_channel_loader import YoutubeChannelLoader
__all__ = [
"TextFileLoader",
@@ -17,4 +20,7 @@ __all__ = [
"DOCXLoader",
"CSVLoader",
"DirectoryLoader",
"PDFLoader",
"YoutubeVideoLoader",
"YoutubeChannelLoader",
]

View File

@@ -0,0 +1,98 @@
"""Documentation site loader."""
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class DocsSiteLoader(BaseLoader):
"""Loader for documentation websites."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load content from a documentation site.
Args:
source: Documentation site URL
**kwargs: Additional arguments
Returns:
LoaderResult with documentation content
"""
docs_url = source.source
try:
response = requests.get(docs_url, timeout=30)
response.raise_for_status()
except requests.RequestException as e:
raise ValueError(f"Unable to fetch documentation from {docs_url}: {e}")
soup = BeautifulSoup(response.text, "html.parser")
for script in soup(["script", "style"]):
script.decompose()
title = soup.find("title")
title_text = title.get_text(strip=True) if title else "Documentation"
main_content = None
for selector in ["main", "article", '[role="main"]', ".content", "#content", ".documentation"]:
main_content = soup.select_one(selector)
if main_content:
break
if not main_content:
main_content = soup.find("body")
if not main_content:
raise ValueError(f"Unable to extract content from documentation site: {docs_url}")
text_parts = [f"Title: {title_text}", ""]
headings = main_content.find_all(["h1", "h2", "h3"])
if headings:
text_parts.append("Table of Contents:")
for heading in headings[:15]:
level = int(heading.name[1])
indent = " " * (level - 1)
text_parts.append(f"{indent}- {heading.get_text(strip=True)}")
text_parts.append("")
text = main_content.get_text(separator="\n", strip=True)
lines = [line.strip() for line in text.split("\n") if line.strip()]
text_parts.extend(lines)
nav_links = []
for nav_selector in ["nav", ".sidebar", ".toc", ".navigation"]:
nav = soup.select_one(nav_selector)
if nav:
links = nav.find_all("a", href=True)
for link in links[:20]:
href = link["href"]
if not href.startswith(("http://", "https://", "mailto:", "#")):
full_url = urljoin(docs_url, href)
nav_links.append(f"- {link.get_text(strip=True)}: {full_url}")
if nav_links:
text_parts.append("")
text_parts.append("Related documentation pages:")
text_parts.extend(nav_links[:10])
content = "\n".join(text_parts)
if len(content) > 100000:
content = content[:100000] + "\n\n[Content truncated...]"
return LoaderResult(
content=content,
metadata={
"source": docs_url,
"title": title_text,
"domain": urlparse(docs_url).netloc
},
doc_id=self.generate_doc_id(source_ref=docs_url, content=content)
)
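
Note: a minimal usage sketch for the new loader, assuming SourceContent can be built directly from the documentation URL (its constructor signature is not shown in this diff); the URL is a placeholder.

from crewai_tools.rag.loaders.docs_site_loader import DocsSiteLoader
from crewai_tools.rag.source_content import SourceContent

loader = DocsSiteLoader()
result = loader.load(SourceContent("https://docs.example.com/guide/"))  # hypothetical docs site
print(result.metadata["title"], result.metadata["domain"])
print(result.content[:500])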

View File

@@ -0,0 +1,110 @@
"""GitHub repository content loader."""
from typing import Any
from github import Github, GithubException
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class GithubLoader(BaseLoader):
"""Loader for GitHub repository content."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load content from a GitHub repository.
Args:
source: GitHub repository URL
**kwargs: Additional arguments including gh_token and content_types
Returns:
LoaderResult with repository content
"""
metadata = kwargs.get("metadata", {})
gh_token = metadata.get("gh_token")
content_types = metadata.get("content_types", ["code", "repo"])
repo_url = source.source
if not repo_url.startswith("https://github.com/"):
raise ValueError(f"Invalid GitHub URL: {repo_url}")
parts = repo_url.replace("https://github.com/", "").strip("/").split("/")
if len(parts) < 2:
raise ValueError(f"Invalid GitHub repository URL: {repo_url}")
repo_name = f"{parts[0]}/{parts[1]}"
g = Github(gh_token) if gh_token else Github()
try:
repo = g.get_repo(repo_name)
except GithubException as e:
raise ValueError(f"Unable to access repository {repo_name}: {e}")
all_content = []
if "repo" in content_types:
all_content.append(f"Repository: {repo.full_name}")
all_content.append(f"Description: {repo.description or 'No description'}")
all_content.append(f"Language: {repo.language or 'Not specified'}")
all_content.append(f"Stars: {repo.stargazers_count}")
all_content.append(f"Forks: {repo.forks_count}")
all_content.append("")
if "code" in content_types:
try:
readme = repo.get_readme()
all_content.append("README:")
all_content.append(readme.decoded_content.decode("utf-8", errors="ignore"))
all_content.append("")
except GithubException:
pass
try:
contents = repo.get_contents("")
if isinstance(contents, list):
all_content.append("Repository structure:")
for content_file in contents[:20]:
all_content.append(f"- {content_file.path} ({content_file.type})")
all_content.append("")
except GithubException:
pass
if "pr" in content_types:
prs = repo.get_pulls(state="open")
pr_list = list(prs[:5])
if pr_list:
all_content.append("Recent Pull Requests:")
for pr in pr_list:
all_content.append(f"- PR #{pr.number}: {pr.title}")
if pr.body:
body_preview = pr.body[:200].replace("\n", " ")
all_content.append(f" {body_preview}")
all_content.append("")
if "issue" in content_types:
issues = repo.get_issues(state="open")
issue_list = [i for i in list(issues[:10]) if not i.pull_request][:5]
if issue_list:
all_content.append("Recent Issues:")
for issue in issue_list:
all_content.append(f"- Issue #{issue.number}: {issue.title}")
if issue.body:
body_preview = issue.body[:200].replace("\n", " ")
all_content.append(f" {body_preview}")
all_content.append("")
if not all_content:
raise ValueError(f"No content could be loaded from repository: {repo_url}")
content = "\n".join(all_content)
return LoaderResult(
content=content,
metadata={
"source": repo_url,
"repo": repo_name,
"content_types": content_types
},
doc_id=self.generate_doc_id(source_ref=repo_url, content=content)
)
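
Note: a usage sketch, assuming SourceContent wraps the repository URL; gh_token and content_types travel in the metadata kwarg exactly as the loader reads them above. The token value is a placeholder, and the token itself is optional but avoids unauthenticated rate limits.

from crewai_tools.rag.loaders.github_loader import GithubLoader
from crewai_tools.rag.source_content import SourceContent

loader = GithubLoader()
result = loader.load(
    SourceContent("https://github.com/crewAIInc/crewAI"),
    metadata={
        "gh_token": "ghp_...",                       # optional; placeholder value
        "content_types": ["repo", "code", "issue"],  # any subset of repo/code/pr/issue
    },
)
print(result.metadata["repo"], result.metadata["content_types"])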

View File

@@ -0,0 +1,99 @@
"""MySQL database loader."""
from typing import Any
from urllib.parse import urlparse
import pymysql
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class MySQLLoader(BaseLoader):
"""Loader for MySQL database content."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load content from a MySQL database table.
Args:
source: SQL query (e.g., "SELECT * FROM table_name")
**kwargs: Additional arguments including db_uri
Returns:
LoaderResult with database content
"""
metadata = kwargs.get("metadata", {})
db_uri = metadata.get("db_uri")
if not db_uri:
raise ValueError("Database URI is required for MySQL loader")
query = source.source
parsed = urlparse(db_uri)
if parsed.scheme not in ["mysql", "mysql+pymysql"]:
raise ValueError(f"Invalid MySQL URI scheme: {parsed.scheme}")
connection_params = {
"host": parsed.hostname or "localhost",
"port": parsed.port or 3306,
"user": parsed.username,
"password": parsed.password,
"database": parsed.path.lstrip("/") if parsed.path else None,
"charset": "utf8mb4",
"cursorclass": pymysql.cursors.DictCursor
}
if not connection_params["database"]:
raise ValueError("Database name is required in the URI")
try:
connection = pymysql.connect(**connection_params)
try:
with connection.cursor() as cursor:
cursor.execute(query)
rows = cursor.fetchall()
if not rows:
content = "No data found in the table"
return LoaderResult(
content=content,
metadata={"source": query, "row_count": 0},
doc_id=self.generate_doc_id(source_ref=query, content=content)
)
text_parts = []
columns = list(rows[0].keys())
text_parts.append(f"Columns: {', '.join(columns)}")
text_parts.append(f"Total rows: {len(rows)}")
text_parts.append("")
for i, row in enumerate(rows, 1):
text_parts.append(f"Row {i}:")
for col, val in row.items():
if val is not None:
text_parts.append(f" {col}: {val}")
text_parts.append("")
content = "\n".join(text_parts)
if len(content) > 100000:
content = content[:100000] + "\n\n[Content truncated...]"
return LoaderResult(
content=content,
metadata={
"source": query,
"database": connection_params["database"],
"row_count": len(rows),
"columns": columns
},
doc_id=self.generate_doc_id(source_ref=query, content=content)
)
finally:
connection.close()
except pymysql.Error as e:
raise ValueError(f"MySQL database error: {e}")
except Exception as e:
raise ValueError(f"Failed to load data from MySQL: {e}")

View File

@@ -0,0 +1,72 @@
"""PDF loader for extracting text from PDF files."""
import os
from pathlib import Path
from typing import Any
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class PDFLoader(BaseLoader):
"""Loader for PDF files."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load and extract text from a PDF file.
Args:
source: The source content containing the PDF file path
Returns:
LoaderResult with extracted text content
Raises:
FileNotFoundError: If the PDF file doesn't exist
ImportError: If required PDF libraries aren't installed
"""
try:
import pypdf
except ImportError:
try:
import PyPDF2 as pypdf
except ImportError:
raise ImportError(
"PDF support requires pypdf or PyPDF2. "
"Install with: uv add pypdf"
)
file_path = source.source
if not os.path.isfile(file_path):
raise FileNotFoundError(f"PDF file not found: {file_path}")
text_content = []
metadata: dict[str, Any] = {
"source": str(file_path),
"file_name": Path(file_path).name,
"file_type": "pdf"
}
try:
with open(file_path, 'rb') as file:
pdf_reader = pypdf.PdfReader(file)
metadata["num_pages"] = len(pdf_reader.pages)
for page_num, page in enumerate(pdf_reader.pages, 1):
page_text = page.extract_text()
if page_text.strip():
text_content.append(f"Page {page_num}:\n{page_text}")
except Exception as e:
raise ValueError(f"Error reading PDF file {file_path}: {str(e)}")
if not text_content:
content = f"[PDF file with no extractable text: {Path(file_path).name}]"
else:
content = "\n\n".join(text_content)
return LoaderResult(
content=content,
source=str(file_path),
metadata=metadata,
doc_id=self.generate_doc_id(source_ref=str(file_path), content=content)
)
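
Note: a usage sketch for the PDF loader; the file path is a placeholder and pypdf (or PyPDF2) must be installed, as the import logic above requires.

from crewai_tools.rag.loaders.pdf_loader import PDFLoader
from crewai_tools.rag.source_content import SourceContent

loader = PDFLoader()
result = loader.load(SourceContent("/path/to/report.pdf"))  # hypothetical path
print(result.metadata["num_pages"])
print(result.content[:300])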

View File

@@ -0,0 +1,99 @@
"""PostgreSQL database loader."""
from typing import Any
from urllib.parse import urlparse
import psycopg2
from psycopg2.extras import RealDictCursor
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class PostgresLoader(BaseLoader):
"""Loader for PostgreSQL database content."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load content from a PostgreSQL database table.
Args:
source: SQL query (e.g., "SELECT * FROM table_name")
**kwargs: Additional arguments including db_uri
Returns:
LoaderResult with database content
"""
metadata = kwargs.get("metadata", {})
db_uri = metadata.get("db_uri")
if not db_uri:
raise ValueError("Database URI is required for PostgreSQL loader")
query = source.source
parsed = urlparse(db_uri)
if parsed.scheme not in ["postgresql", "postgres", "postgresql+psycopg2"]:
raise ValueError(f"Invalid PostgreSQL URI scheme: {parsed.scheme}")
connection_params = {
"host": parsed.hostname or "localhost",
"port": parsed.port or 5432,
"user": parsed.username,
"password": parsed.password,
"database": parsed.path.lstrip("/") if parsed.path else None,
"cursor_factory": RealDictCursor
}
if not connection_params["database"]:
raise ValueError("Database name is required in the URI")
try:
connection = psycopg2.connect(**connection_params)
try:
with connection.cursor() as cursor:
cursor.execute(query)
rows = cursor.fetchall()
if not rows:
content = "No data found in the table"
return LoaderResult(
content=content,
metadata={"source": query, "row_count": 0},
doc_id=self.generate_doc_id(source_ref=query, content=content)
)
text_parts = []
columns = list(rows[0].keys())
text_parts.append(f"Columns: {', '.join(columns)}")
text_parts.append(f"Total rows: {len(rows)}")
text_parts.append("")
for i, row in enumerate(rows, 1):
text_parts.append(f"Row {i}:")
for col, val in row.items():
if val is not None:
text_parts.append(f" {col}: {val}")
text_parts.append("")
content = "\n".join(text_parts)
if len(content) > 100000:
content = content[:100000] + "\n\n[Content truncated...]"
return LoaderResult(
content=content,
metadata={
"source": query,
"database": connection_params["database"],
"row_count": len(rows),
"columns": columns
},
doc_id=self.generate_doc_id(source_ref=query, content=content)
)
finally:
connection.close()
except psycopg2.Error as e:
raise ValueError(f"PostgreSQL database error: {e}")
except Exception as e:
raise ValueError(f"Failed to load data from PostgreSQL: {e}")

View File

@@ -11,7 +11,7 @@ class XMLLoader(BaseLoader):
if source_content.is_url():
content = self._load_from_url(source_ref, kwargs)
elif os.path.exists(source_ref):
elif source_content.path_exists():
content = self._load_from_file(source_ref)
return self._parse_xml(content, source_ref)

View File

@@ -0,0 +1,141 @@
"""YouTube channel loader for extracting content from YouTube channels."""
import re
from typing import Any
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class YoutubeChannelLoader(BaseLoader):
"""Loader for YouTube channels."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load and extract content from a YouTube channel.
Args:
source: The source content containing the YouTube channel URL
Returns:
LoaderResult with channel content
Raises:
ImportError: If required YouTube libraries aren't installed
ValueError: If the URL is not a valid YouTube channel URL
"""
try:
from pytube import Channel
except ImportError:
raise ImportError(
"YouTube channel support requires pytube. "
"Install with: uv add pytube"
)
channel_url = source.source
if not any(pattern in channel_url for pattern in ['youtube.com/channel/', 'youtube.com/c/', 'youtube.com/@', 'youtube.com/user/']):
raise ValueError(f"Invalid YouTube channel URL: {channel_url}")
metadata: dict[str, Any] = {
"source": channel_url,
"data_type": "youtube_channel"
}
try:
channel = Channel(channel_url)
metadata["channel_name"] = channel.channel_name
metadata["channel_id"] = channel.channel_id
max_videos = kwargs.get('max_videos', 10)
video_urls = list(channel.video_urls)[:max_videos]
metadata["num_videos_loaded"] = len(video_urls)
metadata["total_videos"] = len(list(channel.video_urls))
content_parts = [
f"YouTube Channel: {channel.channel_name}",
f"Channel ID: {channel.channel_id}",
f"Total Videos: {metadata['total_videos']}",
f"Videos Loaded: {metadata['num_videos_loaded']}",
"\n--- Video Summaries ---\n"
]
try:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
for i, video_url in enumerate(video_urls, 1):
try:
video_id = self._extract_video_id(video_url)
if not video_id:
continue
yt = YouTube(video_url)
title = yt.title or f"Video {i}"
description = yt.description[:200] if yt.description else "No description"
content_parts.append(f"\n{i}. {title}")
content_parts.append(f" URL: {video_url}")
content_parts.append(f" Description: {description}...")
try:
api = YouTubeTranscriptApi()
transcript_list = api.list(video_id)
transcript = None
try:
transcript = transcript_list.find_transcript(['en'])
except:
try:
transcript = transcript_list.find_generated_transcript(['en'])
except:
transcript = next(iter(transcript_list), None)
if transcript:
transcript_data = transcript.fetch()
text_parts = []
char_count = 0
for entry in transcript_data:
text = entry.text.strip() if hasattr(entry, 'text') else ''
if text:
text_parts.append(text)
char_count += len(text)
if char_count > 500:
break
if text_parts:
preview = ' '.join(text_parts)[:500]
content_parts.append(f" Transcript Preview: {preview}...")
except:
content_parts.append(" Transcript: Not available")
except Exception as e:
content_parts.append(f"\n{i}. Error loading video: {str(e)}")
except ImportError:
for i, video_url in enumerate(video_urls, 1):
content_parts.append(f"\n{i}. {video_url}")
content = '\n'.join(content_parts)
except Exception as e:
raise ValueError(f"Unable to load YouTube channel {channel_url}: {str(e)}") from e
return LoaderResult(
content=content,
source=channel_url,
metadata=metadata,
doc_id=self.generate_doc_id(source_ref=channel_url, content=content)
)
def _extract_video_id(self, url: str) -> str | None:
"""Extract video ID from YouTube URL."""
patterns = [
r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([^&\n?#]+)',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
return None
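
Note: a sketch of loading a channel; max_videos is read straight from kwargs as shown above, and the channel URL is a placeholder. pytube is required, with youtube-transcript-api used opportunistically for transcript previews.

from crewai_tools.rag.loaders.youtube_channel_loader import YoutubeChannelLoader
from crewai_tools.rag.source_content import SourceContent

loader = YoutubeChannelLoader()
result = loader.load(
    SourceContent("https://www.youtube.com/@examplechannel"),  # hypothetical channel
    max_videos=5,  # limit how many videos get summarized; defaults to 10
)
print(result.metadata["channel_name"], result.metadata["num_videos_loaded"])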

View File

@@ -0,0 +1,123 @@
"""YouTube video loader for extracting transcripts from YouTube videos."""
import re
from typing import Any
from urllib.parse import urlparse, parse_qs
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
from crewai_tools.rag.source_content import SourceContent
class YoutubeVideoLoader(BaseLoader):
"""Loader for YouTube videos."""
def load(self, source: SourceContent, **kwargs) -> LoaderResult:
"""Load and extract transcript from a YouTube video.
Args:
source: The source content containing the YouTube URL
Returns:
LoaderResult with transcript content
Raises:
ImportError: If required YouTube libraries aren't installed
ValueError: If the URL is not a valid YouTube video URL
"""
try:
from youtube_transcript_api import YouTubeTranscriptApi
except ImportError:
raise ImportError(
"YouTube support requires youtube-transcript-api. "
"Install with: uv add youtube-transcript-api"
)
video_url = source.source
video_id = self._extract_video_id(video_url)
if not video_id:
raise ValueError(f"Invalid YouTube URL: {video_url}")
metadata: dict[str, Any] = {
"source": video_url,
"video_id": video_id,
"data_type": "youtube_video"
}
try:
api = YouTubeTranscriptApi()
transcript_list = api.list(video_id)
transcript = None
try:
transcript = transcript_list.find_transcript(['en'])
except:
try:
transcript = transcript_list.find_generated_transcript(['en'])
except:
transcript = next(iter(transcript_list))
if transcript:
metadata["language"] = transcript.language
metadata["is_generated"] = transcript.is_generated
transcript_data = transcript.fetch()
text_content = []
for entry in transcript_data:
text = entry.text.strip() if hasattr(entry, 'text') else ''
if text:
text_content.append(text)
content = ' '.join(text_content)
try:
from pytube import YouTube
yt = YouTube(video_url)
metadata["title"] = yt.title
metadata["author"] = yt.author
metadata["length_seconds"] = yt.length
metadata["description"] = yt.description[:500] if yt.description else None
if yt.title:
content = f"Title: {yt.title}\n\nAuthor: {yt.author or 'Unknown'}\n\nTranscript:\n{content}"
except:
pass
else:
raise ValueError(f"No transcript available for YouTube video: {video_id}")
except Exception as e:
raise ValueError(f"Unable to extract transcript from YouTube video {video_id}: {str(e)}") from e
return LoaderResult(
content=content,
source=video_url,
metadata=metadata,
doc_id=self.generate_doc_id(source_ref=video_url, content=content)
)
def _extract_video_id(self, url: str) -> str | None:
"""Extract video ID from various YouTube URL formats."""
patterns = [
r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([^&\n?#]+)',
]
for pattern in patterns:
match = re.search(pattern, url)
if match:
return match.group(1)
try:
parsed = urlparse(url)
hostname = parsed.hostname
if hostname:
hostname_lower = hostname.lower()
# Allow youtube.com and any subdomain of youtube.com, plus youtu.be shortener
if hostname_lower == 'youtube.com' or hostname_lower.endswith('.youtube.com') or hostname_lower == 'youtu.be':
query_params = parse_qs(parsed.query)
if 'v' in query_params:
return query_params['v'][0]
except:
pass
return None
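
Note: a sketch for a single video; the URL is only a placeholder, youtube-transcript-api is required, and pytube is used opportunistically for title/author enrichment as shown above.

from crewai_tools.rag.loaders.youtube_video_loader import YoutubeVideoLoader
from crewai_tools.rag.source_content import SourceContent

loader = YoutubeVideoLoader()
result = loader.load(SourceContent("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # placeholder URL
print(result.metadata["video_id"], result.metadata.get("language"))
print(result.content[:300])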

View File

@@ -1,4 +1,29 @@
import hashlib
from typing import Any
def compute_sha256(content: str) -> str:
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def sanitize_metadata_for_chromadb(metadata: dict[str, Any]) -> dict[str, Any]:
"""Sanitize metadata to ensure ChromaDB compatibility.
ChromaDB only accepts str, int, float, or bool values in metadata.
This function converts other types to strings.
Args:
metadata: Dictionary of metadata to sanitize
Returns:
Sanitized metadata dictionary with only ChromaDB-compatible types
"""
sanitized = {}
for key, value in metadata.items():
if isinstance(value, (str, int, float, bool)) or value is None:
sanitized[key] = value
elif isinstance(value, (list, tuple)):
# Convert lists/tuples to pipe-separated strings
sanitized[key] = " | ".join(str(v) for v in value)
else:
# Convert other types to string
sanitized[key] = str(value)
return sanitized
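
Note: a quick example of what the sanitizer does to a mixed metadata dict (values are purely illustrative):

meta = {
    "source": "https://example.com/doc",   # str passes through unchanged
    "row_count": 42,                        # int passes through unchanged
    "columns": ["id", "title", "body"],     # list becomes "id | title | body"
    "parsed": {"nested": True},             # other types become their str() form
}
print(sanitize_metadata_for_chromadb(meta))
# {'source': 'https://example.com/doc', 'row_count': 42,
#  'columns': 'id | title | body', 'parsed': "{'nested': True}"}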