mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-11 17:18:29 +00:00
0b3f00e6 chore: update project version to 0.73.0 and revise uv.lock dependencies (#455) ad19b074 feat: replace embedchain with native crewai adapter (#451) git-subtree-dir: packages/tools git-subtree-split: 0b3f00e67c0dae24d188c292dc99759fd1c841f7
110 lines
4.2 KiB
Python
110 lines
4.2 KiB
Python
"""GitHub repository content loader."""
|
|
|
|
from typing import Any
|
|
|
|
from github import Github, GithubException
|
|
|
|
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
|
from crewai_tools.rag.source_content import SourceContent
|
|
|
|
|
|
class GithubLoader(BaseLoader):
    """Loader for GitHub repository content.

    Builds a plain-text digest of a repository (summary, README, top-level
    file listing, open pull requests, open issues) using the GitHub API.
    """

    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:
        """Load content from a GitHub repository.

        Args:
            source: Source whose ``source`` attribute is a repository URL
                starting with ``https://github.com/``.
            **kwargs: Additional arguments. Only ``metadata`` is read; it is
                an optional dict that may contain:

                * ``gh_token``: GitHub access token. When missing or empty an
                  unauthenticated client is used.
                * ``content_types``: collection of sections to include, any
                  of ``"repo"``, ``"code"``, ``"pr"``, ``"issue"``.
                  Defaults to ``["code", "repo"]``.

        Returns:
            LoaderResult with the joined repository content, a metadata dict
            (``source``, ``repo``, ``content_types``) and a generated doc id.

        Raises:
            ValueError: If the URL is not a valid GitHub repository URL, the
                repository cannot be accessed, or no content was collected.
        """
        # ``or {}`` also covers an explicit ``metadata=None``.
        metadata = kwargs.get("metadata") or {}
        gh_token = metadata.get("gh_token")
        content_types = metadata.get("content_types")
        if content_types is None:
            content_types = ["code", "repo"]

        repo_url = source.source
        repo_name = self._parse_repo_name(repo_url)

        g = Github(gh_token) if gh_token else Github()

        try:
            repo = g.get_repo(repo_name)
        except GithubException as e:
            # Chain the cause so the original API error stays in the traceback.
            raise ValueError(f"Unable to access repository {repo_name}: {e}") from e

        all_content = []

        if "repo" in content_types:
            all_content.extend(self._repo_summary_lines(repo))

        if "code" in content_types:
            all_content.extend(self._code_overview_lines(repo))

        if "pr" in content_types:
            all_content.extend(self._pull_request_lines(repo))

        if "issue" in content_types:
            all_content.extend(self._issue_lines(repo))

        if not all_content:
            raise ValueError(f"No content could be loaded from repository: {repo_url}")

        content = "\n".join(all_content)
        return LoaderResult(
            content=content,
            metadata={
                "source": repo_url,
                "repo": repo_name,
                "content_types": content_types,
            },
            doc_id=self.generate_doc_id(source_ref=repo_url, content=content),
        )

    @staticmethod
    def _parse_repo_name(repo_url: str) -> str:
        """Return ``owner/name`` extracted from a ``https://github.com/`` URL.

        Raises:
            ValueError: If the URL lacks the GitHub prefix or does not contain
                both a non-empty owner and a non-empty repository name.
        """
        prefix = "https://github.com/"
        if not repo_url.startswith(prefix):
            raise ValueError(f"Invalid GitHub URL: {repo_url}")

        # Slice rather than str.replace so only the leading prefix is removed.
        path = repo_url[len(prefix):]
        # A query string or fragment is not part of the repository path.
        for separator in ("?", "#"):
            path = path.split(separator, 1)[0]

        parts = path.strip("/").split("/")
        if len(parts) < 2:
            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")

        owner, name = parts[0], parts[1]
        # Clone-style URLs end in ".git"; the API expects the bare name.
        if name.endswith(".git"):
            name = name[: -len(".git")]
        # Catches inputs such as "owner//repo" or "owner/.git".
        if not owner or not name:
            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")

        return f"{owner}/{name}"

    @staticmethod
    def _preview(text: str, *, limit: int = 200) -> str:
        """Return the first ``limit`` characters of ``text`` on a single line."""
        return text[:limit].replace("\n", " ")

    @staticmethod
    def _repo_summary_lines(repo: Any) -> list:
        """Return the summary section: name, description, language, stars, forks."""
        return [
            f"Repository: {repo.full_name}",
            f"Description: {repo.description or 'No description'}",
            f"Language: {repo.language or 'Not specified'}",
            f"Stars: {repo.stargazers_count}",
            f"Forks: {repo.forks_count}",
            "",
        ]

    @staticmethod
    def _code_overview_lines(repo: Any, *, max_entries: int = 20) -> list:
        """Return the README text and a listing of the repository root.

        Both parts are best-effort: a GitHub API error for one of them simply
        leaves that part out.
        """
        lines = []

        try:
            readme_text = repo.get_readme().decoded_content.decode(
                "utf-8", errors="ignore"
            )
        except GithubException:
            # README is optional; skip it when the API cannot provide one.
            pass
        else:
            lines.extend(["README:", readme_text, ""])

        try:
            contents = repo.get_contents("")
        except GithubException:
            contents = None

        # Only a list result is rendered as a directory listing.
        if isinstance(contents, list):
            lines.append("Repository structure:")
            lines.extend(
                f"- {entry.path} ({entry.type})" for entry in contents[:max_entries]
            )
            lines.append("")

        return lines

    def _pull_request_lines(self, repo: Any, *, limit: int = 5) -> list:
        """Return a section listing up to ``limit`` open pull requests.

        Best-effort like the other optional sections: returns an empty list
        when there are no open pull requests or the API call fails.
        """
        try:
            # The paginated result is fetched lazily, so materialise it inside
            # the try block where API errors can be handled.
            pr_list = list(repo.get_pulls(state="open")[:limit])
        except GithubException:
            return []

        if not pr_list:
            return []

        lines = ["Recent Pull Requests:"]
        for pr in pr_list:
            lines.append(f"- PR #{pr.number}: {pr.title}")
            if pr.body:
                lines.append(f" {self._preview(pr.body)}")
        lines.append("")
        return lines

    def _issue_lines(self, repo: Any, *, scan: int = 10, limit: int = 5) -> list:
        """Return a section listing up to ``limit`` open issues.

        The first ``scan`` open items are inspected and entries that have a
        ``pull_request`` attribute set are dropped before applying ``limit``.
        Returns an empty list when nothing remains or the API call fails
        (for example when issues are disabled on the repository).
        """
        try:
            candidates = list(repo.get_issues(state="open")[:scan])
        except GithubException:
            return []

        issue_list = [item for item in candidates if not item.pull_request][:limit]
        if not issue_list:
            return []

        lines = ["Recent Issues:"]
        for issue in issue_list:
            lines.append(f"- Issue #{issue.number}: {issue.title}")
            if issue.body:
                lines.append(f" {self._preview(issue.body)}")
        lines.append("")
        return lines