mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-14 18:48:29 +00:00
Squashed 'packages/tools/' changes from 78317b9c..0b3f00e6
0b3f00e6 chore: update project version to 0.73.0 and revise uv.lock dependencies (#455) ad19b074 feat: replace embedchain with native crewai adapter (#451) git-subtree-dir: packages/tools git-subtree-split: 0b3f00e67c0dae24d188c292dc99759fd1c841f7
This commit is contained in:
110
crewai_tools/rag/loaders/github_loader.py
Normal file
110
crewai_tools/rag/loaders/github_loader.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""GitHub repository content loader."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from github import Github, GithubException
|
||||
|
||||
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
||||
from crewai_tools.rag.source_content import SourceContent
|
||||
|
||||
|
||||
class GithubLoader(BaseLoader):
    """Loader that summarises a GitHub repository as plain text.

    Depending on the requested content types, the summary can contain
    repository facts, the README, the top-level file listing, open pull
    requests and open issues.
    """

    @staticmethod
    def _parse_repo_name(repo_url: str) -> str:
        """Extract the ``owner/name`` slug from a ``https://github.com/`` URL.

        Anything after the first two path segments (``/tree/main``, ...), a
        query string, a fragment and a trailing ``.git`` are ignored.

        Raises:
            ValueError: If the URL does not start with ``https://github.com/``
                or does not contain both an owner and a repository name.
        """
        prefix = "https://github.com/"
        if not repo_url.startswith(prefix):
            raise ValueError(f"Invalid GitHub URL: {repo_url}")

        # Slice off only the leading prefix (str.replace would also remove
        # later occurrences), then drop any query string or fragment.
        path = repo_url[len(prefix):].split("?", 1)[0].split("#", 1)[0]
        parts = path.strip("/").split("/")
        if len(parts) < 2:
            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")

        owner, name = parts[0], parts[1]
        # Clone-style URLs end in ".git", which is not part of the repo name.
        if name.endswith(".git"):
            name = name[: -len(".git")]
        if not owner or not name:
            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")

        return f"{owner}/{name}"

    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:
        """Load content from a GitHub repository.

        Args:
            source: Source whose ``source`` attribute is the repository URL
                (must start with ``https://github.com/``).
            **kwargs: May contain a ``metadata`` mapping with:
                ``gh_token`` - optional GitHub access token; when absent an
                unauthenticated client is used.
                ``content_types`` - collection drawn from ``"repo"``,
                ``"code"``, ``"pr"`` and ``"issue"``; defaults to
                ``["code", "repo"]``.

        Returns:
            LoaderResult whose content is the newline-joined summary and
            whose metadata records the source URL, the ``owner/name`` slug
            and the requested content types.

        Raises:
            ValueError: If the URL is not a valid GitHub repository URL, the
                repository cannot be accessed, or no content was collected.
        """
        # `metadata` may be passed explicitly as None; treat that as empty.
        metadata = kwargs.get("metadata") or {}
        gh_token = metadata.get("gh_token")
        content_types = metadata.get("content_types", ["code", "repo"])

        repo_url = source.source
        repo_name = self._parse_repo_name(repo_url)

        g = Github(gh_token) if gh_token else Github()

        try:
            repo = g.get_repo(repo_name)
        except GithubException as e:
            # Chain the original error so the HTTP status stays visible.
            raise ValueError(
                f"Unable to access repository {repo_name}: {e}"
            ) from e

        all_content = []

        if "repo" in content_types:
            all_content.extend(
                [
                    f"Repository: {repo.full_name}",
                    f"Description: {repo.description or 'No description'}",
                    f"Language: {repo.language or 'Not specified'}",
                    f"Stars: {repo.stargazers_count}",
                    f"Forks: {repo.forks_count}",
                    "",
                ]
            )

        if "code" in content_types:
            # Each sub-section is best-effort: a repository without a README
            # (or with an unreadable root) still yields the other sections.
            try:
                readme = repo.get_readme()
                readme_text = readme.decoded_content.decode(
                    "utf-8", errors="ignore"
                )
            except GithubException:
                pass
            else:
                all_content.extend(["README:", readme_text, ""])

            try:
                contents = repo.get_contents("")
            except GithubException:
                pass
            else:
                # A list is returned for a directory; only then is there a
                # structure to print. Cap the listing at 20 entries.
                if isinstance(contents, list):
                    all_content.append("Repository structure:")
                    all_content.extend(
                        f"- {content_file.path} ({content_file.type})"
                        for content_file in contents[:20]
                    )
                    all_content.append("")

        if "pr" in content_types:
            # Best-effort, consistent with the "code" section above.
            try:
                pr_list = list(repo.get_pulls(state="open")[:5])
            except GithubException:
                pr_list = []
            if pr_list:
                all_content.append("Recent Pull Requests:")
                for pr in pr_list:
                    all_content.append(f"- PR #{pr.number}: {pr.title}")
                    if pr.body:
                        body_preview = pr.body[:200].replace("\n", " ")
                        all_content.append(f"  {body_preview}")
                all_content.append("")

        if "issue" in content_types:
            # Best-effort: e.g. repositories with issues disabled should not
            # abort the whole load.
            try:
                candidates = list(repo.get_issues(state="open")[:10])
            except GithubException:
                candidates = []
            # Entries that carry a `pull_request` are PRs, not issues; skip
            # them and keep at most five real issues.
            issue_list = [i for i in candidates if not i.pull_request][:5]
            if issue_list:
                all_content.append("Recent Issues:")
                for issue in issue_list:
                    all_content.append(
                        f"- Issue #{issue.number}: {issue.title}"
                    )
                    if issue.body:
                        body_preview = issue.body[:200].replace("\n", " ")
                        all_content.append(f"  {body_preview}")
                all_content.append("")

        if not all_content:
            raise ValueError(
                f"No content could be loaded from repository: {repo_url}"
            )

        content = "\n".join(all_content)
        return LoaderResult(
            content=content,
            metadata={
                "source": repo_url,
                "repo": repo_name,
                "content_types": content_types,
            },
            doc_id=self.generate_doc_id(source_ref=repo_url, content=content),
        )
|
||||
Reference in New Issue
Block a user