Squashed 'packages/tools/' content from commit 78317b9c

git-subtree-dir: packages/tools
git-subtree-split: 78317b9c127f18bd040c1d77e3c0840cdc9a5b38
This commit is contained in:
Greyson Lalonde
2025-09-12 21:58:02 -04:00
commit e16606672a
303 changed files with 49010 additions and 0 deletions

View File

@@ -0,0 +1,67 @@
# GithubSearchTool
## Description
The GithubSearchTool is a Retrieval Augmented Generation (RAG) tool specifically designed for conducting semantic searches within GitHub repositories. Utilizing advanced semantic search capabilities, it sifts through code, pull requests, issues, and repositories, making it an essential tool for developers, researchers, or anyone in need of precise information from GitHub.
## Installation
To use the GithubSearchTool, first ensure the crewai_tools package is installed in your Python environment:
```shell
pip install 'crewai[tools]'
```
This command installs the necessary package to run the GithubSearchTool along with any other tools included in the crewai_tools package.
## Example
Here's how you can use the GithubSearchTool to perform semantic searches within a GitHub repository:
```python
from crewai_tools import GithubSearchTool
# Initialize the tool for semantic searches within a specific GitHub repository
tool = GithubSearchTool(
gh_token='...',
github_repo='https://github.com/example/repo',
content_types=['code', 'issue'] # Options: code, repo, pr, issue
)
# OR
# Initialize the tool without binding it to a specific GitHub repository, so the agent can search any repository it learns about during its execution
tool = GithubSearchTool(
gh_token='...',
content_types=['code', 'issue'] # Options: code, repo, pr, issue
)
```
## Arguments
- `gh_token` : The GitHub token used to authenticate the search. This is a mandatory field and allows the tool to access the GitHub API for conducting searches.
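- `github_repo` : The URL of the GitHub repository where the search will be conducted. This field is optional at initialization. When it is provided together with `content_types`, the tool indexes that repository up front and the agent only has to supply a search query; when it is omitted, the agent must supply the repository for each search.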
- `content_types` : Specifies the types of content to include in your search. You must provide a list of content types from the following options: `code` for searching within the code, `repo` for searching within the repository's general information, `pr` for searching within pull requests, and `issue` for searching within issues. This field is mandatory and allows tailoring the search to specific content types within the GitHub repository.
## Custom model and embeddings
By default, the tool uses OpenAI for both embeddings and summarization. To customize the model, you can use a config dictionary as follows:
```python
tool = GithubSearchTool(
config=dict(
llm=dict(
provider="ollama", # or google, openai, anthropic, llama2, ...
config=dict(
model="llama2",
# temperature=0.5,
# top_p=1,
# stream=True,
),
),
embedder=dict(
provider="google",
config=dict(
model="models/embedding-001",
task_type="retrieval_document",
# title="Embeddings",
),
),
)
)
```

View File

@@ -0,0 +1,88 @@
from typing import List, Optional, Type, Any
# embedchain is an optional dependency. Record whether the import succeeded
# so that GithubSearchTool.__init__ can raise an ImportError with install
# instructions, rather than this module failing at import time.
try:
    from embedchain.loaders.github import GithubLoader

    EMBEDCHAIN_AVAILABLE = True
except ImportError:
    EMBEDCHAIN_AVAILABLE = False
from pydantic import BaseModel, Field, PrivateAttr
from ..rag.rag_tool import RagTool
# Argument schema that GithubSearchTool switches to once a repository has
# been indexed in its constructor: only the search query is required.
class FixedGithubSearchToolSchema(BaseModel):
    """Input for GithubSearchTool."""

    # Free-text query; `...` marks the field as required.
    search_query: str = Field(
        ...,
        description="Mandatory search query you want to use to search the github repo's content",
    )
# Default argument schema for GithubSearchTool: in addition to the inherited
# search query, the caller must name the repository and the content types.
class GithubSearchToolSchema(FixedGithubSearchToolSchema):
    """Input for GithubSearchTool."""

    # Repository to search; required.
    github_repo: str = Field(..., description="Mandatory github you want to search")
    # Kinds of repository content to include; required.
    content_types: List[str] = Field(
        ...,
        description="Mandatory content types you want to be included search, options: [code, repo, pr, issue]",
    )
# RAG tool that runs semantic searches over the content of GitHub
# repositories. A repository can be indexed once in the constructor, in
# which case callers only supply a search query, or supplied per call
# through `_run`.
class GithubSearchTool(RagTool):
    name: str = "Search a github repo's content"
    description: str = (
        "A tool that can be used to semantic search a query from a github repo's content. This is not the GitHub API, but instead a tool that can provide semantic search capabilities."
    )
    summarize: bool = False
    # GitHub token handed to the embedchain GithubLoader; required.
    gh_token: str
    args_schema: Type[BaseModel] = GithubSearchToolSchema
    # Content types used by `add` whenever a call does not specify its own.
    content_types: List[str] = Field(
        default_factory=lambda: ["code", "repo", "pr", "issue"],
        description="Content types you want to be included search, options: [code, repo, pr, issue]",
    )
    _loader: Any | None = PrivateAttr(default=None)

    def __init__(
        self,
        github_repo: Optional[str] = None,
        content_types: Optional[List[str]] = None,
        **kwargs,
    ):
        """Create the tool and optionally index a repository up front.

        Args:
            github_repo: Repository to index immediately. When given, the
                tool's argument schema is narrowed to the search query only.
            content_types: Content types to index. When given, they also
                become the instance's default ``content_types``; when
                omitted, the field's default is used.
            **kwargs: Forwarded to ``RagTool.__init__`` (must include
                ``gh_token``).

        Raises:
            ImportError: If embedchain is not installed.
        """
        if not EMBEDCHAIN_AVAILABLE:
            raise ImportError("embedchain is not installed. Please install it with `pip install crewai-tools[embedchain]`")

        # The named parameter would otherwise swallow the caller's choice, so
        # the `content_types` field would always keep its default. Forward it
        # so the field reflects what the caller asked for.
        if content_types:
            kwargs["content_types"] = content_types
        super().__init__(**kwargs)

        self._loader = GithubLoader(config={"token": self.gh_token})

        # A repository alone is enough to bind the tool: `add` falls back to
        # `self.content_types` when no explicit content types are given.
        if github_repo:
            self.add(repo=github_repo, content_types=content_types)
            self.description = f"A tool that can be used to semantic search a query the {github_repo} github repo's content. This is not the GitHub API, but instead a tool that can provide semantic search capabilities."
            self.args_schema = FixedGithubSearchToolSchema
            # Presumably refreshes the generated tool description to match
            # the new schema and text; defined on a base class not shown here.
            self._generate_description()

    def add(
        self,
        repo: str,
        content_types: Optional[List[str]] = None,
    ) -> None:
        """Register a repository with the underlying RAG tool.

        Args:
            repo: Repository identifier placed after ``repo:`` in the query.
            content_types: Content types to include; falls back to
                ``self.content_types`` when omitted or empty.
        """
        content_types = content_types or self.content_types
        super().add(
            f"repo:{repo} type:{','.join(content_types)}",
            data_type="github",
            loader=self._loader,
        )

    def _run(
        self,
        search_query: str,
        github_repo: Optional[str] = None,
        content_types: Optional[List[str]] = None,
    ) -> str:
        """Run a semantic search, adding ``github_repo`` first when given.

        Args:
            search_query: Query passed to the parent tool's ``_run``.
            github_repo: Optional repository to add before searching.
            content_types: Content types used when adding ``github_repo``.

        Returns:
            Whatever ``RagTool._run`` returns for the query.
        """
        if github_repo:
            self.add(
                repo=github_repo,
                content_types=content_types,
            )
        return super()._run(query=search_query)