adding RAG-specific README and implementing specific helpers

2026-01-10 00:28:31 +00:00 · 2024-02-15 18:01:53 -03:00
parent aa7e336989
commit a20481d023
2 changed files with 148 additions and 8 deletions
--- a/src/crewai_tools/tools/rag/README.md
+++ b/src/crewai_tools/tools/rag/README.md
@@ -0,0 +1,64 @@
+# RagTool: A Dynamic Knowledge Base Tool
+
+RagTool is designed to answer questions by leveraging the power of retrieval-augmented generation (RAG) through EmbedChain. It integrates seamlessly with the CrewAI ecosystem, offering a versatile and powerful solution for information retrieval.
+
+## **Overview**
+
+RagTool enables users to dynamically query a knowledge base, making it an ideal tool for applications requiring access to a vast array of information. Its flexible design allows for integration with various data sources, including files, directories, web pages, YouTube videos, and custom configurations.
+
+## **Usage**
+
+RagTool can be instantiated with data from different sources, including:
+
+- 📰 PDF file
+- 📊 CSV file
+- 📃 JSON file
+- 📝 Text
+- 📁 Directory/ Folder
+- 🌐 HTML Web page
+- 📽️ Youtube Channel
+- 📺 Youtube Video
+- 📚 Docs website
+- 📝 MDX file
+- 📄 DOCX file
+- 🧾 XML file
+- 📬 Gmail
+- 📝 Github
+- 🐘 Postgres
+- 🐬 MySQL
+- 🤖 Slack
+- 💬 Discord
+- 🗨️ Discourse
+- 📝 Substack
+- 🐝 Beehiiv
+- 💾 Dropbox
+- 🖼️ Image
+- ⚙️ Custom
+
+#### **Creating an Instance**
+
+```python
+from crewai_tools.tools.rag.rag_tool import RagTool
+
+# Example: Loading from a file
+rag_tool = RagTool().from_file('path/to/your/file.txt')
+
+# Example: Loading from a directory
+rag_tool = RagTool().from_directory('path/to/your/directory')
+
+# Example: Loading from a web page
+rag_tool = RagTool().from_web_page('https://example.com')
+
+# Example: Loading from an Embedchain configuration
+rag_tool = RagTool().from_embedchain('path/to/your/config.json')
+```
+
+## **Contribution**
+
+Contributions to RagTool and the broader CrewAI tools ecosystem are welcome. To contribute, please follow the standard GitHub workflow for forking the repository, making changes, and submitting a pull request.
+
+## **License**
+
+RagTool is open-source and available under the MIT license.
+
+Thank you for considering RagTool for your knowledge base needs. Your contributions and feedback are invaluable to making RagTool even better.
--- a/src/crewai_tools/tools/rag/rag_tool.py
+++ b/src/crewai_tools/tools/rag/rag_tool.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, List, Optional

 from pydantic import BaseModel, ConfigDict

@@ -13,7 +13,6 @@ class Adapter(BaseModel, ABC):
    def query(self, question: str) -> str:
        """Query the knowledge base with a question and return the answer."""

-
 class RagTool(BaseTool):
    name: str = "Knowledge base"
    description: str = "A knowledge base that can be used to answer questions."
@@ -52,23 +51,100 @@ class RagTool(BaseTool):
        adapter = EmbedchainAdapter(embedchain_app=app)
        return RagTool(adapter=adapter)

-    def from_web_page(self, url: str):
+    def from_pg_db(self, db_uri: str, table_name: str):
+        """Build a RagTool over all rows of a Postgres table.
+
+        Args:
+            db_uri: Connection URL handed to embedchain's PostgresLoader.
+            table_name: Table whose rows are selected and added to the app.
+
+        Returns:
+            A new RagTool backed by an EmbedchainAdapter.
+        """
        from embedchain import App
-        from embedchain.models.data_type import DataType
-
+        from embedchain.loaders.postgres import PostgresLoader
        from crewai_tools.adapters.embedchain_adapter import EmbedchainAdapter

+        config = { "url":  db_uri }
+        postgres_loader = PostgresLoader(config=config)
        app = App()
-        app.add(url, data_type=DataType.WEB_PAGE)
-
+        # NOTE(review): table_name is interpolated into the SQL text unescaped;
+        # only pass trusted table names.
+        app.add(
+            f"SELECT * FROM {table_name};",
+            data_type='postgres',
+            loader=postgres_loader
+        )
        adapter = EmbedchainAdapter(embedchain_app=app)
        return RagTool(adapter=adapter)

+
+    def from_github_repo(self, gh_token: str, gh_repo: str, type: Optional[List[str]] = None):
+        """Build a RagTool from the contents of a GitHub repository.
+
+        Args:
+            gh_token: GitHub access token passed to embedchain's GithubLoader.
+            gh_repo: Repository identifier placed after ``repo:`` in the query
+                (presumably ``owner/name`` -- confirm against embedchain docs).
+            type: Content types to load, joined with commas after ``type:``.
+                A single string is accepted as well. Defaults to ``["repo"]``.
+
+        Returns:
+            A new RagTool backed by an EmbedchainAdapter.
+        """
+        from embedchain import App
+        from embedchain.loaders.github import GithubLoader
+        from crewai_tools.adapters.embedchain_adapter import EmbedchainAdapter
+
+        # None stands in for the default so no list object is shared across calls.
+        if type is None:
+            type = ["repo"]
+        elif isinstance(type, str):
+            # Joining a bare string would split it into single characters.
+            type = [type]
+        # Built outside the f-string: reusing the same quote character inside
+        # an f-string is a syntax error before Python 3.12.
+        content_types = ",".join(type)
+
+        loader = GithubLoader(config={"token": gh_token})
+        app = App()
+        app.add(
+            f"repo:{gh_repo} type:{content_types}",
+            data_type="github",
+            loader=loader,
+        )
+        adapter = EmbedchainAdapter(embedchain_app=app)
+        return RagTool(adapter=adapter)
+
+    def from_xml_file(self, file_url: str):
+        """Return a RagTool built from ``file_url`` using ``DataType.XML``."""
+        from embedchain.models.data_type import DataType
+
+        xml_kind = DataType.XML
+        return self._from_generic(file_url, xml_kind)
+
+    def from_docx_file(self, file_url: str):
+        """Return a RagTool built from ``file_url`` using ``DataType.DOCX``.
+
+        Args:
+            file_url: Source passed unchanged to ``_from_generic``.
+
+        Returns:
+            Whatever ``_from_generic`` returns for this source.
+        """
+        from embedchain.models.data_type import DataType
+        return self._from_generic(file_url, DataType.DOCX)
+
+    def from_mdx_file(self, file_url: str):
+        """Return a RagTool built from ``file_url`` using ``DataType.MDX``."""
+        from embedchain.models.data_type import DataType
+
+        mdx_kind = DataType.MDX
+        return self._from_generic(file_url, mdx_kind)
+
+    def from_code_docs(self, docs_url: str):
+        """Return a RagTool built from ``docs_url`` using ``DataType.DOCS_SITE``."""
+        from embedchain.models.data_type import DataType
+
+        docs_kind = DataType.DOCS_SITE
+        return self._from_generic(docs_url, docs_kind)
+
+    def from_youtube_channel(self, channel_handle: str):
+        """Return a RagTool built from a YouTube channel handle.
+
+        A leading ``@`` is prepended to ``channel_handle`` when it is missing;
+        the result is loaded with ``DataType.YOUTUBE_CHANNEL``.
+        """
+        from embedchain.models.data_type import DataType
+
+        has_at = channel_handle.startswith("@")
+        handle = channel_handle if has_at else f"@{channel_handle}"
+        return self._from_generic(handle, DataType.YOUTUBE_CHANNEL)
+
+    def from_website(self, url: str):
+        """Return a RagTool built from ``url`` using ``DataType.WEB_PAGE``."""
+        from embedchain.models.data_type import DataType
+
+        page_kind = DataType.WEB_PAGE
+        return self._from_generic(url, page_kind)
+
+    def from_text(self, text: str):
+        """Return a RagTool built from ``text`` using ``DataType.TEXT``."""
+        from embedchain.models.data_type import DataType
+
+        text_kind = DataType.TEXT
+        return self._from_generic(text, text_kind)
+
+    def from_json(self, file_path: str):
+        """Return a RagTool built from ``file_path`` using ``DataType.JSON``."""
+        from embedchain.models.data_type import DataType
+
+        json_kind = DataType.JSON
+        return self._from_generic(file_path, json_kind)
+
+    def from_csv(self, file_path: str):
+        """Return a RagTool built from ``file_path`` using ``DataType.CSV``."""
+        from embedchain.models.data_type import DataType
+
+        csv_kind = DataType.CSV
+        return self._from_generic(file_path, csv_kind)
+
+    def from_pdf(self, file_path: str):
+        """Return a RagTool built from ``file_path`` using ``DataType.PDF_FILE``."""
+        from embedchain.models.data_type import DataType
+
+        pdf_kind = DataType.PDF_FILE
+        return self._from_generic(file_path, pdf_kind)
+
+    def from_web_page(self, url: str):
+        """Return a RagTool built from ``url`` using ``DataType.WEB_PAGE``."""
+        from embedchain.models.data_type import DataType
+
+        page_kind = DataType.WEB_PAGE
+        return self._from_generic(url, page_kind)
+
    def from_embedchain(self, config_path: str):
+        """Build a RagTool from an embedchain configuration file.
+
+        Args:
+            config_path: Path forwarded to ``App.from_config``.
+
+        Returns:
+            A new RagTool backed by an EmbedchainAdapter.
+        """
        from embedchain import App
-
        from crewai_tools.adapters.embedchain_adapter import EmbedchainAdapter

        app = App.from_config(config_path=config_path)
        adapter = EmbedchainAdapter(embedchain_app=app)
        return RagTool(adapter=adapter)
+
+    def _from_generic(self, source: str, type: str):
+        """Create a RagTool from a single source of the given data type.
+
+        Args:
+            source: Value passed as the first argument to ``App.add``.
+            type: Forwarded to ``App.add`` as ``data_type``.
+
+        Returns:
+            A new RagTool backed by an EmbedchainAdapter.
+        """
+        from embedchain import App
+        from crewai_tools.adapters.embedchain_adapter import EmbedchainAdapter
+
+        embedchain_app = App()
+        embedchain_app.add(source, data_type=type)
+        return RagTool(adapter=EmbedchainAdapter(embedchain_app=embedchain_app))