Update load_content() in excel_knowledge_source.py to reduce memory usage and improve its documentation.

This commit is contained in:
Dergky
2025-01-18 18:18:00 -05:00
parent bda9e34c57
commit 1002af8a31

View File

@@ -7,6 +7,29 @@ from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledge
class ExcelKnowledgeSource(BaseFileKnowledgeSource): class ExcelKnowledgeSource(BaseFileKnowledgeSource):
"""A knowledge source that stores and queries Excel file content using embeddings.""" """A knowledge source that stores and queries Excel file content using embeddings."""
def load_content(self) -> Dict[Path, Dict[str, str]]:
    """Read every sheet of each Excel workbook and render it as CSV text.

    Each path in ``self.safe_file_paths`` is opened as a workbook, and each
    of its sheets is parsed and serialized to CSV without the index column.

    Returns:
        Dict[Path, Dict[str, str]]: For each file path, a mapping of sheet
            name to that sheet's CSV text.

    Raises:
        ImportError: If required dependencies are missing.
        FileNotFoundError: If the specified Excel file cannot be opened.
    """
    pd = self._import_dependencies()
    workbooks = {}
    for workbook_path in self.safe_file_paths:
        csv_by_sheet = {}
        # Scope the workbook handle to a ``with`` block while sheets are read.
        with pd.ExcelFile(workbook_path) as workbook:
            for name in workbook.sheet_names:
                frame = pd.read_excel(workbook, name)
                csv_by_sheet[name] = frame.to_csv(index=False)
        workbooks[workbook_path] = csv_by_sheet
    return workbooks
def load_content(self) -> Dict[Path, str]: def load_content(self) -> Dict[Path, str]:
"""Load and preprocess Excel file content. Updated to account for .xlsx workbooks with multiple tabs/sheets""" """Load and preprocess Excel file content. Updated to account for .xlsx workbooks with multiple tabs/sheets"""
pd, openpyxl, load_workbook = self._import_dependencies() pd, openpyxl, load_workbook = self._import_dependencies()
@@ -33,7 +56,6 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
sheet_str += str(cell) + "," sheet_str += str(cell) + ","
sheet_str += "\n" sheet_str += "\n"
print(sheet_str)
# Add the sheet content to the file sheet dictionary # Add the sheet content to the file sheet dictionary
sheet_dict[sheet_name] = sheet_str sheet_dict[sheet_name] = sheet_str
# Add the file sheet dictionary to the content dictionary # Add the file sheet dictionary to the content dictionary
@@ -44,11 +66,11 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
def _import_dependencies(self): def _import_dependencies(self):
"""Dynamically import dependencies.""" """Dynamically import dependencies."""
try: try:
import openpyxl # noqa # import openpyxl # noqa
from openpyxl import load_workbook # from openpyxl import load_workbook
import pandas as pd import pandas as pd
return pd, openpyxl, load_workbook return pd
except ImportError as e: except ImportError as e:
missing_package = str(e).split()[-1] missing_package = str(e).split()[-1]
raise ImportError( raise ImportError(