mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-01-10 16:48:30 +00:00
73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
import csv
|
|
from io import StringIO
|
|
|
|
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
|
|
from crewai_tools.rag.source_content import SourceContent
|
|
|
|
|
|
class CSVLoader(BaseLoader):
    """Loader that turns CSV content into a plain-text, one-line-per-row document.

    The CSV may be given as a URL, as a path to a local file, or as the raw
    CSV text itself; see :meth:`load` for how the source is resolved.
    """

    def load(self, source_content: SourceContent, **kwargs) -> LoaderResult:
        """Resolve *source_content* to CSV text and parse it.

        Args:
            source_content: Wrapper around the source. If ``is_url()`` is true
                the source is fetched over HTTP; otherwise, if
                ``path_exists()`` is true it is read from disk; otherwise
                ``source`` itself is treated as the CSV text.
            **kwargs: Optional settings. Only ``headers`` (HTTP request
                headers used for URL sources) is consulted.

        Returns:
            The ``LoaderResult`` produced by :meth:`_parse_csv`.

        Raises:
            ValueError: If a URL source cannot be fetched.
            OSError: If a file source cannot be opened or read.
            UnicodeDecodeError: If a file source is not valid UTF-8.
        """
        source_ref = source_content.source_ref
        content_str = source_content.source

        if source_content.is_url():
            content_str = self._load_from_url(content_str, kwargs)
        elif source_content.path_exists():
            content_str = self._load_from_file(content_str)

        return self._parse_csv(content_str, source_ref)

    def _load_from_url(self, url: str, kwargs: dict) -> str:
        """Download CSV text from *url*.

        Args:
            url: Address to fetch.
            kwargs: Caller options; ``kwargs["headers"]``, when present,
                replaces the default request headers entirely.

        Returns:
            The response body decoded as text.

        Raises:
            ValueError: If the request fails or returns an HTTP error status.
                The underlying exception is attached as ``__cause__``.
        """
        # Imported here rather than at module level, so ``requests`` is only
        # needed when a URL is actually loaded.
        import requests

        headers = kwargs.get(
            "headers",
            {
                "Accept": "text/csv, application/csv, text/plain",
                "User-Agent": "Mozilla/5.0 (compatible; crewai-tools CSVLoader)",
            },
        )

        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except Exception as e:
            # Callers only need to handle ValueError; chaining keeps the
            # original traceback available for debugging.
            raise ValueError(f"Error fetching CSV from URL {url}: {e}") from e

    def _load_from_file(self, path: str) -> str:
        """Read the whole file at *path* as UTF-8 text.

        ``utf-8-sig`` decodes plain UTF-8 as well, but additionally drops a
        leading byte-order mark so it does not end up inside the first
        column name.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(path, encoding="utf-8-sig") as file:
            return file.read()

    def _parse_csv(self, content: str, source_ref: str) -> LoaderResult:
        """Render CSV text as a readable document and wrap it in a result.

        The first CSV record is used as the header row. The rendered text is
        a ``Headers: a | b | c`` line, a 50-dash separator, and then one
        ``Row N: a: 1 | b: 2`` line per record, with empty cells omitted.

        Parsing is best effort: if anything goes wrong, the raw *content* is
        returned unchanged and the error message is recorded under
        ``metadata["parse_error"]``.

        Args:
            content: Raw CSV text.
            source_ref: Reference stored as the result's ``source`` and passed
                to ``generate_doc_id``.

        Returns:
            A ``LoaderResult`` whose metadata holds ``format``, ``columns``
            and ``rows`` on success, or ``format`` and ``parse_error`` on
            failure.
        """
        try:
            # Text fetched from a URL or passed inline may still start with a
            # BOM; strip it so the first header is clean.
            csv_text = content[1:] if content.startswith("\ufeff") else content

            csv_reader = csv.DictReader(StringIO(csv_text))
            headers = csv_reader.fieldnames

            text_parts = []
            if headers:
                text_parts.append("Headers: " + " | ".join(headers))
                text_parts.append("-" * 50)

            # Count records directly instead of deriving the number from
            # len(text_parts), which depends on whether header lines exist.
            row_count = 0
            for row_count, row in enumerate(csv_reader, 1):
                row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
                text_parts.append(f"Row {row_count}: {row_text}")

            text = "\n".join(text_parts)
            metadata = {
                "format": "csv",
                "columns": headers,
                "rows": row_count,
            }
        except Exception as e:
            # Deliberate best-effort fallback: hand back the unparsed input
            # rather than failing the whole load.
            text = content
            metadata = {"format": "csv", "parse_error": str(e)}

        # generate_doc_id is not defined in this class; presumably it is
        # provided by BaseLoader.
        return LoaderResult(
            content=text,
            source=source_ref,
            metadata=metadata,
            doc_id=self.generate_doc_id(source_ref=source_ref, content=text),
        )
|