Spaces:
Sleeping
Sleeping
| from typing import Iterator | |
| from langchain_core.documents import Document | |
| from langchain_community.document_loaders.base import BaseBlobParser | |
| from langchain_community.document_loaders.blob_loaders import Blob | |
class XlsxParser(BaseBlobParser):
    """Parse Microsoft Excel (.xlsx) spreadsheets from a blob.

    Every worksheet in the workbook is emitted as its own ``Document`` whose
    text is the sheet's rows, one row per line, with cells separated by tabs.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse a Microsoft Excel document into the Document iterator.

        Args:
            blob: The blob to parse. Its ``mimetype`` must be the .xlsx
                (OpenXML spreadsheet) MIME type.

        Returns:
            An iterator of Documents, one per worksheet, in workbook order.
            Each Document's metadata holds ``source`` (the blob's source) and
            ``sheet`` (the worksheet name).

        Raises:
            ImportError: If ``openpyxl`` is not installed.
            ValueError: If the blob's MIME type is not supported.
        """
        # openpyxl is an optional dependency, so import it lazily and turn a
        # missing install into an actionable message.
        try:
            from openpyxl import load_workbook
        except ImportError as e:
            raise ImportError(
                "Could not import openpyxl, please install with `pip install openpyxl`."
            ) from e

        # Function-scope import keeps this block self-contained.
        import logging

        supported_mime_types = [
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"  # .xlsx
        ]

        # Diagnostic only: goes through logging at debug level instead of
        # being printed to stdout on every parse.
        logging.getLogger(__name__).debug("Blob MIME type: %s", blob.mimetype)

        if blob.mimetype not in supported_mime_types:
            raise ValueError(
                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
            )

        with blob.as_bytes_io() as xlsx_file:
            # NOTE(review): per the openpyxl docs, data_only=True returns the
            # values stored for formula cells rather than the formula text.
            workbook = load_workbook(xlsx_file, data_only=True)
            for sheet in workbook.sheetnames:
                worksheet = workbook[sheet]
                # Empty cells (None) become empty strings so that column
                # positions stay aligned within the tab-separated row.
                rows = (
                    "\t".join("" if cell is None else str(cell) for cell in row)
                    for row in worksheet.iter_rows(values_only=True)
                )
                # Every row, including the last, is newline-terminated; join
                # once instead of growing the string with += per row.
                text = "".join(f"{row_text}\n" for row_text in rows)
                metadata = {"source": blob.source, "sheet": sheet}
                yield Document(page_content=text, metadata=metadata)