from __future__ import annotations from datetime import ( date, datetime, time, timedelta, ) from typing import ( TYPE_CHECKING, Any, TypeAlias, ) from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import BaseExcelReader if TYPE_CHECKING: from python_calamine import ( CalamineSheet, CalamineWorkbook, ) from pandas._typing import ( FilePath, NaTType, ReadBuffer, Scalar, StorageOptions, ) _CellValue: TypeAlias = int | float | str | bool | time | date | datetime | timedelta class CalamineReader(BaseExcelReader["CalamineWorkbook"]): def __init__( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], storage_options: StorageOptions | None = None, engine_kwargs: dict | None = None, ) -> None: """ Reader using calamine engine (xlsx/xls/xlsb/ods). Parameters ---------- filepath_or_buffer : str, path to be parsed or an open readable stream. storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value pairs are forwarded to ``urllib.request.Request`` as header options. For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details, and for more examples on storage options refer `here `_. engine_kwargs : dict, optional Arbitrary keyword arguments passed to excel engine. """ import_optional_dependency("python_calamine") super().__init__( filepath_or_buffer, storage_options=storage_options, engine_kwargs=engine_kwargs, ) @property def _workbook_class(self) -> type[CalamineWorkbook]: from python_calamine import CalamineWorkbook return CalamineWorkbook def load_workbook( self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any ) -> CalamineWorkbook: from python_calamine import load_workbook return load_workbook( filepath_or_buffer, **engine_kwargs, ) @property def sheet_names(self) -> list[str]: from python_calamine import SheetTypeEnum return [ sheet.name for sheet in self.book.sheets_metadata if sheet.typ == SheetTypeEnum.WorkSheet ] def get_sheet_by_name(self, name: str) -> CalamineSheet: self.raise_if_bad_sheet_by_name(name) return self.book.get_sheet_by_name(name) def get_sheet_by_index(self, index: int) -> CalamineSheet: self.raise_if_bad_sheet_by_index(index) return self.book.get_sheet_by_index(index) def get_sheet_data( self, sheet: CalamineSheet, file_rows_needed: int | None = None ) -> list[list[Scalar | NaTType | time]]: def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: if isinstance(value, float): val = int(value) if val == value: return val else: return value elif isinstance(value, (datetime, timedelta)): # Return as-is to match openpyxl behavior (GH#59186) return value elif isinstance(value, date): # Convert date to datetime to match openpyxl behavior (GH#59186) return datetime(value.year, value.month, value.day) elif isinstance(value, time): return value return value rows: list[list[_CellValue]] = sheet.to_python( skip_empty_area=False, nrows=file_rows_needed ) data = [[_convert_cell(cell) for cell in row] for row in rows] return data