1""" 2 pyexcel_xlsr 3 ~~~~~~~~~~~~~~~~~~~ 4 5 The lower level xls/xlsm file format handler using xlrd 6 7 :copyright: (c) 2016-2021 by Onni Software Ltd 8 :license: New BSD License 9""" 10import datetime 11 12import xlrd 13from pyexcel_io.service import has_no_digits_in_float 14from pyexcel_io.plugin_api import ISheet, IReader 15 16XLS_KEYWORDS = [ 17 "filename", 18 "logfile", 19 "verbosity", 20 "use_mmap", 21 "file_contents", 22 "encoding_override", 23 "formatting_info", 24 "on_demand", 25 "ragged_rows", 26] 27DEFAULT_ERROR_VALUE = "#N/A" 28 29 30class MergedCell(object): 31 def __init__(self, row_low, row_high, column_low, column_high): 32 self.__rl = row_low 33 self.__rh = row_high 34 self.__cl = column_low 35 self.__ch = column_high 36 self.value = None 37 38 def register_cells(self, registry): 39 for rowx in range(self.__rl, self.__rh): 40 for colx in range(self.__cl, self.__ch): 41 key = "%s-%s" % (rowx, colx) 42 registry[key] = self 43 44 45class XLSheet(ISheet): 46 """ 47 xls, xlsx, xlsm sheet reader 48 49 Currently only support first sheet in the file 50 """ 51 52 def __init__(self, sheet, auto_detect_int=True, date_mode=0, **keywords): 53 self.__auto_detect_int = auto_detect_int 54 self.__hidden_cols = [] 55 self.__hidden_rows = [] 56 self.__merged_cells = {} 57 self._book_date_mode = date_mode 58 self.xls_sheet = sheet 59 self._keywords = keywords 60 if keywords.get("detect_merged_cells") is True: 61 for merged_cell_ranges in sheet.merged_cells: 62 merged_cells = MergedCell(*merged_cell_ranges) 63 merged_cells.register_cells(self.__merged_cells) 64 if keywords.get("skip_hidden_row_and_column") is True: 65 for col_index, info in self.xls_sheet.colinfo_map.items(): 66 if info.hidden == 1: 67 self.__hidden_cols.append(col_index) 68 for row_index, info in self.xls_sheet.rowinfo_map.items(): 69 if info.hidden == 1: 70 self.__hidden_rows.append(row_index) 71 72 @property 73 def name(self): 74 return self.xls_sheet.name 75 76 def row_iterator(self): 77 number_of_rows = self.xls_sheet.nrows - len(self.__hidden_rows) 78 return range(number_of_rows) 79 80 def column_iterator(self, row): 81 number_of_columns = self.xls_sheet.ncols - len(self.__hidden_cols) 82 for column in range(number_of_columns): 83 yield self.cell_value(row, column) 84 85 def cell_value(self, row, column): 86 """ 87 Random access to the xls cells 88 """ 89 if self._keywords.get("skip_hidden_row_and_column") is True: 90 row, column = self._offset_hidden_indices(row, column) 91 cell_type = self.xls_sheet.cell_type(row, column) 92 value = self.xls_sheet.cell_value(row, column) 93 94 if cell_type == xlrd.XL_CELL_DATE: 95 value = xldate_to_python_date(value, self._book_date_mode) 96 elif cell_type == xlrd.XL_CELL_NUMBER and self.__auto_detect_int: 97 if has_no_digits_in_float(value): 98 value = int(value) 99 elif cell_type == xlrd.XL_CELL_ERROR: 100 value = DEFAULT_ERROR_VALUE 101 102 if self.__merged_cells: 103 merged_cell = self.__merged_cells.get("%s-%s" % (row, column)) 104 if merged_cell: 105 if merged_cell.value: 106 value = merged_cell.value 107 else: 108 merged_cell.value = value 109 return value 110 111 def _offset_hidden_indices(self, row, column): 112 row = calculate_offsets(row, self.__hidden_rows) 113 column = calculate_offsets(column, self.__hidden_cols) 114 return row, column 115 116 117def calculate_offsets(incoming_index, hidden_indices): 118 offset = 0 119 for index in hidden_indices: 120 if index <= (incoming_index + offset): 121 offset += 1 122 return incoming_index + offset 123 124 125class XLSReader(IReader): 126 """ 127 XLSBook reader 128 129 It reads xls, xlsm, xlsx work book 130 """ 131 132 def __init__(self, file_type, **keywords): 133 self.__skip_hidden_sheets = keywords.get("skip_hidden_sheets", True) 134 self.__skip_hidden_row_column = keywords.get( 135 "skip_hidden_row_and_column", True 136 ) 137 self.__detect_merged_cells = keywords.get("detect_merged_cells", False) 138 self._keywords = keywords 139 xlrd_params = self._extract_xlrd_params() 140 if self.__skip_hidden_row_column and file_type == "xls": 141 xlrd_params["formatting_info"] = True 142 if self.__detect_merged_cells: 143 xlrd_params["formatting_info"] = True 144 145 self.content_array = [] 146 self.xls_book = self.get_xls_book(**xlrd_params) 147 for sheet in self.xls_book.sheets(): 148 if self.__skip_hidden_sheets and sheet.visibility != 0: 149 continue 150 self.content_array.append(sheet) 151 152 def read_sheet(self, index): 153 native_sheet = self.content_array[index] 154 sheet = XLSheet( 155 native_sheet, date_mode=self.xls_book.datemode, **self._keywords 156 ) 157 return sheet 158 159 def close(self): 160 if self.xls_book: 161 self.xls_book.release_resources() 162 self.xls_book = None 163 164 def get_xls_book(self, **xlrd_params): 165 xls_book = xlrd.open_workbook(**xlrd_params) 166 return xls_book 167 168 def _extract_xlrd_params(self): 169 params = {} 170 if self._keywords is not None: 171 for key in list(self._keywords.keys()): 172 if key in XLS_KEYWORDS: 173 params[key] = self._keywords.pop(key) 174 return params 175 176 177class XLSInFile(XLSReader): 178 def __init__(self, file_name, file_type, **keywords): 179 super().__init__(file_type, filename=file_name, **keywords) 180 181 182class XLSInContent(XLSReader): 183 def __init__(self, file_content, file_type, **keywords): 184 super().__init__(file_type, file_contents=file_content, **keywords) 185 186 187class XLSInMemory(XLSReader): 188 def __init__(self, file_stream, file_type, **keywords): 189 file_stream.seek(0) 190 super().__init__( 191 file_type, file_contents=file_stream.read(), **keywords 192 ) 193 194 195def xldate_to_python_date(value, date_mode): 196 """ 197 convert xl date to python date 198 """ 199 date_tuple = xlrd.xldate_as_tuple(value, date_mode) 200 201 ret = None 202 if date_tuple == (0, 0, 0, 0, 0, 0): 203 ret = datetime.datetime(1900, 1, 1, 0, 0, 0) 204 elif date_tuple[0:3] == (0, 0, 0): 205 ret = datetime.time(date_tuple[3], date_tuple[4], date_tuple[5]) 206 elif date_tuple[3:6] == (0, 0, 0): 207 ret = datetime.date(date_tuple[0], date_tuple[1], date_tuple[2]) 208 else: 209 ret = datetime.datetime( 210 date_tuple[0], 211 date_tuple[1], 212 date_tuple[2], 213 date_tuple[3], 214 date_tuple[4], 215 date_tuple[5], 216 ) 217 return ret 218