1#!/usr/bin/env python 2 3""" 4This module contains the XLSX extension to :class:`Table <agate.table.Table>`. 5""" 6 7import datetime 8from collections import OrderedDict 9 10import agate 11import openpyxl 12import six 13 14NULL_TIME = datetime.time(0, 0, 0) 15 16 17def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True, 18 reset_dimensions=False, row_limit=None, **kwargs): 19 """ 20 Parse an XLSX file. 21 22 :param path: 23 Path to an XLSX file to load or a file-like object for one. 24 :param sheet: 25 The names or integer indices of the worksheets to load. If not specified 26 then the "active" sheet will be used. 27 :param skip_lines: 28 The number of rows to skip from the top of the sheet. 29 :param header: 30 If :code:`True`, the first row is assumed to contain column names. 31 :param reset_dimensions: 32 If :code:`True`, do not trust the dimensions in the file's properties, 33 and recalculate them based on the data in the file. 34 :param row_limit: 35 Limit how many rows of data will be read. 36 """ 37 if not isinstance(skip_lines, int): 38 raise ValueError('skip_lines argument must be an int') 39 40 if hasattr(path, 'read'): 41 f = path 42 else: 43 f = open(path, 'rb') 44 45 book = openpyxl.load_workbook(f, read_only=read_only, data_only=True) 46 47 multiple = agate.utils.issequence(sheet) 48 if multiple: 49 sheets = sheet 50 else: 51 sheets = [sheet] 52 53 tables = OrderedDict() 54 55 for i, sheet in enumerate(sheets): 56 if isinstance(sheet, six.string_types): 57 try: 58 sheet = book[sheet] 59 except KeyError: 60 f.close() 61 raise 62 elif isinstance(sheet, int): 63 try: 64 sheet = book.worksheets[sheet] 65 except IndexError: 66 f.close() 67 raise 68 else: 69 sheet = book.active 70 71 column_names = None 72 offset = 0 73 rows = [] 74 75 if reset_dimensions: 76 sheet.reset_dimensions() 77 78 if header: 79 sheet_header = sheet.iter_rows(min_row=1 + skip_lines, max_row=1 + skip_lines) 80 column_names = [None if c.value is None else six.text_type(c.value) for row in sheet_header for c in row] 81 offset = 1 82 83 if row_limit is None: 84 sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset) 85 else: 86 sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset, max_row=1 + skip_lines + offset + row_limit) 87 88 for i, row in enumerate(sheet_rows): 89 values = [] 90 91 for c in row: 92 value = c.value 93 94 if value.__class__ is datetime.datetime: 95 # Handle default XLSX date as 00:00 time 96 if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c): 97 value = value.time() 98 99 value = normalize_datetime(value) 100 elif value.time() == NULL_TIME: 101 value = value.date() 102 else: 103 value = normalize_datetime(value) 104 105 values.append(value) 106 107 rows.append(values) 108 109 if 'column_names' in kwargs: 110 if not header: 111 column_names = kwargs['column_names'] 112 del kwargs['column_names'] 113 114 tables[sheet.title] = agate.Table(rows, column_names, **kwargs) 115 116 f.close() 117 118 if multiple: 119 return agate.MappedSequence(tables.values(), tables.keys()) 120 else: 121 return tables.popitem()[1] 122 123 124def normalize_datetime(dt): 125 if dt.microsecond == 0: 126 return dt 127 128 ms = dt.microsecond 129 130 if ms < 1000: 131 return dt.replace(microsecond=0) 132 elif ms > 999000: 133 return dt.replace(microsecond=0) + datetime.timedelta(seconds=1) 134 135 return dt 136 137 138def has_date_elements(cell): 139 """ 140 Try to use formatting to determine if a cell contains only time info. 141 142 See: http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx 143 """ 144 return 'd' in cell.number_format or 'y' in cell.number_format 145 146 147agate.Table.from_xlsx = classmethod(from_xlsx) 148