1#!/usr/bin/env python
2
3"""
4This module contains the XLSX extension to :class:`Table <agate.table.Table>`.
5"""
6
7import datetime
8from collections import OrderedDict
9
10import agate
11import openpyxl
12import six
13
14NULL_TIME = datetime.time(0, 0, 0)
15
16
17def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True,
18              reset_dimensions=False, row_limit=None, **kwargs):
19    """
20    Parse an XLSX file.
21
22    :param path:
23        Path to an XLSX file to load or a file-like object for one.
24    :param sheet:
25        The names or integer indices of the worksheets to load. If not specified
26        then the "active" sheet will be used.
27    :param skip_lines:
28        The number of rows to skip from the top of the sheet.
29    :param header:
30        If :code:`True`, the first row is assumed to contain column names.
31    :param reset_dimensions:
32        If :code:`True`, do not trust the dimensions in the file's properties,
33        and recalculate them based on the data in the file.
34    :param row_limit:
35        Limit how many rows of data will be read.
36    """
37    if not isinstance(skip_lines, int):
38        raise ValueError('skip_lines argument must be an int')
39
40    if hasattr(path, 'read'):
41        f = path
42    else:
43        f = open(path, 'rb')
44
45    book = openpyxl.load_workbook(f, read_only=read_only, data_only=True)
46
47    multiple = agate.utils.issequence(sheet)
48    if multiple:
49        sheets = sheet
50    else:
51        sheets = [sheet]
52
53    tables = OrderedDict()
54
55    for i, sheet in enumerate(sheets):
56        if isinstance(sheet, six.string_types):
57            try:
58                sheet = book[sheet]
59            except KeyError:
60                f.close()
61                raise
62        elif isinstance(sheet, int):
63            try:
64                sheet = book.worksheets[sheet]
65            except IndexError:
66                f.close()
67                raise
68        else:
69            sheet = book.active
70
71        column_names = None
72        offset = 0
73        rows = []
74
75        if reset_dimensions:
76            sheet.reset_dimensions()
77
78        if header:
79            sheet_header = sheet.iter_rows(min_row=1 + skip_lines, max_row=1 + skip_lines)
80            column_names = [None if c.value is None else six.text_type(c.value) for row in sheet_header for c in row]
81            offset = 1
82
83        if row_limit is None:
84            sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset)
85        else:
86            sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset, max_row=1 + skip_lines + offset + row_limit)
87
88        for i, row in enumerate(sheet_rows):
89            values = []
90
91            for c in row:
92                value = c.value
93
94                if value.__class__ is datetime.datetime:
95                    # Handle default XLSX date as 00:00 time
96                    if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
97                        value = value.time()
98
99                        value = normalize_datetime(value)
100                    elif value.time() == NULL_TIME:
101                        value = value.date()
102                    else:
103                        value = normalize_datetime(value)
104
105                values.append(value)
106
107            rows.append(values)
108
109        if 'column_names' in kwargs:
110            if not header:
111                column_names = kwargs['column_names']
112            del kwargs['column_names']
113
114        tables[sheet.title] = agate.Table(rows, column_names, **kwargs)
115
116    f.close()
117
118    if multiple:
119        return agate.MappedSequence(tables.values(), tables.keys())
120    else:
121        return tables.popitem()[1]
122
123
124def normalize_datetime(dt):
125    if dt.microsecond == 0:
126        return dt
127
128    ms = dt.microsecond
129
130    if ms < 1000:
131        return dt.replace(microsecond=0)
132    elif ms > 999000:
133        return dt.replace(microsecond=0) + datetime.timedelta(seconds=1)
134
135    return dt
136
137
138def has_date_elements(cell):
139    """
140    Try to use formatting to determine if a cell contains only time info.
141
142    See: http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx
143    """
144    return 'd' in cell.number_format or 'y' in cell.number_format
145
146
147agate.Table.from_xlsx = classmethod(from_xlsx)
148