1# -*- coding: ascii -*-
2from __future__ import print_function
3
4import sys, glob, string
5
6from .compat import xrange, unicode
7
8try:
9    from xlrd import open_workbook, XL_CELL_EMPTY, XL_CELL_BLANK, XL_CELL_TEXT, XL_CELL_NUMBER, cellname
10    null_cell_types = (XL_CELL_EMPTY, XL_CELL_BLANK)
11except ImportError:
12    # older version
13    from xlrd import open_workbook, XL_CELL_EMPTY, XL_CELL_TEXT, XL_CELL_NUMBER
14    null_cell_types = (XL_CELL_EMPTY, )
15
16def cells_all_junk(cells, is_rubbish=None):
17    """\
18    Return True if all cells in the sequence are junk.
19    What qualifies as junk:
20    -- empty cell
21    -- blank cell
22    -- zero-length text
23    -- text is all whitespace
24    -- number cell and is 0.0
25    -- text cell and is_rubbish(cell.value) returns True.
26    """
27    for cell in cells:
28        if cell.ctype in null_cell_types:
29            continue
30        if cell.ctype == XL_CELL_TEXT:
31            if not cell.value:
32                continue
33            if cell.value.isspace():
34                continue
35        if cell.ctype == XL_CELL_NUMBER:
36            if not cell.value:
37                continue
38        if is_rubbish is not None and is_rubbish(cell):
39            continue
40        return False
41    return True
42
43def ispunc(c, s=set(unicode(string.punctuation))):
44    """Return True if c is a single punctuation character"""
45    return c in s
46
47def number_of_good_rows(sheet, checker=None, nrows=None, ncols=None):
48    """Return 1 + the index of the last row with meaningful data in it."""
49    if nrows is None: nrows = sheet.nrows
50    if ncols is None: ncols = sheet.ncols
51    for rowx in xrange(nrows - 1, -1, -1):
52        if not cells_all_junk(sheet.row_slice(rowx, 0, ncols), checker):
53            return rowx + 1
54    return 0
55
56def number_of_good_cols(sheet, checker=None, nrows=None, ncols=None):
57    """Return 1 + the index of the last column with meaningful data in it."""
58    if nrows is None: nrows = sheet.nrows
59    if ncols is None: ncols = sheet.ncols
60    for colx in xrange(ncols - 1, -1, -1):
61        if not cells_all_junk(sheet.col_slice(colx, 0, nrows), checker):
62            return colx+1
63    return 0
64
65def safe_encode(ustr, encoding):
66    try:
67        return ustr.encode(encoding)
68    except (UnicodeEncodeError, UnicodeError):
69        return repr(ustr)
70
71def check_file(fname, verbose, do_punc=False, fmt_info=0, encoding='ascii', onesheet=''):
72    print()
73    print(fname)
74    if do_punc:
75        checker = ispunc
76    else:
77        checker = None
78    try:
79        book = open_workbook(fname, formatting_info=fmt_info, on_demand=True)
80    except TypeError:
81        try:
82            book = open_workbook(fname, formatting_info=fmt_info)
83        except TypeError:
84            # this is becoming ridiculous
85            book = open_workbook(fname)
86    totold = totnew = totnotnull = 0
87    if onesheet is None or onesheet == "":
88        shxrange = range(book.nsheets)
89    else:
90        try:
91            shxrange = [int(onesheet)]
92        except ValueError:
93            shxrange = [book.sheet_names().index(onesheet)]
94    for shx in shxrange:
95        sheet = book.sheet_by_index(shx)
96        ngoodrows = number_of_good_rows(sheet, checker)
97        ngoodcols = number_of_good_cols(sheet, checker, nrows=ngoodrows)
98        oldncells = sheet.nrows * sheet.ncols
99        newncells = ngoodrows * ngoodcols
100        totold += oldncells
101        totnew += newncells
102        nnotnull = 0
103        sheet_density_pct_s = ''
104        if verbose >= 2:
105            colxrange = range(ngoodcols)
106            for rowx in xrange(ngoodrows):
107                rowtypes = sheet.row_types(rowx)
108                for colx in colxrange:
109                    if rowtypes[colx] not in null_cell_types:
110                        nnotnull += 1
111            totnotnull += nnotnull
112            sheet_density_pct = (nnotnull * 100.0) / max(1, newncells)
113            sheet_density_pct_s = "; den = %5.1f%%" % sheet_density_pct
114        if verbose >= 3:
115            # which rows have non_empty cells in the right-most column?
116            lastcolx = sheet.ncols - 1
117            for rowx in xrange(sheet.nrows):
118                cell = sheet.cell(rowx, lastcolx)
119                if cell.ctype != XL_CELL_EMPTY:
120                    print("%s (%d, %d): type %d, value %r" % (
121                        cellname(rowx, lastcolx),
122                        rowx, lastcolx, cell.ctype, cell.value
123                    ))
124        if (verbose
125            or ngoodrows != sheet.nrows
126            or ngoodcols != sheet.ncols
127            or (verbose >= 2 and sheet_density_pct < 90.0)
128            ):
129            if oldncells:
130                pctwaste = (1.0 - float(newncells) / oldncells) * 100.0
131            else:
132                pctwaste = 0.0
133            shname_enc = safe_encode(sheet.name, encoding)
134            print(
135                "sheet #%2d: RxC %5d x %3d => %5d x %3d; %4.1f%% waste%s (%s)"
136                % (shx, sheet.nrows, sheet.ncols,
137                   ngoodrows, ngoodcols, pctwaste,
138                   sheet_density_pct_s, shname_enc))
139        if hasattr(book, 'unload_sheet'):
140            book.unload_sheet(shx)
141    if totold:
142        pctwaste = (1.0 - float(totnew) / totold) * 100.0
143    else:
144        pctwaste = 0.0
145    print("%d cells => %d cells; %4.1f%% waste" % (totold, totnew, pctwaste))
146
147def main():
148    import optparse
149    usage = "%prog [options] input-file-patterns"
150    oparser = optparse.OptionParser(usage)
151    oparser.add_option(
152        "-v", "--verbosity",
153        type="int", default=0,
154        help="level of information and diagnostics provided")
155    oparser.add_option(
156        "-p", "--punc",
157        action="store_true", default=False,
158        help="treat text cells containing only 1 punctuation char as rubbish")
159    oparser.add_option(
160        "-e", "--encoding",
161        default='',
162        help="encoding for text output")
163    oparser.add_option(
164        "-f", "--formatting",
165        action="store_true", default=False,
166        help="parse formatting information in the input files")
167    oparser.add_option(
168        "-s", "--onesheet",
169        default="",
170        help="restrict output to this sheet (name or index)")
171    options, args = oparser.parse_args(sys.argv[1:])
172    if len(args) < 1:
173        oparser.error("Expected at least 1 arg, found %d" % len(args))
174    encoding = options.encoding
175    if not encoding:
176        encoding = sys.stdout.encoding
177    if not encoding:
178        encoding = sys.getdefaultencoding()
179    for pattern in args:
180        for fname in glob.glob(pattern):
181            try:
182                check_file(fname,
183                    options.verbosity, options.punc,
184                    options.formatting, encoding, options.onesheet)
185            except:
186                e1, e2 = sys.exc_info()[:2]
187                print("*** File %s => %s:%s" % (fname, e1.__name__, e2))
188
189if __name__ == "__main__":
190    main()
191