1# -*- coding: ascii -*- 2from __future__ import print_function 3 4import sys, glob, string 5 6from .compat import xrange, unicode 7 8try: 9 from xlrd import open_workbook, XL_CELL_EMPTY, XL_CELL_BLANK, XL_CELL_TEXT, XL_CELL_NUMBER, cellname 10 null_cell_types = (XL_CELL_EMPTY, XL_CELL_BLANK) 11except ImportError: 12 # older version 13 from xlrd import open_workbook, XL_CELL_EMPTY, XL_CELL_TEXT, XL_CELL_NUMBER 14 null_cell_types = (XL_CELL_EMPTY, ) 15 16def cells_all_junk(cells, is_rubbish=None): 17 """\ 18 Return True if all cells in the sequence are junk. 19 What qualifies as junk: 20 -- empty cell 21 -- blank cell 22 -- zero-length text 23 -- text is all whitespace 24 -- number cell and is 0.0 25 -- text cell and is_rubbish(cell.value) returns True. 26 """ 27 for cell in cells: 28 if cell.ctype in null_cell_types: 29 continue 30 if cell.ctype == XL_CELL_TEXT: 31 if not cell.value: 32 continue 33 if cell.value.isspace(): 34 continue 35 if cell.ctype == XL_CELL_NUMBER: 36 if not cell.value: 37 continue 38 if is_rubbish is not None and is_rubbish(cell): 39 continue 40 return False 41 return True 42 43def ispunc(c, s=set(unicode(string.punctuation))): 44 """Return True if c is a single punctuation character""" 45 return c in s 46 47def number_of_good_rows(sheet, checker=None, nrows=None, ncols=None): 48 """Return 1 + the index of the last row with meaningful data in it.""" 49 if nrows is None: nrows = sheet.nrows 50 if ncols is None: ncols = sheet.ncols 51 for rowx in xrange(nrows - 1, -1, -1): 52 if not cells_all_junk(sheet.row_slice(rowx, 0, ncols), checker): 53 return rowx + 1 54 return 0 55 56def number_of_good_cols(sheet, checker=None, nrows=None, ncols=None): 57 """Return 1 + the index of the last column with meaningful data in it.""" 58 if nrows is None: nrows = sheet.nrows 59 if ncols is None: ncols = sheet.ncols 60 for colx in xrange(ncols - 1, -1, -1): 61 if not cells_all_junk(sheet.col_slice(colx, 0, nrows), checker): 62 return colx+1 63 return 0 64 65def safe_encode(ustr, encoding): 66 try: 67 return ustr.encode(encoding) 68 except (UnicodeEncodeError, UnicodeError): 69 return repr(ustr) 70 71def check_file(fname, verbose, do_punc=False, fmt_info=0, encoding='ascii', onesheet=''): 72 print() 73 print(fname) 74 if do_punc: 75 checker = ispunc 76 else: 77 checker = None 78 try: 79 book = open_workbook(fname, formatting_info=fmt_info, on_demand=True) 80 except TypeError: 81 try: 82 book = open_workbook(fname, formatting_info=fmt_info) 83 except TypeError: 84 # this is becoming ridiculous 85 book = open_workbook(fname) 86 totold = totnew = totnotnull = 0 87 if onesheet is None or onesheet == "": 88 shxrange = range(book.nsheets) 89 else: 90 try: 91 shxrange = [int(onesheet)] 92 except ValueError: 93 shxrange = [book.sheet_names().index(onesheet)] 94 for shx in shxrange: 95 sheet = book.sheet_by_index(shx) 96 ngoodrows = number_of_good_rows(sheet, checker) 97 ngoodcols = number_of_good_cols(sheet, checker, nrows=ngoodrows) 98 oldncells = sheet.nrows * sheet.ncols 99 newncells = ngoodrows * ngoodcols 100 totold += oldncells 101 totnew += newncells 102 nnotnull = 0 103 sheet_density_pct_s = '' 104 if verbose >= 2: 105 colxrange = range(ngoodcols) 106 for rowx in xrange(ngoodrows): 107 rowtypes = sheet.row_types(rowx) 108 for colx in colxrange: 109 if rowtypes[colx] not in null_cell_types: 110 nnotnull += 1 111 totnotnull += nnotnull 112 sheet_density_pct = (nnotnull * 100.0) / max(1, newncells) 113 sheet_density_pct_s = "; den = %5.1f%%" % sheet_density_pct 114 if verbose >= 3: 115 # which rows have non_empty cells in the right-most column? 116 lastcolx = sheet.ncols - 1 117 for rowx in xrange(sheet.nrows): 118 cell = sheet.cell(rowx, lastcolx) 119 if cell.ctype != XL_CELL_EMPTY: 120 print("%s (%d, %d): type %d, value %r" % ( 121 cellname(rowx, lastcolx), 122 rowx, lastcolx, cell.ctype, cell.value 123 )) 124 if (verbose 125 or ngoodrows != sheet.nrows 126 or ngoodcols != sheet.ncols 127 or (verbose >= 2 and sheet_density_pct < 90.0) 128 ): 129 if oldncells: 130 pctwaste = (1.0 - float(newncells) / oldncells) * 100.0 131 else: 132 pctwaste = 0.0 133 shname_enc = safe_encode(sheet.name, encoding) 134 print( 135 "sheet #%2d: RxC %5d x %3d => %5d x %3d; %4.1f%% waste%s (%s)" 136 % (shx, sheet.nrows, sheet.ncols, 137 ngoodrows, ngoodcols, pctwaste, 138 sheet_density_pct_s, shname_enc)) 139 if hasattr(book, 'unload_sheet'): 140 book.unload_sheet(shx) 141 if totold: 142 pctwaste = (1.0 - float(totnew) / totold) * 100.0 143 else: 144 pctwaste = 0.0 145 print("%d cells => %d cells; %4.1f%% waste" % (totold, totnew, pctwaste)) 146 147def main(): 148 import optparse 149 usage = "%prog [options] input-file-patterns" 150 oparser = optparse.OptionParser(usage) 151 oparser.add_option( 152 "-v", "--verbosity", 153 type="int", default=0, 154 help="level of information and diagnostics provided") 155 oparser.add_option( 156 "-p", "--punc", 157 action="store_true", default=False, 158 help="treat text cells containing only 1 punctuation char as rubbish") 159 oparser.add_option( 160 "-e", "--encoding", 161 default='', 162 help="encoding for text output") 163 oparser.add_option( 164 "-f", "--formatting", 165 action="store_true", default=False, 166 help="parse formatting information in the input files") 167 oparser.add_option( 168 "-s", "--onesheet", 169 default="", 170 help="restrict output to this sheet (name or index)") 171 options, args = oparser.parse_args(sys.argv[1:]) 172 if len(args) < 1: 173 oparser.error("Expected at least 1 arg, found %d" % len(args)) 174 encoding = options.encoding 175 if not encoding: 176 encoding = sys.stdout.encoding 177 if not encoding: 178 encoding = sys.getdefaultencoding() 179 for pattern in args: 180 for fname in glob.glob(pattern): 181 try: 182 check_file(fname, 183 options.verbosity, options.punc, 184 options.formatting, encoding, options.onesheet) 185 except: 186 e1, e2 = sys.exc_info()[:2] 187 print("*** File %s => %s:%s" % (fname, e1.__name__, e2)) 188 189if __name__ == "__main__": 190 main() 191