#!/usr/bin/env python

"""
This module contains the Python 2 replacement for :mod:`csv`.
"""

import codecs
import csv
import warnings

import six

from agate.exceptions import FieldSizeLimitError

EIGHT_BIT_ENCODINGS = [
    'utf-8', 'u8', 'utf', 'utf8',
    'latin-1', 'iso-8859-1', 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'l1'
]

POSSIBLE_DELIMITERS = [',', '\t', ';', ' ', ':', '|']

# Keyword arguments that UnicodeReader consumes itself and that must never be
# forwarded to the stdlib csv.reader (which rejects unknown keywords).
_READER_ONLY_KWARGS = ('encoding', 'field_size_limit', 'line_numbers', 'header')


class UTF8Recoder(six.Iterator):
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8.

    :param f: The byte stream to read from.
    :param encoding: The name of the codec the stream is encoded in.
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.reader).encode('utf-8')


class UnicodeReader(object):
    """
    A CSV reader which will read rows from a file in a given encoding.

    :param f: The byte stream to read from.
    :param encoding: The encoding of ``f``.
    :param field_size_limit: If truthy, passed to :func:`csv.field_size_limit`.
        Note that this stdlib setting is global, not per-reader.
    :param line_numbers: If true, a line-number column is prepended to every
        row.
    :param header: If true, the first line is treated as a header: it receives
        the label ``line_numbers`` instead of a number and the numbering of
        the following rows is shifted down by one.
    :param kwargs: Passed through to :func:`csv.reader`.
    """
    def __init__(self, f, encoding='utf-8', field_size_limit=None, line_numbers=False, header=True, **kwargs):
        self.line_numbers = line_numbers
        self.header = header

        f = UTF8Recoder(f, encoding)

        self.reader = csv.reader(f, **kwargs)

        if field_size_limit:
            csv.field_size_limit(field_size_limit)

    def next(self):
        """
        Return the next row as a list of unicode strings.

        :raises FieldSizeLimitError: If a field exceeds the csv field size
            limit.
        """
        try:
            row = next(self.reader)
        except csv.Error as e:
            # Terrible way to test for this exception, but there is no subclass
            if 'field larger than field limit' in str(e):
                raise FieldSizeLimitError(csv.field_size_limit(), self.line_num)

            # Bare raise keeps the original traceback (``raise e`` resets it
            # on Python 2).
            raise

        if self.line_numbers:
            if self.header and self.line_num == 1:
                row.insert(0, 'line_numbers')
            else:
                row.insert(0, str(self.line_num - 1 if self.header else self.line_num))

        return [six.text_type(s, 'utf-8') for s in row]

    def __iter__(self):
        return self

    @property
    def dialect(self):
        """The dialect of the wrapped :func:`csv.reader`."""
        return self.reader.dialect

    @property
    def line_num(self):
        """The number of source lines read by the wrapped :func:`csv.reader`."""
        return self.reader.line_num


class UnicodeWriter(object):
    """
    A CSV writer which will write rows to a file in the specified encoding.

    NB: Optimized so that eight-bit encodings skip re-encoding. See:
        https://github.com/wireservice/csvkit/issues/175

    :param f: The stream to write to.
    :param encoding: The encoding of the output.
    :param kwargs: Passed through to :func:`csv.writer`.
    """
    def __init__(self, f, encoding='utf-8', **kwargs):
        self.encoding = encoding
        self._eight_bit = (self.encoding.lower().replace('_', '-') in EIGHT_BIT_ENCODINGS)

        if self._eight_bit:
            self.writer = csv.writer(f, **kwargs)
        else:
            # Redirect output to a queue for reencoding
            self.queue = six.StringIO()
            self.writer = csv.writer(self.queue, **kwargs)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """
        Write a single row. ``None`` values are written as empty strings.
        """
        if self._eight_bit:
            # Fast path: encode each cell straight into the target encoding.
            self.writer.writerow([six.text_type(s if s is not None else '').encode(self.encoding) for s in row])
        else:
            self.writer.writerow([six.text_type(s if s is not None else '').encode('utf-8') for s in row])
            # Fetch UTF-8 output from the queue...
            data = self.queue.getvalue()
            data = data.decode('utf-8')
            # ...and reencode it into the target encoding
            data = self.encoder.encode(data)
            # write to the file
            self.stream.write(data)
            # empty the queue
            self.queue.truncate(0)

    def writerows(self, rows):
        """
        Write every row in ``rows`` using :meth:`writerow`.
        """
        for row in rows:
            self.writerow(row)


class UnicodeDictReader(csv.DictReader):
    """
    Defer almost all implementation to :class:`csv.DictReader`, but wraps our
    unicode reader instead of :func:`csv.reader`.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None, *args, **kwargs):
        # NOTE(review): ``*args`` is forwarded to both UnicodeReader and
        # csv.DictReader, whose positional parameters differ; callers should
        # prefer keyword arguments. Confirm against call sites.
        reader = UnicodeReader(f, *args, **kwargs)

        # csv.DictReader builds its own csv.reader from these kwargs, and
        # csv.reader raises TypeError on keywords it does not know, so every
        # UnicodeReader-only option has to be removed, not just ``encoding``.
        for name in _READER_ONLY_KWARGS:
            kwargs.pop(name, None)

        csv.DictReader.__init__(self, f, fieldnames, restkey, restval, *args, **kwargs)

        # Replace the stdlib reader that DictReader created with ours.
        self.reader = reader


class UnicodeDictWriter(csv.DictWriter):
    """
    Defer almost all implementation to :class:`csv.DictWriter`, but wraps our
    unicode writer instead of :func:`csv.writer`.

    :raises ValueError: If ``extrasaction`` is neither ``raise`` nor
        ``ignore``.
    """
    def __init__(self, f, fieldnames, restval='', extrasaction='raise', *args, **kwds):
        # csv.DictWriter.__init__ is deliberately not called: it would build a
        # plain csv.writer. The attributes it would set are assigned here.
        self.fieldnames = fieldnames
        self.restval = restval

        if extrasaction.lower() not in ('raise', 'ignore'):
            raise ValueError('extrasaction (%s) must be "raise" or "ignore"' % extrasaction)

        self.extrasaction = extrasaction

        self.writer = UnicodeWriter(f, *args, **kwds)


class Reader(UnicodeReader):
    """
    A unicode-aware CSV reader.
    """
    pass


class Writer(UnicodeWriter):
    """
    A unicode-aware CSV writer.

    :param line_numbers: If true, a ``line_number`` column is prepended: the
        first row written receives the column label, later rows their
        1-based count.
    """
    def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers

        if 'lineterminator' not in kwargs:
            kwargs['lineterminator'] = '\n'

        UnicodeWriter.__init__(self, f, encoding, **kwargs)

    def _append_line_number(self, row):
        # The first row written is treated as the header row.
        if self.row_count == 0:
            row.insert(0, 'line_number')
        else:
            row.insert(0, self.row_count)

        self.row_count += 1

    def writerow(self, row):
        if self.line_numbers:
            # Copy so the caller's sequence is not mutated.
            row = list(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        row = [i.replace('\r', '\n') if isinstance(i, six.string_types) else i for i in row]

        UnicodeWriter.writerow(self, row)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


class DictReader(UnicodeDictReader):
    """
    A unicode-aware CSV DictReader.
    """
    pass


class DictWriter(UnicodeDictWriter):
    """
    A unicode-aware CSV DictWriter.

    :param line_numbers: If true, a ``line_number`` field is prepended to
        ``fieldnames``: the first row written (normally the header, via
        ``writeheader()``) receives the column label, later rows their
        1-based count.
    """
    def __init__(self, f, fieldnames, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers

        if 'lineterminator' not in kwargs:
            kwargs['lineterminator'] = '\n'

        if self.line_numbers:
            # The extra key must be a known field, otherwise the default
            # extrasaction='raise' rejects every row. Build a new list so the
            # caller's fieldnames are left untouched.
            fieldnames = ['line_number'] + list(fieldnames)

        UnicodeDictWriter.__init__(self, f, fieldnames, encoding=encoding, **kwargs)

    def _append_line_number(self, row):
        # The first row written is treated as the header row, matching Writer.
        if self.row_count == 0:
            row['line_number'] = 'line_number'
        else:
            row['line_number'] = self.row_count

        self.row_count += 1

    def writerow(self, row):
        # Build a fresh dict (the caller's mapping is never mutated) and
        # convert embedded Mac line endings to unix style line endings so
        # they get quoted.
        row = {
            k: v.replace('\r', '\n') if isinstance(v, six.string_types) else v
            for k, v in row.items()
        }

        if self.line_numbers:
            self._append_line_number(row)

        UnicodeDictWriter.writerow(self, row)

    def writerows(self, rows):
        # Routed through writerow so line numbers and line-ending conversion
        # are applied to every row.
        for row in rows:
            self.writerow(row)


class Sniffer(object):
    """
    A functional wrapper of ``csv.Sniffer()``.
    """
    def sniff(self, sample):
        """
        A functional version of ``csv.Sniffer().sniff``, that extends the
        list of possible delimiters to include some seen in the wild.

        Returns the detected dialect, or ``None`` (after emitting a
        ``RuntimeWarning``) if sniffing fails.
        """
        try:
            dialect = csv.Sniffer().sniff(sample, POSSIBLE_DELIMITERS)
        except csv.Error as e:
            warnings.warn('Error sniffing CSV dialect: %s' % e, RuntimeWarning, stacklevel=2)
            dialect = None

        return dialect


def reader(*args, **kwargs):
    """
    A replacement for Python's :func:`csv.reader` that uses
    :class:`.csv_py2.Reader`.
    """
    return Reader(*args, **kwargs)


def writer(*args, **kwargs):
    """
    A replacement for Python's :func:`csv.writer` that uses
    :class:`.csv_py2.Writer`.
    """
    return Writer(*args, **kwargs)