1#!/usr/bin/env python
2
3"""
4This module contains the Python 2 replacement for :mod:`csv`.
5"""
6
7import codecs
8import csv
9import warnings
10
11import six
12
13from agate.exceptions import FieldSizeLimitError
14
# Encoding names (compared after lower-casing and replacing '_' with '-')
# for which UnicodeWriter hands rows straight to csv.writer instead of
# routing them through an intermediate queue for re-encoding.
EIGHT_BIT_ENCODINGS = [
    # UTF-8 and its aliases
    'utf-8',
    'u8',
    'utf',
    'utf8',
    # Latin-1 and its aliases
    'latin-1',
    'iso-8859-1',
    'iso8859-1',
    '8859',
    'cp819',
    'latin',
    'latin1',
    'l1',
]

# Candidate delimiters handed to csv.Sniffer by Sniffer.sniff.
POSSIBLE_DELIMITERS = [
    ',',
    '\t',
    ';',
    ' ',
    ':',
    '|',
]
21
22
class UTF8Recoder(six.Iterator):
    """
    Iterator adapter that decodes a stream using ``encoding`` and yields each
    item re-encoded as UTF-8 bytes.

    :param f:
        The underlying stream to read from.
    :param encoding:
        Name of the codec used to decode ``f``.
    """
    def __init__(self, f, encoding):
        # Look up the codec's stream reader class, then wrap the stream with it.
        stream_reader_cls = codecs.getreader(encoding)
        self.reader = stream_reader_cls(f)

    def __iter__(self):
        return self

    def __next__(self):
        # The wrapped reader yields decoded text; hand it back as UTF-8.
        decoded = next(self.reader)
        return decoded.encode('utf-8')
35
36
class UnicodeReader(object):
    """
    A CSV reader which will read rows from a file in a given encoding.

    :param f:
        The stream to read from. It is wrapped in :class:`UTF8Recoder` before
        being handed to :func:`csv.reader`.
    :param encoding:
        Name of the codec used to decode ``f``.
    :param field_size_limit:
        If truthy, passed to :func:`csv.field_size_limit`. That limit is a
        setting of the :mod:`csv` module itself, so it is not scoped to this
        reader instance.
    :param line_numbers:
        If true, a line-number cell is inserted at the start of every row.
    :param header:
        If true, the row read while ``line_num == 1`` is treated as the header
        and receives the label ``line_numbers`` instead of a number.
    :param kwargs:
        Forwarded unchanged to :func:`csv.reader`.
    """
    def __init__(self, f, encoding='utf-8', field_size_limit=None, line_numbers=False, header=True, **kwargs):
        self.line_numbers = line_numbers
        self.header = header

        f = UTF8Recoder(f, encoding)

        self.reader = csv.reader(f, **kwargs)

        if field_size_limit:
            csv.field_size_limit(field_size_limit)

    def next(self):
        """
        Return the next row as a list of unicode strings.

        :raises FieldSizeLimitError:
            When the underlying reader reports a field larger than the
            configured limit.
        :raises csv.Error:
            Any other parsing error, re-raised unchanged.
        """
        try:
            row = next(self.reader)
        except csv.Error as e:
            # Terrible way to test for this exception, but there is no subclass
            if 'field larger than field limit' in str(e):
                raise FieldSizeLimitError(csv.field_size_limit(), self.line_num)

            # A bare ``raise`` re-raises the active exception with its original
            # traceback; ``raise e`` would restart the traceback here.
            raise

        if self.line_numbers:
            # NOTE(review): ``line_num`` counts lines read from the source, not
            # records, so a header containing embedded newlines will not have
            # ``line_num == 1`` and would be numbered like a data row.
            if self.header and self.line_num == 1:
                row.insert(0, 'line_numbers')
            else:
                # With a header, data rows are numbered from 1 rather than 2.
                row.insert(0, str(self.line_num - 1 if self.header else self.line_num))

        # Cells come back from csv.reader as UTF-8 bytes; decode them.
        return [six.text_type(s, 'utf-8') for s in row]

    def __iter__(self):
        return self

    @property
    def dialect(self):
        """The dialect of the wrapped :func:`csv.reader` object."""
        return self.reader.dialect

    @property
    def line_num(self):
        """The ``line_num`` of the wrapped :func:`csv.reader` object."""
        return self.reader.line_num
80
81
class UnicodeWriter(object):
    """
    A CSV writer which will write rows to a file in the specified encoding.

    NB: Optimized so that eight-bit encodings skip re-encoding. See:
        https://github.com/wireservice/csvkit/issues/175

    :param f:
        The stream rows are ultimately written to.
    :param encoding:
        Name of the target encoding.
    :param kwargs:
        Forwarded unchanged to :func:`csv.writer`.
    """
    def __init__(self, f, encoding='utf-8', **kwargs):
        self.encoding = encoding

        # Normalise the name so that e.g. 'UTF_8' matches 'utf-8'.
        normalized_name = self.encoding.lower().replace('_', '-')
        self._eight_bit = normalized_name in EIGHT_BIT_ENCODINGS

        if not self._eight_bit:
            # Redirect output to a queue for reencoding
            self.queue = six.StringIO()
            self.writer = csv.writer(self.queue, **kwargs)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()
        else:
            # Rows can be written directly to the target stream.
            self.writer = csv.writer(f, **kwargs)

    def writerow(self, row):
        """
        Write a single row; ``None`` cells are written as empty strings.
        """
        if self._eight_bit:
            direct_cells = [
                six.text_type('' if cell is None else cell).encode(self.encoding)
                for cell in row
            ]
            self.writer.writerow(direct_cells)
            return

        utf8_cells = [
            six.text_type('' if cell is None else cell).encode('utf-8')
            for cell in row
        ]
        self.writer.writerow(utf8_cells)

        # Fetch UTF-8 output from the queue...
        buffered = self.queue.getvalue()
        text = buffered.decode('utf-8')
        # ...and reencode it into the target encoding
        encoded = self.encoder.encode(text)
        # write to the file
        self.stream.write(encoded)
        # empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """
        Write every row of ``rows`` using :meth:`writerow`.
        """
        for single_row in rows:
            self.writerow(single_row)
120
121
class UnicodeDictReader(csv.DictReader):
    """
    Defer almost all implementation to :class:`csv.DictReader`, but wraps our
    unicode reader instead of :func:`csv.reader`.

    :param f:
        The stream to read from.
    :param fieldnames:
        Passed to :class:`csv.DictReader`.
    :param restkey:
        Passed to :class:`csv.DictReader`.
    :param restval:
        Passed to :class:`csv.DictReader`.
    :param args:
        Extra positional arguments, forwarded as-is to both
        :class:`UnicodeReader` and :class:`csv.DictReader`.
    :param kwargs:
        Keyword arguments for :class:`UnicodeReader`. Those that only
        :class:`UnicodeReader` understands are removed before the remainder is
        forwarded to :class:`csv.DictReader`.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None, *args, **kwargs):
        reader = UnicodeReader(f, *args, **kwargs)

        # csv.DictReader hands its keyword arguments on to csv.reader, which
        # rejects names it does not know. Strip every option that exists only
        # on UnicodeReader, not just 'encoding'.
        for reader_only_option in ('encoding', 'field_size_limit', 'line_numbers', 'header'):
            kwargs.pop(reader_only_option, None)

        csv.DictReader.__init__(self, f, fieldnames, restkey, restval, *args, **kwargs)

        # Replace the plain csv.reader created by the base class.
        self.reader = reader
136
137
class UnicodeDictWriter(csv.DictWriter):
    """
    Defer almost all implementation to :class:`csv.DictWriter`, but wraps our
    unicode writer instead of :func:`csv.writer`.

    :param f:
        The stream to write to.
    :param fieldnames:
        Stored on the instance as ``fieldnames``.
    :param restval:
        Stored on the instance as ``restval``.
    :param extrasaction:
        Either ``raise`` or ``ignore`` (checked case-insensitively).
    :param args:
        Forwarded to :class:`UnicodeWriter`.
    :param kwds:
        Forwarded to :class:`UnicodeWriter`.
    :raises ValueError:
        If ``extrasaction`` is neither ``raise`` nor ``ignore``.
    """
    def __init__(self, f, fieldnames, restval='', extrasaction='raise', *args, **kwds):
        self.fieldnames = fieldnames
        self.restval = restval

        permitted_actions = ('raise', 'ignore')
        action_is_valid = extrasaction.lower() in permitted_actions

        if not action_is_valid:
            raise ValueError('extrasaction (%s) must be "raise" or "ignore"' % extrasaction)

        self.extrasaction = extrasaction

        # The base class __init__ is deliberately not called; the writer is
        # our unicode-aware one rather than a plain csv.writer.
        self.writer = UnicodeWriter(f, *args, **kwds)
153
154
class Reader(UnicodeReader):
    """
    A unicode-aware CSV reader.

    Inherits everything from :class:`UnicodeReader` without modification.
    """
160
161
class Writer(UnicodeWriter):
    """
    A unicode-aware CSV writer.

    :param f:
        The stream to write to.
    :param encoding:
        Name of the target encoding.
    :param line_numbers:
        If true, a line-number cell is inserted at the start of every row.
    :param kwargs:
        Forwarded to :class:`UnicodeWriter`; ``lineterminator`` defaults to
        a single newline when not supplied.
    """
    def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers

        # Only supplies the default when the caller did not pass one.
        kwargs.setdefault('lineterminator', '\n')

        UnicodeWriter.__init__(self, f, encoding, **kwargs)

    def _append_line_number(self, row):
        """
        Insert the line-number cell at the front of ``row`` (in place) and
        advance the row counter. The first row receives the column label.
        """
        leading_cell = 'line_number' if self.row_count == 0 else self.row_count
        row.insert(0, leading_cell)

        self.row_count += 1

    def writerow(self, row):
        """
        Write one row, optionally prefixed with its line number.
        """
        if self.line_numbers:
            # Copy first so the caller's row is not modified.
            row = list(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        normalized = []
        for cell in row:
            if isinstance(cell, six.string_types):
                cell = cell.replace('\r', '\n')
            normalized.append(cell)

        UnicodeWriter.writerow(self, normalized)

    def writerows(self, rows):
        """
        Write every row of ``rows`` using :meth:`writerow`.
        """
        for single_row in rows:
            self.writerow(single_row)
196
197
class DictReader(UnicodeDictReader):
    """
    A unicode-aware CSV DictReader.

    Inherits everything from :class:`UnicodeDictReader` without modification.
    """
203
204
class DictWriter(UnicodeDictWriter):
    """
    A unicode-aware CSV DictWriter.

    :param f:
        The stream to write to.
    :param fieldnames:
        Sequence of keys identifying the order of the output columns.
    :param encoding:
        Name of the target encoding.
    :param line_numbers:
        If true, a ``line_number`` column is added in front of ``fieldnames``
        and filled in for every row written. As in :class:`Writer`, the first
        row written is treated as the header row and receives the column
        label rather than a number.
    :param kwargs:
        Forwarded to :class:`UnicodeDictWriter`; ``lineterminator`` defaults
        to a single newline when not supplied.
    """
    def __init__(self, f, fieldnames, encoding='utf-8', line_numbers=False, **kwargs):
        self.row_count = 0
        self.line_numbers = line_numbers

        if 'lineterminator' not in kwargs:
            kwargs['lineterminator'] = '\n'

        if self.line_numbers:
            # The column must be a known field or it could never be written.
            # Build a new list so the caller's sequence is left untouched.
            fieldnames = ['line_number'] + list(fieldnames)

        UnicodeDictWriter.__init__(self, f, fieldnames, encoding=encoding, **kwargs)

    def _append_line_number(self, row):
        """
        Set ``row['line_number']`` (in place) and advance the row counter.
        """
        if self.row_count == 0:
            # First row written: label the column, matching Writer.
            row['line_number'] = 'line_number'
        else:
            row['line_number'] = self.row_count

        self.row_count += 1

    def writerow(self, row):
        """
        Write one row given as a mapping of field name to value.
        """
        # Convert embedded Mac line endings to unix style line endings so they get quoted.
        # This also builds a fresh dict, so the caller's mapping is never modified.
        row = dict(
            (k, v.replace('\r', '\n') if isinstance(v, six.string_types) else v)
            for k, v in row.items()
        )

        if self.line_numbers:
            self._append_line_number(row)

        UnicodeDictWriter.writerow(self, row)

    def writerows(self, rows):
        """
        Write every row of ``rows`` using :meth:`writerow`.
        """
        for row in rows:
            self.writerow(row)
241
242
class Sniffer(object):
    """
    A functional wrapper of ``csv.Sniffer()``.
    """
    def sniff(self, sample):
        """
        Detect the CSV dialect of ``sample`` with ``csv.Sniffer().sniff``,
        trying the module's extended list of possible delimiters.

        :param sample:
            Text to inspect.
        :returns:
            The detected dialect, or ``None`` if sniffing failed (a
            ``RuntimeWarning`` is issued in that case).
        """
        stdlib_sniffer = csv.Sniffer()

        try:
            return stdlib_sniffer.sniff(sample, POSSIBLE_DELIMITERS)
        except csv.Error as e:
            # Sniffing is best-effort: report the problem and carry on.
            warnings.warn('Error sniffing CSV dialect: %s' % e, RuntimeWarning, stacklevel=2)
            return None
259
260
def reader(*args, **kwargs):
    """
    Build a :class:`.csv_py2.Reader`; intended as a stand-in for Python's
    :func:`csv.reader`. All arguments are passed straight through.
    """
    unicode_reader = Reader(*args, **kwargs)
    return unicode_reader
267
268
def writer(*args, **kwargs):
    """
    Build a :class:`.csv_py2.Writer`; intended as a stand-in for Python's
    :func:`csv.writer`. All arguments are passed straight through.
    """
    unicode_writer = Writer(*args, **kwargs)
    return unicode_writer
275