1#       Copyright (C) 2008-2019 Vicent Mas. All rights reserved
2#
3#       This program is free software: you can redistribute it and/or modify
4#       it under the terms of the GNU General Public License as published by
5#       the Free Software Foundation, either version 3 of the License, or
6#       (at your option) any later version.
7#
8#       This program is distributed in the hope that it will be useful,
9#       but WITHOUT ANY WARRANTY; without even the implied warranty of
10#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11#       GNU General Public License for more details.
12#
13#       You should have received a copy of the GNU General Public License
14#       along with this program.  If not, see <http://www.gnu.org/licenses/>.
15#
16#       Author:  Vicent Mas - vmas@vitables.org
17
18#
19# Plugin initial draft author: Jorge Ibanez jorge.ibannez@uam.es
20#
21
22"""Convenience functions for the import_csv.py module.
23"""
24
25__docformat__ = 'restructuredtext'
26
27import logging
28import os
29import re
30import tempfile
31import vitables.utils
32
33import numpy
34from qtpy import QtWidgets
35import tables
36
# https://github.com/numpy/numpy/issues/10990
# genfromtxt(dtype=None) emits a VisibleDeprecationWarning when it reads
# string columns without an explicit unicode encoding; silence it.
import warnings
# NumPy 2.0 removed VisibleDeprecationWarning from the top-level namespace;
# it lives in numpy.exceptions (available since NumPy 1.25). Older releases
# have no numpy.exceptions module, so fall back to the numpy namespace.
warnings.filterwarnings(
    "ignore",
    category=getattr(numpy, "exceptions", numpy).VisibleDeprecationWarning)
40
41
# Module level logger
log = logging.getLogger(__name__)

# Shortcut for the Qt translation function
translate = QtWidgets.QApplication.translate

# Message reported when the CSV file does not contain a homogeneous dataset
TYPE_ERROR = translate(
    'ImportCSV',
    'Please, make sure that you are importing a homogeneous dataset.',
    'CSV file not imported error')
48
49
def getArray(buf):
    """Fill an intermediate ``numpy`` array with data read from the `CSV` file.

    The text received is encoded as UTF-8, stored in a temporary binary file
    and that file is passed to ``numpy.genfromtxt()`` in order to create a
    numpy array.

    The dtypes of the numpy array are determined by the contents of each
    column (``dtype=None``). Multidimensional columns will have string
    datatype.

    Warning: ``encoding='bytes'`` is requested explicitly so the strings in
    the numpy array are bytes (UTF-8 encoded) and not Python 3 strings. That
    value was the implicit default of ``numpy.genfromtxt()`` only before
    NumPy 2.0, and the rest of this module decodes the values it gets.

    :Parameter buf: the data buffer, a list of lines of the CSV file (a
      single line given as a plain string is accepted too)

    :Returns: the ``numpy`` array created by ``numpy.genfromtxt()``
    """

    # Joining works for a list of lines as well as for a single string, and
    # needs just one encode/write call instead of one per line (or per char)
    payload = ''.join(buf).encode('UTF-8')

    with tempfile.TemporaryFile(mode='w+b') as temp_file:
        temp_file.write(payload)
        temp_file.seek(0)
        data = numpy.genfromtxt(temp_file, delimiter=',', dtype=None,
                                encoding='bytes')
    return data
72
73
def tableInfo(input_handler):
    """Return useful information about the `tables.Table` being created.

    :Parameter input_handler: the file handler of the inspected CSV file

    :Returns: a tuple ``(nrows, descr, has_header)`` with the estimated
      number of rows, the table description (a dictionary) and a flag
      telling whether the first line of the file is a header
    """

    # Inspect the CSV file reading its second line
    # (reading the first line is not safe as it could be a header)
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())

    # At the end of the file readline() returns an empty string instead of
    # raising an error, so a missing/blank second line has to be detected
    # explicitly. In that case we assume there is only one line.
    second_line = first_line
    try:
        second_text = input_handler.readline()
        if second_text.strip():
            parsed = getArray(second_text)
            # Lines without data (e.g. comments) produce an empty array
            if parsed.size:
                second_line = parsed
    except IOError:
        # The second line cannot be read. We assume there is only one line
        second_line = first_line

    # Estimate the number of rows of the CSV file
    filesize = os.path.getsize(input_handler.name)
    # Record size = number of elements * element size
    record_size = second_line.size * second_line.itemsize
    # An empty file gives an empty record: avoid dividing by zero
    nrows = filesize / record_size if record_size else 0

    if second_line.dtype.fields is None:
        # second_line is a homogeneous array
        descr, has_header = \
            homogeneousTableInfo(input_handler, first_line, second_line)
    else:
        # second_line is a heterogeneous array
        descr, has_header = \
            heterogeneousTableInfo(input_handler, first_line, second_line)

    return (nrows, descr, has_header)
107
108
def heterogeneousTableInfo(input_handler, first_line, second_line):
    """Return useful information about the `tables.Table` being created.

    The `second_line` array is heterogeneous, i.e. not all fields have the
    same dtype.

    :Parameters:

    - `input_handler`: the file handler of the inspected `CSV` file
    - `first_line`: ``numpy`` array which contains the first line of the `CSV`
      file
    - `second_line`: ``numpy`` array which contains the second line of the
      `CSV` file

    :Returns: a tuple ``(descr, has_header)`` with the table description (a
      dictionary mapping column names to column definitions) and a flag
      telling whether the first line of the file is a header
    """

    # The first line is a header if all its fields are strings
    fl_dtype = first_line.dtype
    has_header = (fl_dtype.fields is None) and (fl_dtype.char in ('S', 'U'))

    # Stuff used for finding out itemsizes of string fields
    sl_dtype = second_line.dtype
    itemsizes = {}
    for field in range(len(sl_dtype)):
        if sl_dtype[field].name.startswith(('str', 'bytes')):
            itemsizes[field] = 0

    # If a dtype is a string, find out its biggest itemsize
    if itemsizes:
        buf_size = 1024 * 1024
        input_handler.seek(0)
        if has_header:
            # Skip the header
            input_handler.readline()
        buf = input_handler.readlines(buf_size)
        while buf:
            # The context manager closes the temporary file even if
            # genfromtxt fails
            with tempfile.TemporaryFile() as temp_file:
                temp_file.write(''.join(buf).encode('UTF-8'))
                for field in itemsizes:
                    temp_file.seek(0)
                    # encoding='bytes' keeps sizes measured in bytes on
                    # every numpy release (see numpy.genfromtxt docs)
                    idata = numpy.genfromtxt(temp_file, delimiter=',',
                                             usecols=(field,), dtype=None,
                                             encoding='bytes')
                    itemsizes[field] = max(itemsizes[field],
                                           idata.dtype.itemsize)
                    del idata
            buf = input_handler.readlines(buf_size)

    descr = {}
    if has_header:
        for i in range(first_line.size):
            # Header fields are usually bytes but may be unicode strings
            name = first_line[i]
            if isinstance(name, bytes):
                name = name.decode('UTF-8')
            name = str(name)
            if i in itemsizes:
                descr[name] = tables.StringCol(itemsizes[i], pos=i)
            else:
                dtype = sl_dtype.fields['f{0}'.format(i)][0]
                descr[name] = tables.Col.from_dtype(dtype, pos=i)
    else:
        # Positions are given explicitly, as in the header case, so the
        # columns keep the order they have in the file (otherwise 'f10'
        # would presumably be sorted before 'f2' -- TODO confirm against
        # the PyTables documentation)
        for i, name in enumerate(sl_dtype.names):
            if i in itemsizes:
                descr[name] = tables.StringCol(itemsizes[i], pos=i)
            else:
                descr[name] = tables.Col.from_dtype(sl_dtype.fields[name][0],
                                                    pos=i)

    return descr, has_header
173
174
def homogeneousTableInfo(input_handler, first_line, second_line):
    """Return useful information about the `tables.Table` being created.

    The `second_line` array is homogeneous, i.e. all fields have the same
    dtype.

    :Parameters:

    - `input_handler`: the file handler of the inspected `CSV` file
    - `first_line`: a ``numpy`` array which contains the first line of the
      `CSV` file
    - `second_line`: a ``numpy`` array which contains the second line of the
      `CSV` file

    :Returns: a tuple ``(descr, has_header)`` with the table description (a
      dictionary mapping column names to column definitions) and a flag
      telling whether the first line of the file is a header
    """

    string_names = ('str', 'bytes')
    data_is_string = second_line.dtype.name.startswith(string_names)

    # Find out if the table has a header or not.
    if data_is_string:
        # Header and data cannot be told apart: the user decides
        has_header = askForHelp(first_line) == 'Header'
    else:
        has_header = first_line.dtype.name.startswith(string_names)

    input_handler.seek(0)
    if has_header:
        # Skip the header
        input_handler.readline()

    # If the fields of the table are strings then find out the biggest itemsize
    itemsize = 0
    if data_is_string:
        buf_size = 1024 * 1024
        buf = input_handler.readlines(buf_size)
        if not buf:
            # If the CSV file contains just one line
            itemsize = first_line.dtype.itemsize
        while buf:
            idata = getArray(buf)
            itemsize = max(itemsize, idata.dtype.itemsize)
            del idata
            buf = input_handler.readlines(buf_size)

    # Iterate over the data fields and make the table description
    # If the CSV file contains just one field then first_line is a
    # scalar array and cannot be iterated so we reshape it
    if first_line.shape == ():
        first_line = first_line.reshape(1,)
    indices = range(first_line.shape[0])

    if has_header:
        # Header fields are usually bytes, but they can also be unicode
        # strings or numbers (when the user declares a numeric first line
        # to be the header)
        names = [value.decode('UTF-8') if isinstance(value, bytes)
                 else str(value)
                 for value in (first_line[i] for i in indices)]
    else:
        names = ['f{0}'.format(i) for i in indices]

    # Positions are always given explicitly so the columns keep the order
    # they have in the file, with or without header
    if data_is_string:
        descr = {name: tables.StringCol(itemsize, pos=i)
                 for i, name in enumerate(names)}
    else:
        descr = {name: tables.Col.from_dtype(second_line.dtype, pos=i)
                 for i, name in enumerate(names)}

    return descr, has_header
245
246
def askForHelp(first_line):
    """Ask user if the first row is a header.

    The fields of the first line are shown, comma separated, as the detailed
    text of the question box.

    :Parameter first_line: a ``numpy`` array which contains the first line of
      the `CSV` file

    :Returns: whatever ``vitables.utils.questionBox`` returns for the button
      chosen by the user
    """

    title = translate('ImportCSV', 'Resolving first line role',
                      'Message box title')
    text = translate('ImportCSV', 'Does the first line of the file contain '
                     'a table header or regular data?', 'Message box text')
    itext = ''

    # A first line with only one field is a scalar array which cannot be
    # iterated, so promote it to one dimension. Bytes are decoded so the
    # user sees `a, b` and not `b'a', b'b'`.
    fields = numpy.atleast_1d(first_line)
    dtext = ', '.join(
        field.decode('UTF-8', 'replace') if isinstance(field, bytes)
        else str(field)
        for field in fields)

    buttons = {
        'Header':
        (translate('ImportCSV', 'Header', 'Button text'),
         QtWidgets.QMessageBox.YesRole),
        'Data':
        (translate('ImportCSV', 'Data', 'Button text'),
         QtWidgets.QMessageBox.NoRole),
    }
    return vitables.utils.questionBox(title, text, itext, dtext, buttons)
274
275
def earrayInfo(input_handler):
    """Return useful information about the `tables.EArray` being created.

    On return the file handler is positioned at the beginning of the file.

    :Parameter input_handler: the file handler of the inspected file

    :Returns: a tuple ``(nrows, atom, array_shape)`` with the estimated
      number of rows, the atom of the array and the shape of the (empty)
      array, whose first dimension is the enlargeable one
    """

    # Inspect the CSV file reading its first line
    # The dtypes are determined by the contents of each column
    # Multidimensional columns will have string datatype
    # Rewind first so it is really the first line what we read, no matter
    # where the caller left the file handler
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())

    # Estimate the number of rows of the file
    filesize = os.path.getsize(input_handler.name)
    record_size = first_line.size * first_line.itemsize
    # An empty file gives an empty record: avoid dividing by zero
    nrows = filesize / record_size if record_size else 0

    if first_line.dtype.name.startswith(('str', 'bytes')):
        # Find out the biggest itemsize
        itemsize = 0
        buf_size = 1024 * 1024
        input_handler.seek(0)
        buf = input_handler.readlines(buf_size)
        while buf:
            idata = getArray(buf)
            itemsize = max(itemsize, idata.dtype.itemsize)
            del idata
            buf = input_handler.readlines(buf_size)
        atom = tables.StringAtom(itemsize)
    else:
        # With compound dtypes this will raise a ValueError
        atom = tables.Atom.from_dtype(first_line.dtype)

    # Get the data shape
    if nrows < 2 or first_line.shape == ():
        # Corner cases: the file only has one row or just one column
        array_shape = (0, )
    else:
        # General case: the file is a MxN array
        array_shape = (0, first_line.shape[0])

    input_handler.seek(0)
    return nrows, atom, array_shape
323
324
def carrayInfo(input_handler):
    """Return useful information about the `tables.CArray` being created.

    On return the file handler is positioned at the beginning of the file.

    :Parameter input_handler: the file handler of the inspected file

    :Returns: a tuple ``(atom, array_shape)`` with the atom and the shape of
      the array
    """

    # Inspect the CSV file reading its first line
    # The dtypes are determined by the contents of each column
    # Multidimensional columns will have string datatype
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())
    is_string = first_line.dtype.name.startswith(('str', 'bytes'))

    # This counting algorithm is faster than looping over lines with
    # fh.readline and incrementing a counter at every step
    lines = 0
    itemsize = 0
    buf_size = 1024 * 1024
    input_handler.seek(0)

    # Count lines and, for string data, find out the biggest itemsize
    buf = input_handler.readlines(buf_size)
    while buf:
        if is_string:
            idata = getArray(buf)
            itemsize = max(itemsize, idata.dtype.itemsize)
            del idata
        lines += len(buf)
        buf = input_handler.readlines(buf_size)

    if itemsize:
        atom = tables.StringAtom(itemsize)
    else:
        atom = tables.Atom.from_dtype(first_line.dtype)

    # Get the data shape
    if first_line.shape == ():
        # Corner case: the file has just one column. It is checked first
        # because a scalar array has no shape[0] to look at, even when
        # the file has only one row
        array_shape = (lines, )
    elif lines == 1:
        # Corner case: the file only has one row
        array_shape = first_line.shape
    else:
        # General case: the file is a MxN array
        array_shape = (lines, first_line.shape[0])

    input_handler.seek(0)
    return atom, array_shape
381
382
def isValidFilepath(filepath):
    """Check the filepath of the destination file.

    The filepath is valid if nothing exists there yet. Otherwise an error
    is logged and the filepath is rejected.

    :Parameter filepath: the filepath where the imported dataset will live

    :Returns: ``True`` if the filepath can be used, ``False`` otherwise
    """

    # Directories have to be checked first: os.path.exists() is also True
    # for them so checking it first would hide the directory message
    if os.path.isdir(filepath):
        log.error(translate(
            'ImportCSV',
            'CSV import failed because destination container is a directory.',
            'A file creation error'))
        return False

    if os.path.exists(filepath):
        log.error(translate(
            'ImportCSV',
            'CSV import failed because destination file already exists.',
            'A file creation error'))
        return False

    return True
404
405
def checkFilenameExtension(filepath):
    """
    Make sure that the CSV file has a filename extension.

    The extension is looked for in the last component of the path: a dot
    followed by at least one character. If it is not found then .csv is
    appended to the filepath.

    :Parameter filepath: the full path of the file

    :Returns: the filepath with the proper extension
    """

    filename = os.path.basename(filepath)
    has_extension = re.search(r'\.(.+)$', filename) is not None
    return filepath if has_extension else filepath + '.csv'
422