1#       Copyright (C) 2008-2019 Vicent Mas. All rights reserved
2#
3#       This program is free software: you can redistribute it and/or modify
4#       it under the terms of the GNU General Public License as published by
5#       the Free Software Foundation, either version 3 of the License, or
6#       (at your option) any later version.
7#
8#       This program is distributed in the hope that it will be useful,
9#       but WITHOUT ANY WARRANTY; without even the implied warranty of
10#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11#       GNU General Public License for more details.
12#
13#       You should have received a copy of the GNU General Public License
14#       along with this program.  If not, see <http://www.gnu.org/licenses/>.
15#
16#       Author:  Vicent Mas - vmas@vitables.org
17
18#
19# Initial draft author: Jorge Ibanez jorge.ibannez@uam.es
20#
21
"""Module that imports `CSV` files into `PyTables` arrays and tables.
23
24The pipeline for importing a `CSV` file is::
25
26    CSV file --> numpy array --> tables.Leaf
27
28The ``numpy`` array is created via `numpy.genfromtxt`. The `tables.Leaf`
29instance is created using the appropriate constructors.
30
Beware that importing big files is a slow process because the whole file
has to be read from disk, transformed and written back to disk again so
there is a lot of disk IO.
34
35Other aspects to take into account:
36
37  - creation of `tables.Array` datasets requires the ``numpy`` array containing
38    the whole `CSV` file to be loaded in memory. If the file is large enough
39    you can run out of memory.
40
41  - creation of `tables.CArray` datasets requires an additional parsing of the
42    whole `CSV` file in order to find out its number of rows (it is a required
43    argument of the `tables.CArray` constructor).
44
  - there is a performance penalty when string dtypes are involved. The
    reason is that string fields usually have variable length so, before the
    ``numpy`` array is created, we need to find out the minimum itemsize
    required for storing those string fields with no loss of data. This step
    requires an additional parsing of the whole `CSV` file.
50
51  - `CSV` files containing N-dimensional fields are always imported with `str`
52    dtype. This is a limitation of `numpy.genfromtxt`.
53"""
54
55import logging
56import os
57import traceback
58
59import numpy
60import tables
61from qtpy import QtCore
62from qtpy import QtGui
63from qtpy import QtWidgets
64
65import vitables.csv.csvutils as csvutils
66import vitables.utils
67
# Markup language used by the docstrings of this module
__docformat__ = 'restructuredtext'

# Module level logger
log = logging.getLogger(__name__)

# Shortcut for the Qt translation function
translate = QtWidgets.QApplication.translate

# Message logged when the content of the CSV file cannot be imported
TYPE_ERROR = translate(
    'ImportCSV',
    'Please, make sure that you are importing a homogeneous dataset.',
    'CSV file not imported error')
76
77
78class ImportCSV(QtCore.QObject):
79    """Provides CSV import capabilities for tables and arrays.
80
81    Some minor flaws: multidimensional fields are not well supported.
82    They are imported as strings.
83    """
84
85    def __init__(self):
86        """The class constructor.
87        """
88
89        super(ImportCSV, self).__init__()
90
91        # Get a reference to the application instance
92        self.vtapp = vitables.utils.getVTApp()
93        if self.vtapp is None:
94            return
95
96        self.vtgui = vitables.utils.getGui()
97        self.dbt_model = self.vtgui.dbs_tree_model
98        self.dbt_view = self.vtgui.dbs_tree_view
99
100        # Add an entry under the File menu
101        self.icons_dictionary = vitables.utils.getIcons()
102        self.addEntry()
103
104    def addEntry(self):
105        """Add the `Import CSV...` entry to the `File` menu.
106        """
107
108        self.import_csv_submenu = QtWidgets.QMenu(
109            translate('ImportCSV', 'I&mport from CSV...',
110                      'File -> Import CSV'))
111        self.import_csv_submenu.setSeparatorsCollapsible(False)
112        self.import_csv_submenu.setIcon(
113            self.icons_dictionary['document-import'])
114        self.import_csv_submenu.setObjectName('import_csv_submenu')
115
116        # Create the actions
117        actions = {}
118        actions['import_csv_table'] = QtWidgets.QAction(
119            translate('ImportCSV', "Import &Table...",
120                      "Import table from CSV file"),
121            self,
122            shortcut=QtGui.QKeySequence.UnknownKey,
123            triggered=self.csv2Table,
124            statusTip=translate(
125                'ImportCSV',
126                "Import Table from plain CSV file",
127                "Status bar text for File -> Import CSV... -> Import Table"))
128        actions['import_csv_table'].setObjectName('import_csv_table')
129
130        actions['import_csv_array'] = QtWidgets.QAction(
131            translate('ImportCSV', "Import &Array...",
132                      "Import array from CSV file"),
133            self,
134            shortcut=QtGui.QKeySequence.UnknownKey,
135            triggered=self.csv2Array,
136            statusTip=translate(
137                'ImportCSV',
138                "Import Array from plain CSV file",
139                "Status bar text for File -> Import CSV... -> Import Array"))
140        actions['import_csv_array'].setObjectName('import_csv_array')
141
142        actions['import_csv_carray'] = QtWidgets.QAction(
143            translate('ImportCSV', "Import &CArray...",
144                      "Import carray from CSV file"),
145            self,
146            shortcut=QtGui.QKeySequence.UnknownKey,
147            triggered=self.csv2CArray,
148            statusTip=translate(
149                'ImportCSV',
150                "Import CArray from plain CSV file",
151                "Status bar text for File -> Import CSV... -> Import CArray"))
152        actions['import_csv_carray'].setObjectName('import_csv_carray')
153
154        actions['import_csv_earray'] = QtWidgets.QAction(
155            translate('ImportCSV', "Import &EArray...",
156                      "Import earray from CSV file"),
157            self,
158            shortcut=QtGui.QKeySequence.UnknownKey,
159            triggered=self.csv2EArray,
160            statusTip=translate(
161                'ImportCSV',
162                "Import EArray from plain CSV file",
163                "Status bar text for File -> Import CSV... -> Import EArray"))
164        actions['import_csv_earray'].setObjectName('import_csv_earray')
165
166        actions['import_csv_separator'] = QtWidgets.QAction(self)
167        actions['import_csv_separator'].setSeparator(True)
168        actions['import_csv_separator'].setObjectName('import_csv_separator')
169
170        # Add actions to the Import submenu
171        keys = ('import_csv_table', 'import_csv_array', 'import_csv_carray',
172                'import_csv_earray')
173        vitables.utils.addActions(self.import_csv_submenu, keys, actions)
174
175        # Add submenu to file menu before the Close File action
176        vitables.utils.insertInMenu(
177            self.vtgui.file_menu, self.import_csv_submenu, 'fileClose')
178        sep = actions['import_csv_separator']
179        vitables.utils.insertInMenu(self.vtgui.file_menu, sep, 'fileClose')
180
181        # Add submenu to file context menu before the Close File action
182        vitables.utils.insertInMenu(self.vtgui.view_cm,
183                                    self.import_csv_submenu, 'fileClose')
184        vitables.utils.insertInMenu(self.vtgui.view_cm, sep, 'fileClose')
185
186    def createDestFile(self, filepath):
187        """Create the `PyTables` file where the `CSV` file will be imported.
188
189        :Parameter filepath: the `PyTables` file filepath
190        """
191
192        dbdoc = None
193        try:
194            dirname, filename = os.path.split(filepath)
195            root = os.path.splitext(filename)[0]
196            dest_filepath = vitables.utils.forwardPath(os.path.join(dirname,
197                                                                    '{0}.h5'.format(root)))
198            if csvutils.isValidFilepath(dest_filepath):
199                dbdoc = self.dbt_model.createDBDoc(dest_filepath)
200        except:
201            log.error(
202                translate('ImportCSV', 'Import failed because destination '
203                          'file cannot be created.',
204                          'A file creation error'))
205            vitables.utils.formatExceptionInfo()
206
207        return dbdoc
208
209    def csvFilepath(self, leaf_kind):
210        """Get the filepath of the source `CSV` file.
211
212        :Parameter leaf_kind: the kind of container where data will be stored
213        """
214
215        # Call the file selector (and, if needed, customise it)
216        filepath, working_dir = vitables.utils.getFilepath(
217            self.vtgui, translate(
218                'ImportCSV', 'Importing CSV file into {0}',
219                'Caption of the Import from CSV dialog').format(leaf_kind),
220            dfilter=translate('ImportCSV', """CSV Files (*.csv);;"""
221                              """All Files (*)""",
222                              'Filter for the Import from CSV dialog'),
223            settings={'accept_mode': QtWidgets.QFileDialog.AcceptOpen,
224                      'file_mode': QtWidgets.QFileDialog.ExistingFile,
225                      'history': self.vtapp.file_selector_history,
226                      'label': translate('ImportCSV', 'Import',
227                                         'Accept button text for QFileDialog')}
228        )
229
230        if not filepath:
231            # The user has canceled the dialog
232            return
233
234        # Update the history of the file selector widget
235        self.vtapp.updateFSHistory(working_dir)
236
237        return filepath
238
239    def updateTree(self, filepath):
240        """Update the databases tree once the `CSV` file has been imported.
241
242        When the destination h5 file is created and added to the databases tree
243        it has no nodes. Once the `CSV` file has been imported into a
244        `PyTables` container we update the representation of the h5 file in the
245        tree so that users can see that the file has a leaf. Eventually, the
246        root node of the imported file is selected so that users can locate it
247        immediately.
248
249        :Parameter filepath: the filepath of the destination h5 file
250        """
251
252        for row, child in enumerate(self.dbt_model.root.children):
253            if child.filepath == filepath:
254                index = self.dbt_model.index(row, 0, QtCore.QModelIndex())
255                self.dbt_model.lazyAddChildren(index)
256                self.dbt_view.setCurrentIndex(index)
257
258    def csv2Table(self):
259        """Import a plain `CSV` file into a `tables.Array` object.
260        """
261
262        kind = 'Table'
263        filepath = self.csvFilepath(kind)
264        if filepath is None:
265            return
266
267        # Import the CSV content
268        try:
269            QtWidgets.qApp.processEvents()
270            QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
271            input_handler = open(filepath, 'r+')
272            try:
273                (nrows, descr, has_header) = csvutils.tableInfo(input_handler)
274            except Exception as inst:
275                print(traceback.format_exc())
276
277            # Create the dataset
278            dbdoc = self.createDestFile(filepath)
279            if dbdoc is None:
280                return
281            io_filters = tables.Filters(complevel=9, complib='lzo')
282            dataset_name = "imported_{0}".format(kind)
283            atitle = \
284                'Source CSV file {0}'.format(os.path.basename(filepath))
285            dataset = dbdoc.h5file.create_table(
286                '/', dataset_name, descr, title=atitle, filters=io_filters,
287                expectedrows=nrows)
288            # Fill the dataset in a memory efficient way
289            input_handler.seek(0)
290            if has_header:
291                # Skip the header line
292                input_handler.readline()
293            chunk_size = 10000
294            buf_size = chunk_size * dataset.rowsize
295            read_fh = input_handler.readlines
296            buf = read_fh(buf_size)
297            while buf:
298                idata = csvutils.getArray(buf)
299                # Append data to the dataset
300                dataset.append(idata)
301                dataset.flush()
302                del idata
303                buf = read_fh(buf_size)
304            dbdoc.h5file.flush()
305            self.updateTree(dbdoc.filepath)
306        except:
307            vitables.utils.formatExceptionInfo()
308        finally:
309            QtWidgets.qApp.restoreOverrideCursor()
310            input_handler.close()
311
312    def csv2EArray(self):
313        """Import a plain `CSV` file into a `tables.EArray` object.
314
315        This is a slot method. See :meth:`addEntry` method for details.
316        """
317
318        kind = 'EArray'
319        filepath = self.csvFilepath(kind)
320        if filepath is None:
321            return
322
323        # Import the CSV content
324        try:
325            QtWidgets.qApp.processEvents()
326            QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
327            chunk_size = 10000
328            input_handler = open(filepath, 'r+')
329            (nrows, atom, array_shape) = csvutils.earrayInfo(input_handler)
330
331            # Create the dataset
332            dbdoc = self.createDestFile(filepath)
333            if dbdoc is None:
334                return
335            io_filters = tables.Filters(complevel=9, complib='lzo')
336            dataset_name = "imported_{0}".format(kind)
337            atitle = 'Source CSV file {0}'.format(os.path.basename(filepath))
338            dataset = dbdoc.h5file.create_earray(
339                '/', dataset_name, atom, array_shape, title=atitle,
340                filters=io_filters, expectedrows=nrows)
341
342            # Fill the dataset in a memory effcient way
343            input_handler.seek(0)
344            chunk_size = 10000
345            buf_size = chunk_size * dataset.rowsize
346            read_fh = input_handler.readlines
347            buf = read_fh(buf_size)
348            while buf:
349                idata = csvutils.getArray(buf)
350                # Append data to the dataset
351                dataset.append(idata)
352                dataset.flush()
353                del idata
354                buf = read_fh(buf_size)
355            dbdoc.h5file.flush()
356            self.updateTree(dbdoc.filepath)
357        except ValueError:
358            log.error(TYPE_ERROR)
359        except:
360            vitables.utils.formatExceptionInfo()
361        finally:
362            QtWidgets.qApp.restoreOverrideCursor()
363            input_handler.close()
364
365    def csv2CArray(self):
366        """Import a plain `CSV` file into a `tables.CArray` object.
367
368        This is a slot method. See :meth:`addEntry` method for details.
369        """
370
371        kind = 'CArray'
372        filepath = self.csvFilepath(kind)
373        if filepath is None:
374            return
375
376        # Import the CSV content
377        try:
378            QtWidgets.qApp.processEvents()
379            QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
380            chunk_size = 10000
381            input_handler = open(filepath, 'r+')
382            (atom, array_shape) = csvutils.carrayInfo(input_handler)
383
384            # Create the dataset
385            dbdoc = self.createDestFile(filepath)
386            if dbdoc is None:
387                return
388            io_filters = tables.Filters(complevel=9, complib='lzo')
389            dataset_name = "imported_{0}".format(kind)
390            atitle = 'Source CSV file {0}'.format(os.path.basename(filepath))
391            dataset = dbdoc.h5file.create_carray(
392                '/', dataset_name, atom, array_shape, title=atitle,
393                filters=io_filters)
394
395            # Fill the dataset in a memory effcient way
396            input_handler.seek(0)
397            chunk_size = 10000
398            buf_size = chunk_size * dataset.rowsize
399            read_fh = input_handler.readlines
400            buf = read_fh(buf_size)
401            start = 0
402            while buf:
403                idata = csvutils.getArray(buf)
404                stop = start + idata.shape[0]
405                # Append data to the dataset
406                dataset[start:stop] = idata
407                dataset.flush()
408                del idata
409                start = stop
410                buf = read_fh(buf_size)
411            dbdoc.h5file.flush()
412            self.updateTree(dbdoc.filepath)
413        except ValueError:
414            log.error(TYPE_ERROR)
415        except:
416            vitables.utils.formatExceptionInfo()
417        finally:
418            QtWidgets.qApp.restoreOverrideCursor()
419            input_handler.close()
420
421    def csv2Array(self):
422        """Import a plain `CSV` file into a `tables.Array` object.
423
424        This is a slot method. See :meth:`addEntry` method for details.
425        """
426
427        kind = 'Array'
428        filepath = self.csvFilepath(kind)
429        if filepath is None:
430            return
431
432        # Import the CSV content
433        try:
434            QtWidgets.qApp.processEvents()
435            QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
436            # The dtypes are determined by the contents of each column
437            # Multidimensional columns will have string datatype
438            data = numpy.genfromtxt(filepath, delimiter=',', dtype=None)
439        except TypeError:
440            data = None
441            dbdoc = None
442            log.error(TYPE_ERROR)
443        else:
444            try:
445                # Create the array
446                dbdoc = self.createDestFile(filepath)
447                if dbdoc is None:
448                    return
449                array_name = "imported_{0}".format(kind)
450                title = 'Imported from CSV file {0}'.\
451                    format(os.path.basename(filepath))
452                dbdoc.h5file.create_array('/', array_name, data, title=title)
453                dbdoc.h5file.flush()
454                self.updateTree(dbdoc.filepath)
455            except TypeError:
456                log.error(TYPE_ERROR)
457            except tables.NodeError:
458                vitables.utils.formatExceptionInfo()
459        finally:
460            del data
461            QtWidgets.qApp.restoreOverrideCursor()
462