# Copyright (C) 2008-2019 Vicent Mas. All rights reserved
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Author: Vicent Mas - vmas@vitables.org

#
# Initial draft author: Jorge Ibanez jorge.ibannez@uam.es
#

"""Module that imports `CSV` files into `PyTables` arrays and tables.

The pipeline for importing a `CSV` file is::

    CSV file --> numpy array --> tables.Leaf

The ``numpy`` array is created via `numpy.genfromtxt`. The `tables.Leaf`
instance is created using the appropriate constructors.

Beware that importing big files is a slow process because the whole file
has to be read from disk, transformed and written back to disk again so
there is a lot of disk IO.

Other aspects to take into account:

  - creation of `tables.Array` datasets requires the ``numpy`` array
    containing the whole `CSV` file to be loaded in memory. If the file is
    large enough you can run out of memory.

  - creation of `tables.CArray` datasets requires an additional parsing of
    the whole `CSV` file in order to find out its number of rows (it is a
    required argument of the `tables.CArray` constructor).

  - there is a performance penalty when string dtypes are involved. The
    reason is that string fields usually have variable length so, before
    the ``numpy`` array is created, we need to find out the minimum
    itemsize required for storing those string fields with no loss of
    data. This step requires an additional parsing of the whole `CSV` file.

  - `CSV` files containing N-dimensional fields are always imported with
    `str` dtype. This is a limitation of `numpy.genfromtxt`.
"""

import logging
import os
import traceback

import numpy
import tables
from qtpy import QtCore
from qtpy import QtGui
from qtpy import QtWidgets

import vitables.csv.csvutils as csvutils
import vitables.utils

__docformat__ = 'restructuredtext'

translate = QtWidgets.QApplication.translate
TYPE_ERROR = translate(
    'ImportCSV', 'Please, make sure that you are importing a '
    'homogeneous dataset.', 'CSV file not imported error')

log = logging.getLogger(__name__)


class ImportCSV(QtCore.QObject):
    """Provides CSV import capabilities for tables and arrays.

    Some minor flaws: multidimensional fields are not well supported.
    They are imported as strings.
    """

    # Number of dataset rows used to size the read buffer when a CSV file
    # is imported chunk by chunk (see `_iterChunks`).
    _CHUNK_ROWS = 10000

    def __init__(self):
        """The class constructor.

        If no application instance is available the object is left without
        GUI references and no menu entry is added.
        """

        super(ImportCSV, self).__init__()

        # Get a reference to the application instance
        self.vtapp = vitables.utils.getVTApp()
        if self.vtapp is None:
            return

        self.vtgui = vitables.utils.getGui()
        self.dbt_model = self.vtgui.dbs_tree_model
        self.dbt_view = self.vtgui.dbs_tree_view

        # Add an entry under the File menu
        self.icons_dictionary = vitables.utils.getIcons()
        self.addEntry()

    def addEntry(self):
        """Add the `Import CSV...` entry to the `File` menu.

        A submenu with one action per kind of destination dataset (Table,
        Array, CArray, EArray) is created and inserted, followed by a
        separator, before the `fileClose` action of both the `File` menu
        and the tree view context menu.
        """

        self.import_csv_submenu = QtWidgets.QMenu(
            translate('ImportCSV', 'I&mport from CSV...',
                      'File -> Import CSV'))
        self.import_csv_submenu.setSeparatorsCollapsible(False)
        self.import_csv_submenu.setIcon(
            self.icons_dictionary['document-import'])
        self.import_csv_submenu.setObjectName('import_csv_submenu')

        # Create the actions. Every translate() call is written out with
        # literal strings rather than generated in a loop.
        actions = {}
        actions['import_csv_table'] = QtWidgets.QAction(
            translate('ImportCSV', "Import &Table...",
                      "Import table from CSV file"),
            self,
            shortcut=QtGui.QKeySequence.UnknownKey,
            triggered=self.csv2Table,
            statusTip=translate(
                'ImportCSV',
                "Import Table from plain CSV file",
                "Status bar text for File -> Import CSV... -> Import Table"))
        actions['import_csv_table'].setObjectName('import_csv_table')

        actions['import_csv_array'] = QtWidgets.QAction(
            translate('ImportCSV', "Import &Array...",
                      "Import array from CSV file"),
            self,
            shortcut=QtGui.QKeySequence.UnknownKey,
            triggered=self.csv2Array,
            statusTip=translate(
                'ImportCSV',
                "Import Array from plain CSV file",
                "Status bar text for File -> Import CSV... -> Import Array"))
        actions['import_csv_array'].setObjectName('import_csv_array')

        actions['import_csv_carray'] = QtWidgets.QAction(
            translate('ImportCSV', "Import &CArray...",
                      "Import carray from CSV file"),
            self,
            shortcut=QtGui.QKeySequence.UnknownKey,
            triggered=self.csv2CArray,
            statusTip=translate(
                'ImportCSV',
                "Import CArray from plain CSV file",
                "Status bar text for File -> Import CSV... -> Import CArray"))
        actions['import_csv_carray'].setObjectName('import_csv_carray')

        actions['import_csv_earray'] = QtWidgets.QAction(
            translate('ImportCSV', "Import &EArray...",
                      "Import earray from CSV file"),
            self,
            shortcut=QtGui.QKeySequence.UnknownKey,
            triggered=self.csv2EArray,
            statusTip=translate(
                'ImportCSV',
                "Import EArray from plain CSV file",
                "Status bar text for File -> Import CSV... -> Import EArray"))
        actions['import_csv_earray'].setObjectName('import_csv_earray')

        actions['import_csv_separator'] = QtWidgets.QAction(self)
        actions['import_csv_separator'].setSeparator(True)
        actions['import_csv_separator'].setObjectName('import_csv_separator')

        # Add actions to the Import submenu
        keys = ('import_csv_table', 'import_csv_array', 'import_csv_carray',
                'import_csv_earray')
        vitables.utils.addActions(self.import_csv_submenu, keys, actions)

        # Add submenu to file menu before the Close File action
        vitables.utils.insertInMenu(
            self.vtgui.file_menu, self.import_csv_submenu, 'fileClose')
        sep = actions['import_csv_separator']
        vitables.utils.insertInMenu(self.vtgui.file_menu, sep, 'fileClose')

        # Add submenu to file context menu before the Close File action
        vitables.utils.insertInMenu(self.vtgui.view_cm,
                                    self.import_csv_submenu, 'fileClose')
        vitables.utils.insertInMenu(self.vtgui.view_cm, sep, 'fileClose')

    def createDestFile(self, filepath):
        """Create the `PyTables` file where the `CSV` file will be imported.

        The destination file lives in the same directory as `filepath` and
        has the same base name with the ``.h5`` extension.

        :Parameter filepath: the filepath of the source `CSV` file

        :Returns: the object returned by the tree model `createDBDoc`
            method, or None if the destination path is not valid or the
            file cannot be created
        """

        dbdoc = None
        try:
            dirname, filename = os.path.split(filepath)
            root = os.path.splitext(filename)[0]
            dest_filepath = vitables.utils.forwardPath(
                os.path.join(dirname, '{0}.h5'.format(root)))
            if csvutils.isValidFilepath(dest_filepath):
                dbdoc = self.dbt_model.createDBDoc(dest_filepath)
        except Exception:
            # Report the failure; callers detect it through the None result
            log.error(
                translate('ImportCSV', 'Import failed because destination '
                          'file cannot be created.',
                          'A file creation error'))
            vitables.utils.formatExceptionInfo()

        return dbdoc

    def csvFilepath(self, leaf_kind):
        """Get the filepath of the source `CSV` file.

        :Parameter leaf_kind: the kind of container where data will be stored

        :Returns: the selected filepath, or None if the dialog is canceled
        """

        # Call the file selector (and, if needed, customise it)
        filepath, working_dir = vitables.utils.getFilepath(
            self.vtgui, translate(
                'ImportCSV', 'Importing CSV file into {0}',
                'Caption of the Import from CSV dialog').format(leaf_kind),
            dfilter=translate('ImportCSV', """CSV Files (*.csv);;"""
                              """All Files (*)""",
                              'Filter for the Import from CSV dialog'),
            settings={'accept_mode': QtWidgets.QFileDialog.AcceptOpen,
                      'file_mode': QtWidgets.QFileDialog.ExistingFile,
                      'history': self.vtapp.file_selector_history,
                      'label': translate('ImportCSV', 'Import',
                                         'Accept button text for QFileDialog')}
        )

        if not filepath:
            # The user has canceled the dialog
            return None

        # Update the history of the file selector widget
        self.vtapp.updateFSHistory(working_dir)

        return filepath

    def updateTree(self, filepath):
        """Update the databases tree once the `CSV` file has been imported.

        When the destination h5 file is created and added to the databases
        tree it has no nodes. Once the `CSV` file has been imported into a
        `PyTables` container we update the representation of the h5 file in
        the tree so that users can see that the file has a leaf. Eventually,
        the root node of the imported file is selected so that users can
        locate it immediately.

        :Parameter filepath: the filepath of the destination h5 file
        """

        for row, child in enumerate(self.dbt_model.root.children):
            if child.filepath == filepath:
                index = self.dbt_model.index(row, 0, QtCore.QModelIndex())
                self.dbt_model.lazyAddChildren(index)
                self.dbt_view.setCurrentIndex(index)

    def _iterChunks(self, input_handler, dataset):
        """Yield the rest of an open `CSV` file as ``numpy`` arrays.

        The file is read from its current position in buffers of whole
        lines, each one converted with `csvutils.getArray`, so the whole
        file is never held in memory at once.

        :Parameters:

            - `input_handler`: the file object of the source `CSV` file
            - `dataset`: the destination dataset; its `rowsize` is used to
              size the read buffer
        """

        # readlines() takes the buffer size as a hint and returns an empty
        # list at end of file, which ends the loop
        buf_size = self._CHUNK_ROWS * dataset.rowsize
        buf = input_handler.readlines(buf_size)
        while buf:
            yield csvutils.getArray(buf)
            buf = input_handler.readlines(buf_size)

    def csv2Table(self):
        """Import a plain `CSV` file into a `tables.Table` object.

        This is a slot method. See :meth:`addEntry` method for details.
        """

        kind = 'Table'
        filepath = self.csvFilepath(kind)
        if filepath is None:
            return

        # Import the CSV content
        QtWidgets.qApp.processEvents()
        QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
        try:
            # The source file is only read, so read-only mode is enough and
            # the context manager closes it even if the import fails
            with open(filepath, 'r') as input_handler:
                try:
                    (nrows, descr, has_header) = \
                        csvutils.tableInfo(input_handler)
                except Exception:
                    # Without the table description nothing can be
                    # imported: give up before creating the destination
                    log.error(traceback.format_exc())
                    return

                # Create the dataset
                dbdoc = self.createDestFile(filepath)
                if dbdoc is None:
                    return
                io_filters = tables.Filters(complevel=9, complib='lzo')
                dataset_name = "imported_{0}".format(kind)
                atitle = \
                    'Source CSV file {0}'.format(os.path.basename(filepath))
                dataset = dbdoc.h5file.create_table(
                    '/', dataset_name, descr, title=atitle,
                    filters=io_filters, expectedrows=nrows)

                # Fill the dataset in a memory efficient way
                input_handler.seek(0)
                if has_header:
                    # Skip the header line
                    input_handler.readline()
                for idata in self._iterChunks(input_handler, dataset):
                    # Append data to the dataset
                    dataset.append(idata)
                    dataset.flush()
                    # Release the chunk before the next one is read
                    del idata
                dbdoc.h5file.flush()
                self.updateTree(dbdoc.filepath)
        except Exception:
            vitables.utils.formatExceptionInfo()
        finally:
            QtWidgets.qApp.restoreOverrideCursor()

    def csv2EArray(self):
        """Import a plain `CSV` file into a `tables.EArray` object.

        This is a slot method. See :meth:`addEntry` method for details.
        """

        kind = 'EArray'
        filepath = self.csvFilepath(kind)
        if filepath is None:
            return

        # Import the CSV content
        QtWidgets.qApp.processEvents()
        QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
        try:
            # The source file is only read, so read-only mode is enough and
            # the context manager closes it even if the import fails
            with open(filepath, 'r') as input_handler:
                (nrows, atom, array_shape) = \
                    csvutils.earrayInfo(input_handler)

                # Create the dataset
                dbdoc = self.createDestFile(filepath)
                if dbdoc is None:
                    return
                io_filters = tables.Filters(complevel=9, complib='lzo')
                dataset_name = "imported_{0}".format(kind)
                atitle = \
                    'Source CSV file {0}'.format(os.path.basename(filepath))
                dataset = dbdoc.h5file.create_earray(
                    '/', dataset_name, atom, array_shape, title=atitle,
                    filters=io_filters, expectedrows=nrows)

                # Fill the dataset in a memory efficient way
                input_handler.seek(0)
                for idata in self._iterChunks(input_handler, dataset):
                    # Append data to the dataset
                    dataset.append(idata)
                    dataset.flush()
                    # Release the chunk before the next one is read
                    del idata
                dbdoc.h5file.flush()
                self.updateTree(dbdoc.filepath)
        except ValueError:
            log.error(TYPE_ERROR)
        except Exception:
            vitables.utils.formatExceptionInfo()
        finally:
            QtWidgets.qApp.restoreOverrideCursor()

    def csv2CArray(self):
        """Import a plain `CSV` file into a `tables.CArray` object.

        This is a slot method. See :meth:`addEntry` method for details.
        """

        kind = 'CArray'
        filepath = self.csvFilepath(kind)
        if filepath is None:
            return

        # Import the CSV content
        QtWidgets.qApp.processEvents()
        QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
        try:
            # The source file is only read, so read-only mode is enough and
            # the context manager closes it even if the import fails
            with open(filepath, 'r') as input_handler:
                (atom, array_shape) = csvutils.carrayInfo(input_handler)

                # Create the dataset
                dbdoc = self.createDestFile(filepath)
                if dbdoc is None:
                    return
                io_filters = tables.Filters(complevel=9, complib='lzo')
                dataset_name = "imported_{0}".format(kind)
                atitle = \
                    'Source CSV file {0}'.format(os.path.basename(filepath))
                dataset = dbdoc.h5file.create_carray(
                    '/', dataset_name, atom, array_shape, title=atitle,
                    filters=io_filters)

                # Fill the dataset in a memory efficient way. The dataset
                # is created with its full shape, so each chunk is written
                # into the next slice of rows instead of being appended.
                input_handler.seek(0)
                start = 0
                for idata in self._iterChunks(input_handler, dataset):
                    stop = start + idata.shape[0]
                    dataset[start:stop] = idata
                    dataset.flush()
                    # Release the chunk before the next one is read
                    del idata
                    start = stop
                dbdoc.h5file.flush()
                self.updateTree(dbdoc.filepath)
        except ValueError:
            log.error(TYPE_ERROR)
        except Exception:
            vitables.utils.formatExceptionInfo()
        finally:
            QtWidgets.qApp.restoreOverrideCursor()

    def csv2Array(self):
        """Import a plain `CSV` file into a `tables.Array` object.

        The whole `CSV` file is loaded into a single ``numpy`` array before
        the `tables.Array` is created.

        This is a slot method. See :meth:`addEntry` method for details.
        """

        kind = 'Array'
        filepath = self.csvFilepath(kind)
        if filepath is None:
            return

        # Import the CSV content
        QtWidgets.qApp.processEvents()
        QtWidgets.qApp.setOverrideCursor(QtCore.Qt.WaitCursor)
        try:
            # The dtypes are determined by the contents of each column
            # Multidimensional columns will have string datatype
            data = numpy.genfromtxt(filepath, delimiter=',', dtype=None)

            # Create the array
            dbdoc = self.createDestFile(filepath)
            if dbdoc is None:
                return
            array_name = "imported_{0}".format(kind)
            title = 'Imported from CSV file {0}'.\
                format(os.path.basename(filepath))
            dbdoc.h5file.create_array('/', array_name, data, title=title)
            dbdoc.h5file.flush()
            self.updateTree(dbdoc.filepath)
        except (TypeError, ValueError):
            # ValueError is reported the same way as in the other csv2*
            # slots instead of escaping from this one
            log.error(TYPE_ERROR)
        except tables.NodeError:
            vitables.utils.formatExceptionInfo()
        finally:
            # Always undo the wait cursor, whatever the outcome
            QtWidgets.qApp.restoreOverrideCursor()