# Copyright (C) 2008-2019 Vicent Mas. All rights reserved
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Author: Vicent Mas - vmas@vitables.org

#
# Plugin initial draft author: Jorge Ibanez jorge.ibannez@uam.es
#

"""Convenience functions for the import_csv.py module.
"""

__docformat__ = 'restructuredtext'

import logging
import os
import re
import tempfile
import warnings

import numpy
from qtpy import QtWidgets
import tables

import vitables.utils

# https://github.com/numpy/numpy/issues/10990
# The warning class lives in numpy.exceptions on recent numpy releases and
# is no longer exposed in the main namespace on numpy >= 2.0, so look it up
# in both places.
try:
    from numpy.exceptions import VisibleDeprecationWarning as _VisibleDepWarning
except ImportError:
    _VisibleDepWarning = numpy.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=_VisibleDepWarning)


translate = QtWidgets.QApplication.translate
TYPE_ERROR = translate(
    'ImportCSV', 'Please, make sure that you are importing a '
    'homogeneous dataset.', 'CSV file not imported error')

log = logging.getLogger(__name__)

# Size hint (in bytes) passed to readlines() when scanning a file by chunks
_BUF_SIZE = 1024 * 1024

# Prefixes of numpy dtype names which identify string-like dtypes
_STRING_DTYPE_PREFIXES = ('str', 'bytes')


def _isStringDtype(dtype):
    """Tell whether a ``numpy`` dtype is a string-like (str/bytes) dtype.

    :Parameter dtype: the ``numpy`` dtype being inspected
    """

    return dtype.name.startswith(_STRING_DTYPE_PREFIXES)


def _toText(value):
    """Return `value` as a Python string.

    Bytes (including ``numpy.bytes_``) are decoded as UTF-8, anything else
    is converted with `str`.

    :Parameter value: a field read from the `CSV` file
    """

    if isinstance(value, bytes):
        return value.decode('UTF-8')
    return str(value)


def _scanChunks(input_handler, measure_itemsize):
    """Read the rest of the file by chunks, counting its lines.

    Reading starts at the current position of the file handler.

    :Parameters:

    - `input_handler`: the file handler of the inspected `CSV` file
    - `measure_itemsize`: if True every chunk is converted into a ``numpy``
      array and the biggest dtype itemsize found is reported

    :Returns: a tuple (number of lines read, biggest itemsize). The
      itemsize is 0 if it was not measured or if no line was read.
    """

    lines = 0
    itemsize = 0
    buf = input_handler.readlines(_BUF_SIZE)
    while buf:
        lines += len(buf)
        if measure_itemsize:
            itemsize = max(itemsize, getArray(buf).dtype.itemsize)
        buf = input_handler.readlines(_BUF_SIZE)
    return lines, itemsize


def getArray(buf):
    """Fill an intermediate ``numpy`` array with data read from the `CSV` file.

    The lines read from the CSV file are stored in a temporary file which is
    passed to numpy.genfromtxt() in order to create a numpy array.

    The dtypes of the numpy array are determined by the contents of each
    column. Multidimensional columns will have string datatype.

    Warning: the temporary file is written in binary mode so lines are stored
    as bytearrays (encoded as UTF-8). It means that strings in the numpy array
    will also be bytes with UTF-8 encoding and not Python 3 strings.

    :Parameter buf: the data buffer. It is a list of lines of the CSV file
      (a single line passed as a string is accepted too)
    """

    with tempfile.TemporaryFile(mode='w+b') as temp_file:
        # Joining works both for a list of lines and for a single string
        temp_file.write(''.join(buf).encode('UTF-8'))
        temp_file.seek(0)
        # The encoding is requested explicitly so that strings are returned
        # as bytes regardless of the default used by the installed numpy
        data = numpy.genfromtxt(temp_file, delimiter=',', dtype=None,
                                encoding='bytes')
    return data


def tableInfo(input_handler):
    """Return useful information about the `tables.Table` being created.

    :Parameter input_handler: the file handler of the inspected CSV file

    :Returns: a tuple (estimated number of rows, table description
      dictionary, whether the file has a header)
    """

    # Inspect the CSV file reading its second line
    # (reading the first line is not safe as it could be a header)
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())

    # At the end of the file readline() returns an empty string instead of
    # raising an error, so the missing second line is detected explicitly.
    # In that case we assume there is only one line.
    second_line = first_line
    try:
        raw_second_line = input_handler.readline()
        if raw_second_line.strip():
            second_line = getArray(raw_second_line)
    except IOError:
        # The second line cannot be read. We assume there is only one line
        second_line = first_line

    # Estimate the number of rows of the CSV file
    filesize = os.path.getsize(input_handler.name)
    # Record size = number of elements * element size
    record_size = second_line.size * second_line.itemsize
    nrows = filesize / record_size

    if second_line.dtype.fields is None:
        # second_line is a homogeneous array
        descr, has_header = \
            homogeneousTableInfo(input_handler, first_line, second_line)
    else:
        # second_line is a heterogeneous array
        descr, has_header = \
            heterogeneousTableInfo(input_handler, first_line, second_line)

    return (nrows, descr, has_header)


def heterogeneousTableInfo(input_handler, first_line, second_line):
    """Return useful information about the `tables.Table` being created.

    The `second_line` array is heterogeneous, i.e. not all fields have the
    same dtype.

    :Parameters:

    - `input_handler`: the file handler of the inspected `CSV` file
    - `first_line`: ``numpy`` array which contains the first line of the `CSV`
      file
    - `second_line`: ``numpy`` array which contains the second line of the
      `CSV` file

    :Returns: a tuple (table description dictionary, whether the file has a
      header)
    """

    # The first line is a header if it is a plain array of strings
    fl_dtype = first_line.dtype
    has_header = (fl_dtype.fields is None) and (fl_dtype.char in ('S', 'U'))

    # Stuff used for finding out itemsizes of string fields
    sl_dtype = second_line.dtype
    itemsizes = {}
    for field in range(len(sl_dtype)):
        if _isStringDtype(sl_dtype[field]):
            itemsizes[field] = 0

    # If a dtype is a string, find out its biggest itemsize
    if itemsizes:
        input_handler.seek(0)
        if has_header:
            # Skip the header
            input_handler.readline()
        buf = input_handler.readlines(_BUF_SIZE)
        while buf:
            # The context manager closes the temporary file even if
            # genfromtxt fails
            with tempfile.TemporaryFile(mode='w+b') as temp_file:
                temp_file.write(''.join(buf).encode('UTF-8'))
                for field in itemsizes:
                    temp_file.seek(0)
                    idata = numpy.genfromtxt(temp_file, delimiter=',',
                                             usecols=(field,), dtype=None,
                                             encoding='bytes')
                    itemsizes[field] = \
                        max(itemsizes[field], idata.dtype.itemsize)
            buf = input_handler.readlines(_BUF_SIZE)

    # Column names come from the header, if any, or from the automatic
    # field names of the numpy array
    if has_header:
        names = [_toText(first_line[i]) for i in range(first_line.size)]
        dtypes = [sl_dtype.fields['f{0}'.format(i)][0]
                  for i in range(first_line.size)]
    else:
        names = list(sl_dtype.names)
        dtypes = [sl_dtype[i] for i in range(len(names))]

    # The position is always given explicitly so that the columns of the
    # table keep the order of the fields in the CSV file
    descr = {}
    for i, name in enumerate(names):
        if i in itemsizes:
            descr[name] = tables.StringCol(itemsizes[i], pos=i)
        else:
            descr[name] = tables.Col.from_dtype(dtypes[i], pos=i)

    return descr, has_header


def homogeneousTableInfo(input_handler, first_line, second_line):
    """Return useful information about the `tables.Table` being created.

    The `second_line` array is homogeneous, i.e. all fields have the same
    dtype.

    :Parameters:

    - `input_handler`: the file handler of the inspected `CSV` file
    - `first_line`: a ``numpy`` array which contains the first line of the
      `CSV` file
    - `second_line`: a ``numpy`` array which contains the second line of the
      `CSV` file

    :Returns: a tuple (table description dictionary, whether the file has a
      header)
    """

    # Find out if the table has a header or not.
    data_is_string = _isStringDtype(second_line.dtype)
    if data_is_string:
        # Header and data cannot be told apart so the user has to decide
        has_header = askForHelp(first_line) == 'Header'
    else:
        has_header = _isStringDtype(first_line.dtype)

    input_handler.seek(0)
    if has_header:
        # Skip the header
        input_handler.readline()

    # If the fields of the table are strings then find out the biggest itemsize
    itemsize = 0
    if data_is_string:
        lines, itemsize = _scanChunks(input_handler, True)
        if not lines:
            # If the CSV file contains just one line
            itemsize = first_line.dtype.itemsize

    # Iterate over the data fields and make the table description
    # If the CSV file contains just one field then first_line is a
    # scalar array and cannot be iterated so we reshape it
    if first_line.shape == ():
        first_line = first_line.reshape(1,)
    indices = range(first_line.shape[0])

    if has_header:
        names = [_toText(first_line[i]) for i in indices]
    else:
        names = ['f{0}'.format(i) for i in indices]

    # The position is always given explicitly so that automatically named
    # columns are not sorted alphabetically (f10 before f2)
    descr = {}
    for i, name in enumerate(names):
        if data_is_string:
            descr[name] = tables.StringCol(itemsize, pos=i)
        else:
            descr[name] = tables.Col.from_dtype(second_line.dtype, pos=i)

    return descr, has_header


def askForHelp(first_line):
    """Ask user if the first row is a header.

    :Parameter first_line: a ``numpy`` array which contains the first line of
      the `CSV` file

    :Returns: whatever `vitables.utils.questionBox` returns. Callers in this
      module compare it with the ``'Header'`` key of the buttons dictionary.
    """

    title = translate('ImportCSV', 'Resolving first line role',
                      'Message box title')
    text = translate('ImportCSV', 'Does the first line of the file contain '
                     'a table header or regular data?', 'Message box text')
    itext = ''
    # If first_line has only one field it is a scalar array which cannot be
    # iterated, so it is promoted to a 1-D array. Fields are converted into
    # text so that the user does not see bytes literals.
    dtext = ', '.join(_toText(field) for field in numpy.atleast_1d(first_line))
    buttons = {
        'Header':
            (translate('ImportCSV', 'Header', 'Button text'),
             QtWidgets.QMessageBox.YesRole),
        'Data':
            (translate('ImportCSV', 'Data', 'Button text'),
             QtWidgets.QMessageBox.NoRole),
    }
    return vitables.utils.questionBox(title, text, itext, dtext, buttons)


def earrayInfo(input_handler):
    """Return useful information about the `tables.EArray` being created.

    :Parameter input_handler: the file handler of the inspected file

    :Returns: a tuple (estimated number of rows, atom, array shape). The
      first dimension of the shape is 0.
    """

    # Inspect the CSV file reading its first line
    # The dtypes are determined by the contents of each column
    # Multidimensional columns will have string datatype
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())

    # Estimate the number of rows of the file
    filesize = os.path.getsize(input_handler.name)
    record_size = first_line.size * first_line.itemsize
    nrows = filesize / record_size

    if _isStringDtype(first_line.dtype):
        # Find out the biggest itemsize
        input_handler.seek(0)
        _, itemsize = _scanChunks(input_handler, True)
        atom = tables.StringAtom(itemsize)
    else:
        # With compound dtypes this will raise a ValueError
        atom = tables.Atom.from_dtype(first_line.dtype)

    # Get the data shape
    if nrows < 2 or first_line.shape == ():
        # Corner cases: the file only has one row or just one column
        array_shape = (0, )
    else:
        # General case: the file is a MxN array
        array_shape = (0, first_line.shape[0])

    input_handler.seek(0)
    return nrows, atom, array_shape


def carrayInfo(input_handler):
    """Return useful information about the `tables.CArray` being created.

    :Parameter input_handler: the file handler of the inspected file

    :Returns: a tuple (atom, array shape)
    """

    # Inspect the CSV file reading its first line
    # The dtypes are determined by the contents of each column
    # Multidimensional columns will have string datatype
    input_handler.seek(0)
    first_line = getArray(input_handler.readline())

    # Count lines reading the file by chunks. This counting algorithm is
    # faster than looping over lines with fh.readline and incrementing a
    # counter at every step. If the data are strings the biggest itemsize
    # is found out too.
    input_handler.seek(0)
    lines, itemsize = \
        _scanChunks(input_handler, _isStringDtype(first_line.dtype))

    if itemsize:
        atom = tables.StringAtom(itemsize)
    else:
        atom = tables.Atom.from_dtype(first_line.dtype)

    # Get the data shape
    if lines == 1:
        # Corner case: the file only has one row
        array_shape = first_line.shape
    elif first_line.shape == ():
        # Corner case: the file has just one column
        array_shape = (lines, )
    else:
        # General case: the file is a MxN array
        array_shape = (lines, first_line.shape[0])

    input_handler.seek(0)
    return atom, array_shape


def isValidFilepath(filepath):
    """Check the filepath of the destination file.

    The filepath is valid if nothing exists at that location yet. Failures
    are reported via the module logger.

    :Parameter filepath: the filepath where the imported dataset will live

    :Returns: True if the filepath is valid, False otherwise
    """

    # Directories are checked first because os.path.exists() is also True
    # for them
    if os.path.isdir(filepath):
        log.error(translate(
            'ImportCSV',
            'CSV import failed because destination container is a directory.',
            'A file creation error'))
        return False

    if os.path.exists(filepath):
        log.error(translate(
            'ImportCSV',
            'CSV import failed because destination file already exists.',
            'A file creation error'))
        return False

    return True


def checkFilenameExtension(filepath):
    """
    Check the filename extension of the CSV file.

    If the filename has no extension this method adds .csv
    extension to it.

    :Parameter filepath: the full path of the file

    :Returns: the filepath with the proper extension
    """

    # Only the basename is inspected so dots in directory names are ignored
    if re.search(r'\.(.+)$', os.path.basename(filepath)):
        return filepath
    return filepath + '.csv'