1# -*- coding: utf-8 -*- 2 3######################################################################## 4# 5# License: BSD 6# Created: September 4, 2002 7# Author: Francesc Alted - faltet@pytables.com 8# 9# $Id$ 10# 11######################################################################## 12 13"""Here is defined the Table class.""" 14 15import math 16import operator 17import os.path 18import sys 19import warnings 20 21from functools import reduce as _reduce 22from time import time 23 24import numpy 25import numexpr 26 27from . import tableextension 28from .lrucacheextension import ObjectCache, NumCache 29from .atom import Atom 30from .conditions import compile_condition 31from numexpr.necompiler import getType as numexpr_getType, double 32from numexpr.expressions import functions as numexpr_functions 33from .flavor import flavor_of, array_as_internal, internal_to_flavor 34from .utils import is_idx, lazyattr, SizeType, NailedDict as CacheDict 35from .leaf import Leaf 36from .description import (IsDescription, Description, Col, descr_from_dtype) 37from .exceptions import ( 38 NodeError, HDF5ExtError, PerformanceWarning, OldIndexWarning, 39 NoSuchNodeError) 40from .utilsextension import get_nested_field 41 42from .path import join_path, split_path 43from .index import ( 44 OldIndex, default_index_filters, default_auto_index, Index, IndexesDescG, 45 IndexesTableG) 46 47 48profile = False 49# profile = True # Uncomment for profiling 50if profile: 51 from .utils import show_stats 52 53 54# 2.2: Added support for complex types. Introduced in version 0.9. 55# 2.2.1: Added suport for time types. 56# 2.3: Changed the indexes naming schema. 57# 2.4: Changed indexes naming schema (again). 58# 2.5: Added the FIELD_%d_FILL attributes. 59# 2.6: Added the FLAVOR attribute (optional). 60# 2.7: Numeric and numarray flavors are gone. 
obversion = "2.7"  # The Table VERSION number


try:
    # int_, long_ are only available in numexpr >= 2.1
    from numexpr.necompiler import int_, long_
except ImportError:
    int_ = int
    long_ = int

# Maps NumPy types to the types used by Numexpr.
_nxtype_from_nptype = {
    numpy.bool_: bool,
    numpy.int8: int_,
    numpy.int16: int_,
    numpy.int32: int_,
    numpy.int64: long_,
    numpy.uint8: int_,
    numpy.uint16: int_,
    numpy.uint32: long_,
    numpy.uint64: long_,
    numpy.float32: float,
    numpy.float64: double,
    numpy.complex64: complex,
    numpy.complex128: complex,
    numpy.bytes_: bytes,
}

_nxtype_from_nptype[numpy.str_] = str

# Extended-precision scalar types are platform/build dependent, so only
# register the mapping when the running NumPy actually provides them.
if hasattr(numpy, 'float16'):
    _nxtype_from_nptype[numpy.float16] = float    # XXX: check
if hasattr(numpy, 'float96'):
    _nxtype_from_nptype[numpy.float96] = double   # XXX: check
if hasattr(numpy, 'float128'):
    _nxtype_from_nptype[numpy.float128] = double  # XXX: check
# BUG FIX: the attribute name was misspelled 'complec192', so the
# complex192 mapping was never registered on platforms that have it.
if hasattr(numpy, 'complex192'):
    _nxtype_from_nptype[numpy.complex192] = complex  # XXX: check
if hasattr(numpy, 'complex256'):
    _nxtype_from_nptype[numpy.complex256] = complex  # XXX: check


# The NumPy scalar type corresponding to `SizeType`.
_npsizetype = numpy.array(SizeType(0)).dtype.type


def _index_name_of(node):
    """Return the name of the hidden group holding `node`'s indexes."""
    return '_i_%s' % node._v_name


def _index_pathname_of(node):
    """Return the full pathname of the index group associated with `node`."""
    nodeParentPath = split_path(node._v_pathname)[0]
    return join_path(nodeParentPath, _index_name_of(node))


def _index_pathname_of_column(table, colpathname):
    """Return the full pathname of the index of column `colpathname`."""
    return join_path(_index_pathname_of(table), colpathname)


# The next are versions that work with just paths (i.e. we don't need
# a node instance for using them, which can be critical in certain
# situations)
def _index_name_of_(nodeName):
    """Path-only variant of `_index_name_of`: works from a node name."""
    return '_i_%s' % nodeName


def _index_pathname_of_(nodePath):
    """Path-only variant of `_index_pathname_of`: works from a node path."""
    nodeParentPath, nodeName = split_path(nodePath)
    return join_path(nodeParentPath, _index_name_of_(nodeName))


def _index_pathname_of_column_(tablePath, colpathname):
    """Path-only variant of `_index_pathname_of_column`."""
    return join_path(_index_pathname_of_(tablePath), colpathname)


def restorecache(self):
    """(Re)build the chunk and iter-sequence caches and mark them clean."""
    # Define a cache for sparse table reads
    params = self._v_file.params
    chunksize = self._v_chunkshape[0]
    # NOTE(review): true division here makes `nslots` a float; NumCache
    # presumably coerces it — confirm against lrucacheextension.
    nslots = params['TABLE_MAX_SIZE'] / (chunksize * self._v_dtype.itemsize)
    self._chunkcache = NumCache((nslots, chunksize), self._v_dtype,
                                'table chunk cache')
    self._seqcache = ObjectCache(params['ITERSEQ_MAX_SLOTS'],
                                 params['ITERSEQ_MAX_SIZE'],
                                 'Iter sequence cache')
    self._dirtycache = False


def _table__where_indexed(self, compiled, condition, condvars,
                          start, stop, step):
    """Resolve an indexed `condition` over the [start, stop, step] range.

    Returns either an iterator over matching rows (when the result is
    fully determined by the sequence cache or the index finds no
    candidates) or a boolean chunkmap flagging the table chunks that may
    contain matches, for the caller to scan.
    """
    if profile:
        tref = time()
    if profile:
        show_stats("Entering table_whereIndexed", tref)
    self._use_index = True
    # Clean the table caches for indexed queries if needed
    if self._dirtycache:
        restorecache(self)

    # Get the values in expression that are not columns
    values = []
    for key, value in condvars.items():
        if isinstance(value, numpy.ndarray):
            values.append((key, value.item()))
    # Build a key for the sequence cache
    seqkey = (condition, tuple(values), (start, stop, step))
    # Do a lookup in sequential cache for this query
    nslot = self._seqcache.getslot(seqkey)
    if nslot >= 0:
        # Get the row sequence from the cache
        seq = self._seqcache.getitem(nslot)
        if len(seq) == 0:
            return iter([])
        # seq is a list.
        seq = numpy.array(seq, dtype='int64')
        # Correct the ranges in cached sequence: cached coordinates are
        # for the whole table, so filter them down to the query range.
        if (start, stop, step) != (0, self.nrows, 1):
            seq = seq[(seq >= start) & (
                seq < stop) & ((seq - start) % step == 0)]
        return self.itersequence(seq)
    else:
        # No luck. self._seqcache will be populated
        # in the iterator if possible. (Row._finish_riterator)
        self._seqcache_key = seqkey

    # Compute the chunkmap for every index in indexed expression
    idxexprs = compiled.index_expressions
    strexpr = compiled.string_expression
    cmvars = {}
    tcoords = 0
    for i, idxexpr in enumerate(idxexprs):
        var, ops, lims = idxexpr
        col = condvars[var]
        index = col.index
        assert index is not None, "the chosen column is not indexed"
        assert not index.dirty, "the chosen column has a dirty index"

        # Get the number of rows that the indexed condition yields.
        range_ = index.get_lookup_range(ops, lims)
        ncoords = index.search(range_)
        tcoords += ncoords
        if index.reduction == 1 and ncoords == 0:
            # No values from index condition, thus the chunkmap should be empty
            nrowsinchunk = self.chunkshape[0]
            nchunks = int(math.ceil(float(self.nrows) / nrowsinchunk))
            chunkmap = numpy.zeros(shape=nchunks, dtype="bool")
        else:
            # Get the chunkmap from the index
            chunkmap = index.get_chunkmap()
        # Assign the chunkmap to the cmvars dictionary; "e%d" names match
        # the placeholders in `compiled.string_expression`.
        cmvars["e%d" % i] = chunkmap

    # NOTE: `index` here is the one bound by the last loop iteration.
    if index.reduction == 1 and tcoords == 0:
        # No candidates found in any indexed expression component, so leave now
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    # Compute the final chunkmap by combining the per-index chunkmaps.
    chunkmap = numexpr.evaluate(strexpr, cmvars)
    if not chunkmap.any():
        # The chunkmap is all False, so the result is empty
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    if profile:
        show_stats("Exiting table_whereIndexed", tref)
    return chunkmap
def create_indexes_table(table):
    """Create the hidden group that holds all indexes of `table`."""
    itgroup = IndexesTableG(
        table._v_parent, _index_name_of(table),
        "Indexes container for table " + table._v_pathname, new=True)
    return itgroup


def create_indexes_descr(igroup, dname, iname, filters):
    """Create a nested index group for sub-description `dname` in `igroup`."""
    idgroup = IndexesDescG(
        igroup, iname,
        "Indexes container for sub-description " + dname,
        filters=filters, new=True)
    return idgroup


def _column__create_index(self, optlevel, kind, filters, tmp_dir,
                          blocksizes, verbose):
    """Create an index for this column (bound as a Column method).

    Returns the number of rows fed into the freshly created index.
    Raises ValueError if the column is already indexed, TypeError for
    complex or multidimensional columns, and NotImplementedError for
    64-bit unsigned integer columns.
    """
    name = self.name
    table = self.table
    dtype = self.dtype
    descr = self.descr
    index = self.index
    get_node = table._v_file._get_node

    # Warn if the index already exists
    if index:
        raise ValueError("%s for column '%s' already exists. If you want to "
                         "re-create it, please, try with reindex() method "
                         "better" % (str(index), str(self.pathname)))

    # Check that the datatype is indexable.
    if dtype.str[1:] == 'u8':
        raise NotImplementedError(
            "indexing 64-bit unsigned integer columns "
            "is not supported yet, sorry")
    if dtype.kind == 'c':
        raise TypeError("complex columns can not be indexed")
    if dtype.shape != ():
        raise TypeError("multidimensional columns can not be indexed")

    # Get the indexes group for table, and if not exists, create it
    try:
        itgroup = get_node(_index_pathname_of(table))
    except NoSuchNodeError:
        itgroup = create_indexes_table(table)

    # Create the necessary intermediate groups for descriptors, walking
    # the column pathname one component at a time.
    idgroup = itgroup
    dname = ""
    pathname = descr._v_pathname
    if pathname != '':
        inames = pathname.split('/')
        for iname in inames:
            if dname == '':
                dname = iname
            else:
                dname += '/' + iname
            try:
                idgroup = get_node('%s/%s' % (itgroup._v_pathname, dname))
            except NoSuchNodeError:
                idgroup = create_indexes_descr(idgroup, dname, iname, filters)

    # Create the atom
    assert dtype.shape == ()
    atom = Atom.from_dtype(numpy.dtype((dtype, (0,))))

    # Protection on tables larger than the expected rows (perhaps the
    # user forgot to pass this parameter to the Table constructor?)
    expectedrows = table._v_expectedrows
    if table.nrows > expectedrows:
        expectedrows = table.nrows

    # Create the index itself
    index = Index(
        idgroup, name, atom=atom,
        title="Index for %s column" % name,
        kind=kind,
        optlevel=optlevel,
        filters=filters,
        tmp_dir=tmp_dir,
        expectedrows=expectedrows,
        byteorder=table.byteorder,
        blocksizes=blocksizes)

    table._set_column_indexing(self.pathname, True)

    # Feed the index with values

    # Add rows to the index if necessary
    if table.nrows > 0:
        indexedrows = table._add_rows_to_index(
            self.pathname, 0, table.nrows, lastrow=True, update=False)
    else:
        indexedrows = 0
    index.dirty = False
    table._indexedrows = indexedrows
    table._unsaved_indexedrows = table.nrows - indexedrows

    # Optimize the index that has been already filled-up
    index.optimize(verbose=verbose)

    # We cannot do a flush here because when reindexing during a
    # flush, the indexes are created anew, and that creates a nested
    # call to flush().
    # table.flush()

    return indexedrows


class _ColIndexes(dict):
    """Provides a nice representation of column indexes."""

    def __repr__(self):
        """Gives a detailed Description column representation."""

        rep = [' \"%s\": %s' % (k, self[k]) for k in self.keys()]
        return '{\n %s}' % (',\n '.join(rep))


class Table(tableextension.Table, Leaf):
    """This class represents heterogeneous datasets in an HDF5 file.

    Tables are leaves (see the Leaf class in :ref:`LeafClassDescr`) whose data
    consists of a unidimensional sequence of *rows*, where each row contains
    one or more *fields*.
Fields have an associated unique *name* and 357 *position*, with the first field having position 0. All rows have the same 358 fields, which are arranged in *columns*. 359 360 Fields can have any type supported by the Col class (see 361 :ref:`ColClassDescr`) and its descendants, which support multidimensional 362 data. Moreover, a field can be *nested* (to an arbitrary depth), meaning 363 that it includes further fields inside. A field named x inside a nested 364 field a in a table can be accessed as the field a/x (its *path name*) from 365 the table. 366 367 The structure of a table is declared by its description, which is made 368 available in the Table.description attribute (see :class:`Table`). 369 370 This class provides new methods to read, write and search table data 371 efficiently. It also provides special Python methods to allow accessing 372 the table as a normal sequence or array (with extended slicing supported). 373 374 PyTables supports *in-kernel* searches working simultaneously on several 375 columns using complex conditions. These are faster than selections using 376 Python expressions. See the :meth:`Table.where` method for more 377 information on in-kernel searches. 378 379 Non-nested columns can be *indexed*. Searching an indexed column can be 380 several times faster than searching a non-nested one. Search methods 381 automatically take advantage of indexing where available. 382 383 When iterating a table, an object from the Row (see :ref:`RowClassDescr`) 384 class is used. This object allows to read and write data one row at a 385 time, as well as to perform queries which are not supported by in-kernel 386 syntax (at a much lower speed, of course). 387 388 Objects of this class support access to individual columns via *natural 389 naming* through the :attr:`Table.cols` accessor. Nested columns are 390 mapped to Cols instances, and non-nested ones to Column instances. 
391 See the Column class in :ref:`ColumnClassDescr` for examples of this 392 feature. 393 394 Parameters 395 ---------- 396 parentnode 397 The parent :class:`Group` object. 398 399 .. versionchanged:: 3.0 400 Renamed from *parentNode* to *parentnode*. 401 402 name : str 403 The name of this node in its parent group. 404 description 405 An IsDescription subclass or a dictionary where the keys are the field 406 names, and the values the type definitions. In addition, a pure NumPy 407 dtype is accepted. If None, the table metadata is read from disk, 408 else, it's taken from previous parameters. 409 title 410 Sets a TITLE attribute on the HDF5 table entity. 411 filters : Filters 412 An instance of the Filters class that provides information about the 413 desired I/O filters to be applied during the life of this object. 414 expectedrows 415 A user estimate about the number of rows that will be on table. If not 416 provided, the default value is ``EXPECTED_ROWS_TABLE`` (see 417 ``tables/parameters.py``). If you plan to save bigger tables, try 418 providing a guess; this will optimize the HDF5 B-Tree creation and 419 management process time and memory used. 420 chunkshape 421 The shape of the data chunk to be read or written as a single HDF5 I/O 422 operation. The filters are applied to those chunks of data. Its rank 423 for tables has to be 1. If ``None``, a sensible value is calculated 424 based on the `expectedrows` parameter (which is recommended). 425 byteorder 426 The byteorder of the data *on-disk*, specified as 'little' or 'big'. If 427 this is not specified, the byteorder is that of the platform, unless 428 you passed a recarray as the `description`, in which case the recarray 429 byteorder will be chosen. 430 track_times 431 Whether time data associated with the leaf are recorded (object 432 access time, raw data modification time, metadata change time, object 433 birth time); default True. 
Semantics of these times depend on their 434 implementation in the HDF5 library: refer to documentation of the 435 H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata 436 change time) is implemented. 437 438 .. versionadded:: 3.4.3 439 440 Notes 441 ----- 442 The instance variables below are provided in addition to those in 443 Leaf (see :ref:`LeafClassDescr`). Please note that there are several 444 col* dictionaries to ease retrieving information about a column 445 directly by its path name, avoiding the need to walk through 446 Table.description or Table.cols. 447 448 449 .. rubric:: Table attributes 450 451 .. attribute:: coldescrs 452 453 Maps the name of a column to its Col description (see 454 :ref:`ColClassDescr`). 455 456 .. attribute:: coldflts 457 458 Maps the name of a column to its default value. 459 460 .. attribute:: coldtypes 461 462 Maps the name of a column to its NumPy data type. 463 464 .. attribute:: colindexed 465 466 Is the column which name is used as a key indexed? 467 468 .. attribute:: colinstances 469 470 Maps the name of a column to its Column (see 471 :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`) 472 instance. 473 474 .. attribute:: colnames 475 476 A list containing the names of *top-level* columns in the table. 477 478 .. attribute:: colpathnames 479 480 A list containing the pathnames of *bottom-level* columns in 481 the table. 482 483 These are the leaf columns obtained when walking the table 484 description left-to-right, bottom-first. Columns inside a 485 nested column have slashes (/) separating name components in 486 their pathname. 487 488 .. attribute:: cols 489 490 A Cols instance that provides *natural naming* access to 491 non-nested (Column, see :ref:`ColumnClassDescr`) and nested 492 (Cols, see :ref:`ColsClassDescr`) columns. 493 494 .. attribute:: coltypes 495 496 Maps the name of a column to its PyTables data type. 497 498 .. 
attribute:: description 499 500 A Description instance (see :ref:`DescriptionClassDescr`) 501 reflecting the structure of the table. 502 503 .. attribute:: extdim 504 505 The index of the enlargeable dimension (always 0 for tables). 506 507 .. attribute:: indexed 508 509 Does this table have any indexed columns? 510 511 .. attribute:: nrows 512 513 The current number of rows in the table. 514 515 """ 516 517 # Class identifier. 518 _c_classid = 'TABLE' 519 520 # Properties 521 # ~~~~~~~~~~ 522 @lazyattr 523 def row(self): 524 """The associated Row instance (see :ref:`RowClassDescr`).""" 525 526 return tableextension.Row(self) 527 528 @lazyattr 529 def dtype(self): 530 """The NumPy ``dtype`` that most closely matches this table.""" 531 532 return self.description._v_dtype 533 534 # Read-only shorthands 535 # ```````````````````` 536 537 @property 538 def shape(self): 539 """The shape of this table.""" 540 return (self.nrows,) 541 542 @property 543 def rowsize(self): 544 """The size in bytes of each row in the table.""" 545 return self.description._v_dtype.itemsize 546 547 @property 548 def size_in_memory(self): 549 """The size of this table's data in bytes when it is fully loaded into 550 memory. This may be used in combination with size_on_disk to calculate 551 the compression ratio of the data.""" 552 return self.nrows * self.rowsize 553 554 # Lazy attributes 555 # ``````````````` 556 @lazyattr 557 def _v_iobuf(self): 558 """A buffer for doing I/O.""" 559 560 return self._get_container(self.nrowsinbuf) 561 562 @lazyattr 563 def _v_wdflts(self): 564 """The defaults for writing in recarray format.""" 565 566 # First, do a check to see whether we need to set default values 567 # different from 0 or not. 568 for coldflt in self.coldflts.values(): 569 if isinstance(coldflt, numpy.ndarray) or coldflt: 570 break 571 else: 572 # No default different from 0 found. Returning None. 
573 return None 574 wdflts = self._get_container(1) 575 for colname, coldflt in self.coldflts.items(): 576 ra = get_nested_field(wdflts, colname) 577 ra[:] = coldflt 578 return wdflts 579 580 @lazyattr 581 def _colunaligned(self): 582 """The pathnames of unaligned, *unidimensional* columns.""" 583 colunaligned, rarr = [], self._get_container(0) 584 for colpathname in self.colpathnames: 585 carr = get_nested_field(rarr, colpathname) 586 if not carr.flags.aligned and carr.ndim == 1: 587 colunaligned.append(colpathname) 588 return frozenset(colunaligned) 589 590 # Index-related properties 591 # ```````````````````````` 592 593 # **************** WARNING! *********************** 594 # This function can be called during the destruction time of a table 595 # so measures have been taken so that it doesn't have to revive 596 # another node (which can fool the LRU cache). The solution devised 597 # has been to add a cache for autoindex (Table._autoindex), populate 598 # it in creation time of the cache (which is a safe period) and then 599 # update the cache whenever it changes. 600 # This solves the error when running test_indexes.py ManyNodesTestCase. 601 # F. Alted 2007-04-20 602 # ************************************************** 603 604 @property 605 def autoindex(self): 606 """Automatically keep column indexes up to date? 607 608 Setting this value states whether existing indexes should be 609 automatically updated after an append operation or recomputed 610 after an index-invalidating operation (i.e. removal and 611 modification of rows). The default is true. 612 613 This value gets into effect whenever a column is altered. If you 614 don't have automatic indexing activated and you want to do an an 615 immediate update use `Table.flush_rows_to_index()`; for an immediate 616 reindexing of invalidated indexes, use `Table.reindex_dirty()`. 617 618 This value is persistent. 619 620 .. versionchanged:: 3.0 621 The *autoIndex* property has been renamed into *autoindex*. 
622 """ 623 624 if self._autoindex is None: 625 try: 626 indexgroup = self._v_file._get_node(_index_pathname_of(self)) 627 except NoSuchNodeError: 628 self._autoindex = default_auto_index # update cache 629 return self._autoindex 630 else: 631 self._autoindex = indexgroup.auto # update cache 632 return self._autoindex 633 else: 634 # The value is in cache, return it 635 return self._autoindex 636 637 @autoindex.setter 638 def autoindex(self, auto): 639 auto = bool(auto) 640 try: 641 indexgroup = self._v_file._get_node(_index_pathname_of(self)) 642 except NoSuchNodeError: 643 indexgroup = create_indexes_table(self) 644 indexgroup.auto = auto 645 # Update the cache in table instance as well 646 self._autoindex = auto 647 648 @property 649 def indexedcolpathnames(self): 650 """List of pathnames of indexed columns in the table.""" 651 return [_colpname for _colpname in self.colpathnames if self.colindexed[_colpname]] 652 653 @property 654 def colindexes(self): 655 """A dictionary with the indexes of the indexed columns.""" 656 return _ColIndexes( 657 ((_colpname, self.cols._f_col(_colpname).index) 658 for _colpname in self.colpathnames 659 if self.colindexed[_colpname])) 660 661 @property 662 def _dirtyindexes(self): 663 """Whether some index in table is dirty.""" 664 return self._condition_cache._nailcount > 0 665 666 # Other methods 667 # ~~~~~~~~~~~~~ 668 def __init__(self, parentnode, name, 669 description=None, title="", filters=None, 670 expectedrows=None, chunkshape=None, 671 byteorder=None, _log=True, track_times=True): 672 673 self._v_new = new = description is not None 674 """Is this the first time the node has been created?""" 675 self._v_new_title = title 676 """New title for this node.""" 677 self._v_new_filters = filters 678 """New filter properties for this node.""" 679 self.extdim = 0 # Tables only have one dimension currently 680 """The index of the enlargeable dimension (always 0 for tables).""" 681 self._v_recarray = None 682 """A structured array to 
be stored in the table.""" 683 self._rabyteorder = None 684 """The computed byteorder of the self._v_recarray.""" 685 if expectedrows is None: 686 expectedrows = parentnode._v_file.params['EXPECTED_ROWS_TABLE'] 687 self._v_expectedrows = expectedrows 688 """The expected number of rows to be stored in the table.""" 689 self.nrows = SizeType(0) 690 """The current number of rows in the table.""" 691 self.description = None 692 """A Description instance (see :ref:`DescriptionClassDescr`) 693 reflecting the structure of the table.""" 694 self._time64colnames = [] 695 """The names of ``Time64`` columns.""" 696 self._strcolnames = [] 697 """The names of ``String`` columns.""" 698 self._colenums = {} 699 """Maps the name of an enumerated column to its ``Enum`` instance.""" 700 self._v_chunkshape = None 701 """Private storage for the `chunkshape` property of the leaf.""" 702 703 self.indexed = False 704 """Does this table have any indexed columns?""" 705 self._indexedrows = 0 706 """Number of rows indexed in disk.""" 707 self._unsaved_indexedrows = 0 708 """Number of rows indexed in memory but still not in disk.""" 709 self._listoldindexes = [] 710 """The list of columns with old indexes.""" 711 self._autoindex = None 712 """Private variable that caches the value for autoindex.""" 713 714 self.colnames = [] 715 """A list containing the names of *top-level* columns in the table.""" 716 self.colpathnames = [] 717 """A list containing the pathnames of *bottom-level* columns in the 718 table. 719 720 These are the leaf columns obtained when walking the 721 table description left-to-right, bottom-first. Columns inside a 722 nested column have slashes (/) separating name components in 723 their pathname. 
724 """ 725 self.colinstances = {} 726 """Maps the name of a column to its Column (see 727 :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`) 728 instance.""" 729 self.coldescrs = {} 730 """Maps the name of a column to its Col description (see 731 :ref:`ColClassDescr`).""" 732 self.coltypes = {} 733 """Maps the name of a column to its PyTables data type.""" 734 self.coldtypes = {} 735 """Maps the name of a column to its NumPy data type.""" 736 self.coldflts = {} 737 """Maps the name of a column to its default value.""" 738 self.colindexed = {} 739 """Is the column which name is used as a key indexed?""" 740 741 self._use_index = False 742 """Whether an index can be used or not in a search. Boolean.""" 743 self._where_condition = None 744 """Condition function and argument list for selection of values.""" 745 self._seqcache_key = None 746 """The key under which to save a query's results (list of row indexes) 747 or None to not save.""" 748 max_slots = parentnode._v_file.params['COND_CACHE_SLOTS'] 749 self._condition_cache = CacheDict(max_slots) 750 """Cache of already compiled conditions.""" 751 self._exprvars_cache = {} 752 """Cache of variables participating in numexpr expressions.""" 753 self._enabled_indexing_in_queries = True 754 """Is indexing enabled in queries? *Use only for testing.*""" 755 self._empty_array_cache = {} 756 """Cache of empty arrays.""" 757 758 self._v_dtype = None 759 """The NumPy datatype fopr this table.""" 760 self.cols = None 761 """ 762 A Cols instance that provides *natural naming* access to non-nested 763 (Column, see :ref:`ColumnClassDescr`) and nested (Cols, see 764 :ref:`ColsClassDescr`) columns. 765 """ 766 self._dirtycache = True 767 """Whether the data caches are dirty or not. Initially set to yes.""" 768 self._descflavor = None 769 """Temporarily keeps the flavor of a description with data.""" 770 771 # Initialize this object in case is a new Table 772 773 # Try purely descriptive description objects. 
774 if new and isinstance(description, dict): 775 # Dictionary case 776 self.description = Description(description, ptparams=parentnode._v_file.params) 777 elif new and (type(description) == type(IsDescription) 778 and issubclass(description, IsDescription)): 779 # IsDescription subclass case 780 descr = description() 781 self.description = Description(descr.columns, ptparams=parentnode._v_file.params) 782 elif new and isinstance(description, Description): 783 # It is a Description instance already 784 self.description = description 785 786 # No description yet? 787 if new and self.description is None: 788 # Try NumPy dtype instances 789 if isinstance(description, numpy.dtype): 790 self.description, self._rabyteorder = \ 791 descr_from_dtype(description, ptparams=parentnode._v_file.params) 792 793 # No description yet? 794 if new and self.description is None: 795 # Try structured array description objects. 796 try: 797 self._descflavor = flavor = flavor_of(description) 798 except TypeError: # probably not an array 799 pass 800 else: 801 if flavor == 'python': 802 nparray = numpy.rec.array(description) 803 else: 804 nparray = array_as_internal(description, flavor) 805 self.nrows = nrows = SizeType(nparray.size) 806 # If `self._v_recarray` is set, it will be used as the 807 # initial buffer. 808 if nrows > 0: 809 self._v_recarray = nparray 810 self.description, self._rabyteorder = \ 811 descr_from_dtype(nparray.dtype, ptparams=parentnode._v_file.params) 812 813 # No description yet? 
814 if new and self.description is None: 815 raise TypeError( 816 "the ``description`` argument is not of a supported type: " 817 "``IsDescription`` subclass, ``Description`` instance, " 818 "dictionary, or structured array") 819 820 # Check the chunkshape parameter 821 if new and chunkshape is not None: 822 if isinstance(chunkshape, (int, numpy.integer)): 823 chunkshape = (chunkshape,) 824 try: 825 chunkshape = tuple(chunkshape) 826 except TypeError: 827 raise TypeError( 828 "`chunkshape` parameter must be an integer or sequence " 829 "and you passed a %s" % type(chunkshape)) 830 if len(chunkshape) != 1: 831 raise ValueError("`chunkshape` rank (length) must be 1: %r" 832 % (chunkshape,)) 833 self._v_chunkshape = tuple(SizeType(s) for s in chunkshape) 834 835 super(Table, self).__init__(parentnode, name, new, filters, 836 byteorder, _log, track_times) 837 838 def _g_post_init_hook(self): 839 # We are putting here the index-related issues 840 # as well as filling general info for table 841 # This is needed because we need first the index objects created 842 843 # First, get back the flavor of input data (if any) for 844 # `Leaf._g_post_init_hook()`. 845 self._flavor, self._descflavor = self._descflavor, None 846 super(Table, self)._g_post_init_hook() 847 848 # Create a cols accessor. 849 self.cols = Cols(self, self.description) 850 851 # Place the `Cols` and `Column` objects into `self.colinstances`. 852 colinstances, cols = self.colinstances, self.cols 853 for colpathname in self.description._v_pathnames: 854 colinstances[colpathname] = cols._g_col(colpathname) 855 856 if self._v_new: 857 # Columns are never indexed on creation. 858 self.colindexed = dict((cpn, False) for cpn in self.colpathnames) 859 return 860 861 # The following code is only for opened tables. 862 863 # Do the indexes group exist? 
864 indexesgrouppath = _index_pathname_of(self) 865 igroup = indexesgrouppath in self._v_file 866 oldindexes = False 867 for colobj in self.description._f_walk(type="Col"): 868 colname = colobj._v_pathname 869 # Is this column indexed? 870 if igroup: 871 indexname = _index_pathname_of_column(self, colname) 872 indexed = indexname in self._v_file 873 self.colindexed[colname] = indexed 874 if indexed: 875 column = self.cols._g_col(colname) 876 indexobj = column.index 877 if isinstance(indexobj, OldIndex): 878 indexed = False # Not a vaild index 879 oldindexes = True 880 self._listoldindexes.append(colname) 881 else: 882 # Tell the condition cache about columns with dirty 883 # indexes. 884 if indexobj.dirty: 885 self._condition_cache.nail() 886 else: 887 indexed = False 888 self.colindexed[colname] = False 889 if indexed: 890 self.indexed = True 891 892 if oldindexes: # this should only appear under 2.x Pro 893 warnings.warn( 894 "table ``%s`` has column indexes with PyTables 1.x format. " 895 "Unfortunately, this format is not supported in " 896 "PyTables 2.x series. Note that you can use the " 897 "``ptrepack`` utility in order to recreate the indexes. " 898 "The 1.x indexed columns found are: %s" % 899 (self._v_pathname, self._listoldindexes), 900 OldIndexWarning) 901 902 # It does not matter to which column 'indexobj' belongs, 903 # since their respective index objects share 904 # the same number of elements. 
905 if self.indexed: 906 self._indexedrows = indexobj.nelements 907 self._unsaved_indexedrows = self.nrows - self._indexedrows 908 # Put the autoindex value in a cache variable 909 self._autoindex = self.autoindex 910 911 def _calc_nrowsinbuf(self): 912 """Calculate the number of rows that fits on a PyTables buffer.""" 913 914 params = self._v_file.params 915 # Compute the nrowsinbuf 916 rowsize = self.rowsize 917 buffersize = params['IO_BUFFER_SIZE'] 918 if rowsize != 0: 919 nrowsinbuf = buffersize // rowsize 920 # The number of rows in buffer needs to be an exact multiple of 921 # chunkshape[0] for queries using indexed columns. 922 # Fixes #319 and probably #409 too. 923 nrowsinbuf -= nrowsinbuf % self.chunkshape[0] 924 else: 925 nrowsinbuf = 1 926 927 # tableextension.pyx performs an assertion 928 # to make sure nrowsinbuf is greater than or 929 # equal to the chunksize. 930 # See gh-206 and gh-238 931 if self.chunkshape is not None: 932 if nrowsinbuf < self.chunkshape[0]: 933 nrowsinbuf = self.chunkshape[0] 934 935 # Safeguard against row sizes being extremely large 936 if nrowsinbuf == 0: 937 nrowsinbuf = 1 938 # If rowsize is too large, issue a Performance warning 939 maxrowsize = params['BUFFER_TIMES'] * buffersize 940 if rowsize > maxrowsize: 941 warnings.warn("""\ 942The Table ``%s`` is exceeding the maximum recommended rowsize (%d bytes); 943be ready to see PyTables asking for *lots* of memory and possibly slow 944I/O. You may want to reduce the rowsize by trimming the value of 945dimensions that are orthogonal (and preferably close) to the *main* 946dimension of this leave. 
Alternatively, in case you have specified a 947very small/large chunksize, you may want to increase/decrease it.""" 948 % (self._v_pathname, maxrowsize), 949 PerformanceWarning) 950 return nrowsinbuf 951 952 def _getemptyarray(self, dtype): 953 # Acts as a cache for empty arrays 954 key = dtype 955 if key in self._empty_array_cache: 956 return self._empty_array_cache[key] 957 else: 958 self._empty_array_cache[ 959 key] = arr = numpy.empty(shape=0, dtype=key) 960 return arr 961 962 def _get_container(self, shape): 963 "Get the appropriate buffer for data depending on table nestedness." 964 965 # This is *much* faster than the numpy.rec.array counterpart 966 return numpy.empty(shape=shape, dtype=self._v_dtype) 967 968 def _get_type_col_names(self, type_): 969 """Returns a list containing 'type_' column names.""" 970 971 return [colobj._v_pathname 972 for colobj in self.description._f_walk('Col') 973 if colobj.type == type_] 974 975 def _get_enum_map(self): 976 """Return mapping from enumerated column names to `Enum` instances.""" 977 978 enumMap = {} 979 for colobj in self.description._f_walk('Col'): 980 if colobj.kind == 'enum': 981 enumMap[colobj._v_pathname] = colobj.enum 982 return enumMap 983 984 def _g_create(self): 985 """Create a new table on disk.""" 986 987 # Warning against assigning too much columns... 988 # F. Alted 2005-06-05 989 maxColumns = self._v_file.params['MAX_COLUMNS'] 990 if (len(self.description._v_names) > maxColumns): 991 warnings.warn( 992 "table ``%s`` is exceeding the recommended " 993 "maximum number of columns (%d); " 994 "be ready to see PyTables asking for *lots* of memory " 995 "and possibly slow I/O" % (self._v_pathname, maxColumns), 996 PerformanceWarning) 997 998 # 1. Create the HDF5 table (some parameters need to be computed). 
999 1000 # Fix the byteorder of the recarray and update the number of 1001 # expected rows if necessary 1002 if self._v_recarray is not None: 1003 self._v_recarray = self._g_fix_byteorder_data(self._v_recarray, 1004 self._rabyteorder) 1005 if len(self._v_recarray) > self._v_expectedrows: 1006 self._v_expectedrows = len(self._v_recarray) 1007 # Compute a sensible chunkshape 1008 if self._v_chunkshape is None: 1009 self._v_chunkshape = self._calc_chunkshape( 1010 self._v_expectedrows, self.rowsize, self.rowsize) 1011 # Correct the byteorder, if still needed 1012 if self.byteorder is None: 1013 self.byteorder = sys.byteorder 1014 1015 # Cache some data which is already in the description. 1016 # This is necessary to happen before creation time in order 1017 # to be able to populate the self._v_wdflts 1018 self._cache_description_data() 1019 1020 # After creating the table, ``self._v_objectid`` needs to be 1021 # set because it is needed for setting attributes afterwards. 1022 self._v_objectid = self._create_table( 1023 self._v_new_title, self.filters.complib or '', obversion) 1024 self._v_recarray = None # not useful anymore 1025 self._rabyteorder = None # not useful anymore 1026 1027 # 2. Compute or get chunk shape and buffer size parameters. 1028 self.nrowsinbuf = self._calc_nrowsinbuf() 1029 1030 # 3. Get field fill attributes from the table description and 1031 # set them on disk. 1032 if self._v_file.params['PYTABLES_SYS_ATTRS']: 1033 set_attr = self._v_attrs._g__setattr 1034 for i, colobj in enumerate(self.description._f_walk(type="Col")): 1035 fieldname = "FIELD_%d_FILL" % i 1036 set_attr(fieldname, colobj.dflt) 1037 1038 return self._v_objectid 1039 1040 def _g_open(self): 1041 """Opens a table from disk and read the metadata on it. 1042 1043 Creates an user description on the flight to easy the access to 1044 the actual data. 1045 1046 """ 1047 1048 # 1. Open the HDF5 table and get some data from it. 
1049 self._v_objectid, description, chunksize = self._get_info() 1050 self._v_expectedrows = self.nrows # the actual number of rows 1051 1052 # 2. Create an instance description to host the record fields. 1053 validate = not self._v_file._isPTFile # only for non-PyTables files 1054 self.description = Description(description, validate=validate, ptparams=self._v_file.params) 1055 1056 # 3. Compute or get chunk shape and buffer size parameters. 1057 if chunksize == 0: 1058 self._v_chunkshape = self._calc_chunkshape( 1059 self._v_expectedrows, self.rowsize, self.rowsize) 1060 else: 1061 self._v_chunkshape = (chunksize,) 1062 self.nrowsinbuf = self._calc_nrowsinbuf() 1063 1064 # 4. If there are field fill attributes, get them from disk and 1065 # set them in the table description. 1066 if self._v_file.params['PYTABLES_SYS_ATTRS']: 1067 if "FIELD_0_FILL" in self._v_attrs._f_list("sys"): 1068 i = 0 1069 get_attr = self._v_attrs.__getattr__ 1070 for objcol in self.description._f_walk(type="Col"): 1071 colname = objcol._v_pathname 1072 # Get the default values for each column 1073 fieldname = "FIELD_%s_FILL" % i 1074 defval = get_attr(fieldname) 1075 if defval is not None: 1076 objcol.dflt = defval 1077 else: 1078 warnings.warn("could not load default value " 1079 "for the ``%s`` column of table ``%s``; " 1080 "using ``%r`` instead" 1081 % (colname, self._v_pathname, 1082 objcol.dflt)) 1083 defval = objcol.dflt 1084 i += 1 1085 1086 # Set also the correct value in the desc._v_dflts dictionary 1087 for descr in self.description._f_walk(type="Description"): 1088 for name in descr._v_names: 1089 objcol = descr._v_colobjects[name] 1090 if isinstance(objcol, Col): 1091 descr._v_dflts[objcol._v_name] = objcol.dflt 1092 1093 # 5. Cache some data which is already in the description. 1094 self._cache_description_data() 1095 1096 return self._v_objectid 1097 1098 def _cache_description_data(self): 1099 """Cache some data which is already in the description. 
1100 1101 Some information is extracted from `self.description` to build 1102 some useful (but redundant) structures: 1103 1104 * `self.colnames` 1105 * `self.colpathnames` 1106 * `self.coldescrs` 1107 * `self.coltypes` 1108 * `self.coldtypes` 1109 * `self.coldflts` 1110 * `self._v_dtype` 1111 * `self._time64colnames` 1112 * `self._strcolnames` 1113 * `self._colenums` 1114 1115 """ 1116 1117 self.colnames = list(self.description._v_names) 1118 self.colpathnames = [ 1119 col._v_pathname for col in self.description._f_walk() 1120 if not hasattr(col, '_v_names')] # bottom-level 1121 1122 # Find ``time64`` column names. 1123 self._time64colnames = self._get_type_col_names('time64') 1124 # Find ``string`` column names. 1125 self._strcolnames = self._get_type_col_names('string') 1126 # Get a mapping of enumerated columns to their `Enum` instances. 1127 self._colenums = self._get_enum_map() 1128 1129 # Get info about columns 1130 for colobj in self.description._f_walk(type="Col"): 1131 colname = colobj._v_pathname 1132 # Get the column types, types and defaults 1133 self.coldescrs[colname] = colobj 1134 self.coltypes[colname] = colobj.type 1135 self.coldtypes[colname] = colobj.dtype 1136 self.coldflts[colname] = colobj.dflt 1137 1138 # Assign _v_dtype for this table 1139 self._v_dtype = self.description._v_dtype 1140 1141 def _get_column_instance(self, colpathname): 1142 """Get the instance of the column with the given `colpathname`. 1143 1144 If the column does not exist in the table, a `KeyError` is 1145 raised. 1146 1147 """ 1148 1149 try: 1150 return _reduce(getattr, colpathname.split('/'), self.description) 1151 except AttributeError: 1152 raise KeyError("table ``%s`` does not have a column named ``%s``" 1153 % (self._v_pathname, colpathname)) 1154 1155 _check_column = _get_column_instance 1156 1157 def _disable_indexing_in_queries(self): 1158 """Force queries not to use indexing. 
1159 1160 *Use only for testing.* 1161 1162 """ 1163 1164 if not self._enabled_indexing_in_queries: 1165 return # already disabled 1166 # The nail avoids setting/getting compiled conditions in/from 1167 # the cache where indexing is used. 1168 self._condition_cache.nail() 1169 self._enabled_indexing_in_queries = False 1170 1171 def _enable_indexing_in_queries(self): 1172 """Allow queries to use indexing. 1173 1174 *Use only for testing.* 1175 1176 """ 1177 1178 if self._enabled_indexing_in_queries: 1179 return # already enabled 1180 self._condition_cache.unnail() 1181 self._enabled_indexing_in_queries = True 1182 1183 def _required_expr_vars(self, expression, uservars, depth=1): 1184 """Get the variables required by the `expression`. 1185 1186 A new dictionary defining the variables used in the `expression` 1187 is returned. Required variables are first looked up in the 1188 `uservars` mapping, then in the set of top-level columns of the 1189 table. Unknown variables cause a `NameError` to be raised. 1190 1191 When `uservars` is `None`, the local and global namespace where 1192 the API callable which uses this method is called is sought 1193 instead. This mechanism will not work as expected if this 1194 method is not used *directly* from an API callable. To disable 1195 this mechanism, just specify a mapping as `uservars`. 1196 1197 Nested columns and columns from other tables are not allowed 1198 (`TypeError` and `ValueError` are raised, respectively). Also, 1199 non-column variable values are converted to NumPy arrays. 1200 1201 `depth` specifies the depth of the frame in order to reach local 1202 or global variables. 1203 1204 """ 1205 1206 # Get the names of variables used in the expression. 
1207 exprvarscache = self._exprvars_cache 1208 if expression not in exprvarscache: 1209 # Protection against growing the cache too much 1210 if len(exprvarscache) > 256: 1211 # Remove 10 (arbitrary) elements from the cache 1212 for k in list(exprvarscache.keys())[:10]: 1213 del exprvarscache[k] 1214 cexpr = compile(expression, '<string>', 'eval') 1215 exprvars = [var for var in cexpr.co_names 1216 if var not in ['None', 'False', 'True'] 1217 and var not in numexpr_functions] 1218 exprvarscache[expression] = exprvars 1219 else: 1220 exprvars = exprvarscache[expression] 1221 1222 # Get the local and global variable mappings of the user frame 1223 # if no mapping has been explicitly given for user variables. 1224 user_locals, user_globals = {}, {} 1225 if uservars is None: 1226 # We use specified depth to get the frame where the API 1227 # callable using this method is called. For instance: 1228 # 1229 # * ``table._required_expr_vars()`` (depth 0) is called by 1230 # * ``table._where()`` (depth 1) is called by 1231 # * ``table.where()`` (depth 2) is called by 1232 # * user-space functions (depth 3) 1233 user_frame = sys._getframe(depth) 1234 user_locals = user_frame.f_locals 1235 user_globals = user_frame.f_globals 1236 1237 colinstances = self.colinstances 1238 tblfile, tblpath = self._v_file, self._v_pathname 1239 # Look for the required variables first among the ones 1240 # explicitly provided by the user, then among implicit columns, 1241 # then among external variables (only if no explicit variables). 1242 reqvars = {} 1243 for var in exprvars: 1244 # Get the value. 1245 if uservars is not None and var in uservars: 1246 val = uservars[var] 1247 elif var in colinstances: 1248 val = colinstances[var] 1249 elif uservars is None and var in user_locals: 1250 val = user_locals[var] 1251 elif uservars is None and var in user_globals: 1252 val = user_globals[var] 1253 else: 1254 raise NameError("name ``%s`` is not defined" % var) 1255 1256 # Check the value. 
1257 if hasattr(val, 'pathname'): # non-nested column 1258 if val.shape[1:] != (): 1259 raise NotImplementedError( 1260 "variable ``%s`` refers to " 1261 "a multidimensional column, " 1262 "not yet supported in conditions, sorry" % var) 1263 if (val._table_file is not tblfile or 1264 val._table_path != tblpath): 1265 raise ValueError("variable ``%s`` refers to a column " 1266 "which is not part of table ``%s``" 1267 % (var, tblpath)) 1268 if val.dtype.str[1:] == 'u8': 1269 raise NotImplementedError( 1270 "variable ``%s`` refers to " 1271 "a 64-bit unsigned integer column, " 1272 "not yet supported in conditions, sorry; " 1273 "please use regular Python selections" % var) 1274 elif hasattr(val, '_v_colpathnames'): # nested column 1275 raise TypeError( 1276 "variable ``%s`` refers to a nested column, " 1277 "not allowed in conditions" % var) 1278 else: # only non-column values are converted to arrays 1279 # XXX: not 100% sure about this 1280 if isinstance(val, str): 1281 val = numpy.asarray(val.encode('ascii')) 1282 else: 1283 val = numpy.asarray(val) 1284 reqvars[var] = val 1285 return reqvars 1286 1287 def _get_condition_key(self, condition, condvars): 1288 """Get the condition cache key for `condition` with `condvars`. 1289 1290 Currently, the key is a tuple of `condition`, column variables 1291 names, normal variables names, column paths and variable paths 1292 (all are tuples). 1293 1294 """ 1295 1296 # Variable names for column and normal variables. 1297 colnames, varnames = [], [] 1298 # Column paths and types for each of the previous variable. 1299 colpaths, vartypes = [], [] 1300 for (var, val) in condvars.items(): 1301 if hasattr(val, 'pathname'): # column 1302 colnames.append(var) 1303 colpaths.append(val.pathname) 1304 else: # array 1305 try: 1306 varnames.append(var) 1307 vartypes.append(numexpr_getType(val)) # expensive 1308 except ValueError: 1309 # This is more clear than the error given by Numexpr. 
1310 raise TypeError("variable ``%s`` has data type ``%s``, " 1311 "not allowed in conditions" 1312 % (var, val.dtype.name)) 1313 colnames, varnames = tuple(colnames), tuple(varnames) 1314 colpaths, vartypes = tuple(colpaths), tuple(vartypes) 1315 condkey = (condition, colnames, varnames, colpaths, vartypes) 1316 return condkey 1317 1318 def _compile_condition(self, condition, condvars): 1319 """Compile the `condition` and extract usable index conditions. 1320 1321 This method returns an instance of ``CompiledCondition``. See 1322 the ``compile_condition()`` function in the ``conditions`` 1323 module for more information about the compilation process. 1324 1325 This method makes use of the condition cache when possible. 1326 1327 """ 1328 1329 # Look up the condition in the condition cache. 1330 condcache = self._condition_cache 1331 condkey = self._get_condition_key(condition, condvars) 1332 compiled = condcache.get(condkey) 1333 if compiled: 1334 return compiled.with_replaced_vars(condvars) # bingo! 1335 1336 # Bad luck, the condition must be parsed and compiled. 1337 # Fortunately, the key provides some valuable information. ;) 1338 (condition, colnames, varnames, colpaths, vartypes) = condkey 1339 1340 # Extract more information from referenced columns. 1341 1342 # start with normal variables 1343 typemap = dict(list(zip(varnames, vartypes))) 1344 indexedcols = [] 1345 for colname in colnames: 1346 col = condvars[colname] 1347 1348 # Extract types from *all* the given variables. 1349 coltype = col.dtype.type 1350 typemap[colname] = _nxtype_from_nptype[coltype] 1351 1352 # Get the set of columns with usable indexes. 1353 if (self._enabled_indexing_in_queries # no in-kernel searches 1354 and self.colindexed[col.pathname] and not col.index.dirty): 1355 indexedcols.append(colname) 1356 1357 indexedcols = frozenset(indexedcols) 1358 # Now let ``compile_condition()`` do the Numexpr-related job. 
1359 compiled = compile_condition(condition, typemap, indexedcols) 1360 1361 # Check that there actually are columns in the condition. 1362 if not set(compiled.parameters).intersection(set(colnames)): 1363 raise ValueError("there are no columns taking part " 1364 "in condition ``%s``" % (condition,)) 1365 1366 # Store the compiled condition in the cache and return it. 1367 condcache[condkey] = compiled 1368 return compiled.with_replaced_vars(condvars) 1369 1370 def will_query_use_indexing(self, condition, condvars=None): 1371 """Will a query for the condition use indexing? 1372 1373 The meaning of the condition and *condvars* arguments is the same as in 1374 the :meth:`Table.where` method. If condition can use indexing, this 1375 method returns a frozenset with the path names of the columns whose 1376 index is usable. Otherwise, it returns an empty list. 1377 1378 This method is mainly intended for testing. Keep in mind that changing 1379 the set of indexed columns or their dirtiness may make this method 1380 return different values for the same arguments at different times. 1381 1382 """ 1383 1384 # Compile the condition and extract usable index conditions. 1385 condvars = self._required_expr_vars(condition, condvars, depth=2) 1386 compiled = self._compile_condition(condition, condvars) 1387 # Return the columns in indexed expressions 1388 idxcols = [condvars[var].pathname for var in compiled.index_variables] 1389 return frozenset(idxcols) 1390 1391 def where(self, condition, condvars=None, 1392 start=None, stop=None, step=None): 1393 """Iterate over values fulfilling a condition. 1394 1395 This method returns a Row iterator (see :ref:`RowClassDescr`) which 1396 only selects rows in the table that satisfy the given condition (an 1397 expression-like string). 1398 1399 The condvars mapping may be used to define the variable names appearing 1400 in the condition. 
condvars should consist of identifier-like strings 1401 pointing to Column (see :ref:`ColumnClassDescr`) instances *of this 1402 table*, or to other values (which will be converted to arrays). A 1403 default set of condition variables is provided where each top-level, 1404 non-nested column with an identifier-like name appears. Variables in 1405 condvars override the default ones. 1406 1407 When condvars is not provided or None, the current local and global 1408 namespace is sought instead of condvars. The previous mechanism is 1409 mostly intended for interactive usage. To disable it, just specify a 1410 (maybe empty) mapping as condvars. 1411 1412 If a range is supplied (by setting some of the start, stop or step 1413 parameters), only the rows in that range and fulfilling the condition 1414 are used. The meaning of the start, stop and step parameters is the 1415 same as for Python slices. 1416 1417 When possible, indexed columns participating in the condition will be 1418 used to speed up the search. It is recommended that you place the 1419 indexed columns as left and out in the condition as possible. Anyway, 1420 this method has always better performance than regular Python 1421 selections on the table. 1422 1423 You can mix this method with regular Python selections in order to 1424 support even more complex queries. It is strongly recommended that you 1425 pass the most restrictive condition as the parameter to this method if 1426 you want to achieve maximum performance. 1427 1428 .. warning:: 1429 1430 When in the middle of a table row iterator, you should not 1431 use methods that can change the number of rows in the table 1432 (like :meth:`Table.append` or :meth:`Table.remove_rows`) or 1433 unexpected errors will happen. 1434 1435 Examples 1436 -------- 1437 1438 :: 1439 1440 >>> passvalues = [ row['col3'] for row in 1441 ... table.where('(col1 > 0) & (col2 <= 20)', step=5) 1442 ... 
if your_function(row['col2']) ] 1443 >>> print("Values that pass the cuts:", passvalues) 1444 1445 .. note:: 1446 1447 A special care should be taken when the query condition includes 1448 string literals. Indeed Python 2 string literals are string of 1449 bytes while Python 3 strings are unicode objects. 1450 1451 Let's assume that the table ``table`` has the following 1452 structure:: 1453 1454 class Record(IsDescription): 1455 col1 = StringCol(4) # 4-character String of bytes 1456 col2 = IntCol() 1457 col3 = FloatCol() 1458 1459 The type of "col1" do not change depending on the Python version 1460 used (of course) and it always corresponds to strings of bytes. 1461 1462 Any condition involving "col1" should be written using the 1463 appropriate type for string literals in order to avoid 1464 :exc:`TypeError`\ s. 1465 1466 The code below will work fine in Python 2 but will fail with a 1467 :exc:`TypeError` in Python 3:: 1468 1469 condition = 'col1 == "AAAA"' 1470 for record in table.where(condition): # TypeError in Python3 1471 # do something with "record" 1472 1473 The reason is that in Python 3 "condition" implies a comparison 1474 between a string of bytes ("col1" contents) and an unicode literal 1475 ("AAAA"). 1476 1477 The correct way to write the condition is:: 1478 1479 condition = 'col1 == b"AAAA"' 1480 1481 .. versionchanged:: 3.0 1482 The start, stop and step parameters now behave like in slice. 1483 1484 """ 1485 1486 return self._where(condition, condvars, start, stop, step) 1487 1488 def _where(self, condition, condvars, start=None, stop=None, step=None): 1489 """Low-level counterpart of `self.where()`.""" 1490 1491 if profile: 1492 tref = time() 1493 if profile: 1494 show_stats("Entering table._where", tref) 1495 # Adjust the slice to be used. 
1496 (start, stop, step) = self._process_range_read(start, stop, step) 1497 if start >= stop: # empty range, reset conditions 1498 self._use_index = False 1499 self._where_condition = None 1500 return iter([]) 1501 1502 # Compile the condition and extract usable index conditions. 1503 condvars = self._required_expr_vars(condition, condvars, depth=3) 1504 compiled = self._compile_condition(condition, condvars) 1505 1506 # Can we use indexes? 1507 if compiled.index_expressions: 1508 chunkmap = _table__where_indexed( 1509 self, compiled, condition, condvars, start, stop, step) 1510 if not isinstance(chunkmap, numpy.ndarray): 1511 # If it is not a NumPy array it should be an iterator 1512 # Reset conditions 1513 self._use_index = False 1514 self._where_condition = None 1515 # ...and return the iterator 1516 return chunkmap 1517 else: 1518 chunkmap = None # default to an in-kernel query 1519 1520 args = [condvars[param] for param in compiled.parameters] 1521 self._where_condition = (compiled.function, args, compiled.kwargs) 1522 row = tableextension.Row(self) 1523 if profile: 1524 show_stats("Exiting table._where", tref) 1525 return row._iter(start, stop, step, chunkmap=chunkmap) 1526 1527 def read_where(self, condition, condvars=None, field=None, 1528 start=None, stop=None, step=None): 1529 """Read table data fulfilling the given *condition*. 1530 1531 This method is similar to :meth:`Table.read`, having their common 1532 arguments and return values the same meanings. However, only the rows 1533 fulfilling the *condition* are included in the result. 1534 1535 The meaning of the other arguments is the same as in the 1536 :meth:`Table.where` method. 
1537 1538 """ 1539 1540 self._g_check_open() 1541 coords = [p.nrow for p in 1542 self._where(condition, condvars, start, stop, step)] 1543 self._where_condition = None # reset the conditions 1544 if len(coords) > 1: 1545 cstart, cstop = coords[0], coords[-1] + 1 1546 if cstop - cstart == len(coords): 1547 # Chances for monotonically increasing row values. Refine. 1548 inc_seq = numpy.alltrue( 1549 numpy.arange(cstart, cstop) == numpy.array(coords)) 1550 if inc_seq: 1551 return self.read(cstart, cstop, field=field) 1552 return self.read_coordinates(coords, field) 1553 1554 def append_where(self, dstTable, condition=None, condvars=None, 1555 start=None, stop=None, step=None): 1556 """Append rows fulfilling the condition to the dstTable table. 1557 1558 dstTable must be capable of taking the rows resulting from the query, 1559 i.e. it must have columns with the expected names and compatible 1560 types. The meaning of the other arguments is the same as in the 1561 :meth:`Table.where` method. 1562 1563 The number of rows appended to dstTable is returned as a result. 1564 1565 .. versionchanged:: 3.0 1566 The *whereAppend* method has been renamed into *append_where*. 1567 1568 """ 1569 1570 self._g_check_open() 1571 1572 # Check that the destination file is not in read-only mode. 1573 dstTable._v_file._check_writable() 1574 1575 # Row objects do not support nested columns, so we must iterate 1576 # over the flat column paths. When rows support nesting, 1577 # ``self.colnames`` can be directly iterated upon. 
1578 colNames = [colName for colName in self.colpathnames] 1579 dstRow = dstTable.row 1580 nrows = 0 1581 if condition is not None: 1582 srcRows = self._where(condition, condvars, start, stop, step) 1583 else: 1584 srcRows = self.iterrows(start, stop, step) 1585 for srcRow in srcRows: 1586 for colName in colNames: 1587 dstRow[colName] = srcRow[colName] 1588 dstRow.append() 1589 nrows += 1 1590 dstTable.flush() 1591 return nrows 1592 1593 def get_where_list(self, condition, condvars=None, sort=False, 1594 start=None, stop=None, step=None): 1595 """Get the row coordinates fulfilling the given condition. 1596 1597 The coordinates are returned as a list of the current flavor. sort 1598 means that you want to retrieve the coordinates ordered. The default is 1599 to not sort them. 1600 1601 The meaning of the other arguments is the same as in the 1602 :meth:`Table.where` method. 1603 1604 """ 1605 1606 self._g_check_open() 1607 1608 coords = [p.nrow for p in 1609 self._where(condition, condvars, start, stop, step)] 1610 coords = numpy.array(coords, dtype=SizeType) 1611 # Reset the conditions 1612 self._where_condition = None 1613 if sort: 1614 coords = numpy.sort(coords) 1615 return internal_to_flavor(coords, self.flavor) 1616 1617 def itersequence(self, sequence): 1618 """Iterate over a sequence of row coordinates.""" 1619 1620 if not hasattr(sequence, '__getitem__'): 1621 raise TypeError(("Wrong 'sequence' parameter type. Only sequences " 1622 "are suported.")) 1623 # start, stop and step are necessary for the new iterator for 1624 # coordinates, and perhaps it would be useful to add them as 1625 # parameters in the future (not now, because I've just removed 1626 # the `sort` argument for 2.1). 1627 # 1628 # *Important note*: Negative values for step are not supported 1629 # for the general case, but only for the itersorted() and 1630 # read_sorted() purposes! The self._process_range_read will raise 1631 # an appropiate error. 1632 # F. Alted 2008-09-18 1633 # A.V. 
20130513: _process_range_read --> _process_range 1634 (start, stop, step) = self._process_range(None, None, None) 1635 if (start > stop) or (len(sequence) == 0): 1636 return iter([]) 1637 row = tableextension.Row(self) 1638 return row._iter(start, stop, step, coords=sequence) 1639 1640 def _check_sortby_csi(self, sortby, checkCSI): 1641 if isinstance(sortby, Column): 1642 icol = sortby 1643 elif isinstance(sortby, str): 1644 icol = self.cols._f_col(sortby) 1645 else: 1646 raise TypeError( 1647 "`sortby` can only be a `Column` or string object, " 1648 "but you passed an object of type: %s" % type(sortby)) 1649 if icol.is_indexed and icol.index.kind == "full": 1650 if checkCSI and not icol.index.is_csi: 1651 # The index exists, but it is not a CSI one. 1652 raise ValueError( 1653 "Field `%s` must have associated a CSI index " 1654 "in table `%s`, but the existing one is not. " 1655 % (sortby, self)) 1656 return icol.index 1657 else: 1658 raise ValueError( 1659 "Field `%s` must have associated a 'full' index " 1660 "in table `%s`." % (sortby, self)) 1661 1662 def itersorted(self, sortby, checkCSI=False, 1663 start=None, stop=None, step=None): 1664 """Iterate table data following the order of the index of sortby 1665 column. 1666 1667 The sortby column must have associated a full index. If you want to 1668 ensure a fully sorted order, the index must be a CSI one. You may want 1669 to use the checkCSI argument in order to explicitly check for the 1670 existence of a CSI index. 1671 1672 The meaning of the start, stop and step arguments is the same as in 1673 :meth:`Table.read`. 1674 1675 .. versionchanged:: 3.0 1676 If the *start* parameter is provided and *stop* is None then the 1677 table is iterated from *start* to the last line. 1678 In PyTables < 3.0 only one element was returned. 1679 1680 """ 1681 1682 index = self._check_sortby_csi(sortby, checkCSI) 1683 # Adjust the slice to be used. 
1684 (start, stop, step) = self._process_range(start, stop, step, 1685 warn_negstep=False) 1686 if (start > stop and 0 < step) or (start < stop and 0 > step): 1687 # Fall-back action is to return an empty iterator 1688 return iter([]) 1689 row = tableextension.Row(self) 1690 return row._iter(start, stop, step, coords=index) 1691 1692 def read_sorted(self, sortby, checkCSI=False, field=None, 1693 start=None, stop=None, step=None): 1694 """Read table data following the order of the index of sortby column. 1695 1696 The sortby column must have associated a full index. If you want to 1697 ensure a fully sorted order, the index must be a CSI one. You may want 1698 to use the checkCSI argument in order to explicitly check for the 1699 existence of a CSI index. 1700 1701 If field is supplied only the named column will be selected. If the 1702 column is not nested, an *array* of the current flavor will be 1703 returned; if it is, a *structured array* will be used instead. If no 1704 field is specified, all the columns will be returned in a structured 1705 array of the current flavor. 1706 1707 The meaning of the start, stop and step arguments is the same as in 1708 :meth:`Table.read`. 1709 1710 .. versionchanged:: 3.0 1711 The start, stop and step parameters now behave like in slice. 1712 1713 """ 1714 1715 self._g_check_open() 1716 index = self._check_sortby_csi(sortby, checkCSI) 1717 coords = index[start:stop:step] 1718 return self.read_coordinates(coords, field) 1719 1720 def iterrows(self, start=None, stop=None, step=None): 1721 """Iterate over the table using a Row instance. 1722 1723 If a range is not supplied, *all the rows* in the table are iterated 1724 upon - you can also use the :meth:`Table.__iter__` special method for 1725 that purpose. If you want to iterate over a given *range of rows* in 1726 the table, you may use the start, stop and step parameters. 1727 1728 .. 
warning::

            When in the middle of a table row iterator, you should not
            use methods that can change the number of rows in the table
            (like :meth:`Table.append` or :meth:`Table.remove_rows`) or
            unexpected errors will happen.

        See Also
        --------
        tableextension.Row : the table row iterator and field accessor

        Examples
        --------

        ::

            result = [ row['var2'] for row in table.iterrows(step=5)
                       if row['var1'] <= 20 ]

        .. versionchanged:: 3.0
           If the *start* parameter is provided and *stop* is None then the
           table is iterated from *start* to the last line.
           In PyTables < 3.0 only one element was returned.

        """
        (start, stop, step) = self._process_range(start, stop, step,
                                                  warn_negstep=False)
        if (start > stop and 0 < step) or (start < stop and 0 > step):
            # Fall-back action is to return an empty iterator
            return iter([])
        row = tableextension.Row(self)
        return row._iter(start, stop, step)

    def __iter__(self):
        """Iterate over the table using a Row instance.

        This is equivalent to calling :meth:`Table.iterrows` with default
        arguments, i.e. it iterates over *all the rows* in the table.

        See Also
        --------
        tableextension.Row : the table row iterator and field accessor

        Examples
        --------

        ::

            result = [ row['var2'] for row in table if row['var1'] <= 20 ]

        Which is equivalent to::

            result = [ row['var2'] for row in table.iterrows()
                       if row['var1'] <= 20 ]

        """

        return self.iterrows()

    def _read(self, start, stop, step, field=None, out=None):
        """Read a range of rows and return an in-memory object.

        If `field` names a sub-description of the table (it is in
        ``self.description._v_names`` but not in ``self.coldtypes``),
        the whole recarray is read and the field is selected afterwards.
        If `out` is given it is used as the destination buffer; it must
        be a C-contiguous array in the system byteorder with exactly the
        number of bytes required by the selection.
        """

        select_field = None
        if field:
            if field not in self.coldtypes:
                if field in self.description._v_names:
                    # Remember to select this field
                    select_field = field
                    field = None
                else:
                    raise KeyError(("Field {0} not found in table "
                                    "{1}").format(field, self))
            else:
                # The column hangs directly from the top
                dtype_field = self.coldtypes[field]

        # Return a rank-0 array if start > stop
        if (start >= stop and 0 < step) or (start <= stop and 0 > step):
            if field is None:
                nra = self._get_container(0)
                return nra
            return numpy.empty(shape=0, dtype=dtype_field)

        nrows = len(range(start, stop, step))

        if out is None:
            # Compute the shape of the resulting column object
            if field:
                # Create a container for the results
                result = numpy.empty(shape=nrows, dtype=dtype_field)
            else:
                # Recarray case
                result = self._get_container(nrows)
        else:
            # there is no fast way to byteswap, since different columns may
            # have different byteorders
            if not out.dtype.isnative:
                raise ValueError(("output array must be in system's byteorder "
                                  "or results will be incorrect"))
            if field:
                bytes_required = dtype_field.itemsize * nrows
            else:
                bytes_required = self.rowsize * nrows
            if bytes_required != out.nbytes:
                raise ValueError(('output array size invalid, got {0} bytes, '
                                  'need {1} bytes').format(out.nbytes,
                                                           bytes_required))
            if not out.flags['C_CONTIGUOUS']:
                raise ValueError('output array not C contiguous')
            result = out

        # Call the routine to fill-up the resulting array
        if step == 1 and not field:
            # This optimization works three times faster than
            # the row._fill_col method (up to 170 MB/s on a pentium IV @ 2GHz)
            self._read_records(start, stop - start, result)
        # Warning!: _read_field_name should not be used until
        # H5TBread_fields_name in tableextension will be finished
        # F. Alted 2005/05/26
        # XYX Shall we implement it for PyTables 2.0??
        elif field and step > 15 and 0:
            # NOTE: the 'and 0' above deliberately disables this branch
            # (see the warning comment); kept for future reference.
            # For step>15, this seems to work always faster than row._fill_col.
            self._read_field_name(result, start, stop, step, field)
        else:
            self.row._fill_col(result, start, stop, step, field)

        if select_field:
            return result[select_field]
        else:
            return result

    def read(self, start=None, stop=None, step=None, field=None, out=None):
        """Get data in the table as a (record) array.

        The start, stop and step parameters can be used to select only
        a *range of rows* in the table. Their meanings are the same as
        in the built-in Python slices.

        If field is supplied only the named column will be selected.
        If the column is not nested, an *array* of the current flavor
        will be returned; if it is, a *structured array* will be used
        instead.  If no field is specified, all the columns will be
        returned in a structured array of the current flavor.

        Columns under a nested column can be specified in the field
        parameter by using a slash character (/) as a separator (e.g.
        'position/x').

        The out parameter may be used to specify a NumPy array to
        receive the output data.  Note that the array must have the
        same size as the data selected with the other parameters.
        Note that the array's datatype is not checked and no type
        casting is performed, so if it does not match the datatype on
        disk, the output will not be correct.

        When specifying a single nested column with the field parameter,
        and supplying an output buffer with the out parameter, the
        output buffer must contain all columns in the table.
        The data in all columns will be read into the output buffer.
        However, only the specified nested column will be returned from
        the method call.

        When data is read from disk in NumPy format, the output will be
        in the current system's byteorder, regardless of how it is
        stored on disk.  If the out parameter is specified, the output
        array also must be in the current system's byteorder.

        .. versionchanged:: 3.0
           Added the *out* parameter.  Also the start, stop and step
           parameters now behave like in slice.

        Examples
        --------

        Reading the entire table::

            t.read()

        Reading record n. 6::

            t.read(6, 7)

        Reading from record n. 6 to the end of the table::

            t.read(6)

        """

        self._g_check_open()

        if field:
            self._check_column(field)

        if out is not None and self.flavor != 'numpy':
            msg = ("Optional 'out' argument may only be supplied if array "
                   "flavor is 'numpy', currently is {0}").format(self.flavor)
            raise TypeError(msg)

        start, stop, step = self._process_range(start, stop, step,
                                                warn_negstep=False)

        arr = self._read(start, stop, step, field, out)
        return internal_to_flavor(arr, self.flavor)

    def _read_coordinates(self, coords, field=None):
        """Private part of `read_coordinates()` with no flavor conversion."""

        coords = self._point_selection(coords)

        ncoords = len(coords)
        # Create a read buffer only if needed
        if field is None or ncoords > 0:
            # Doing a copy is faster when ncoords is small (<1000)
            if ncoords < min(1000, self.nrowsinbuf):
                result = self._v_iobuf[:ncoords].copy()
            else:
                result = self._get_container(ncoords)

        # Do the real read
        if ncoords > 0:
            # Turn coords into an array of coordinate indexes, if necessary
            if not (isinstance(coords, numpy.ndarray) and
                    coords.dtype.type is _npsizetype and
                    coords.flags.contiguous and
                    coords.flags.aligned):
                # Get a contiguous and aligned coordinate array
                coords = numpy.array(coords, dtype=SizeType)
            self._read_elements(coords, result)

        # Do the final conversions, if needed
        if field:
            if ncoords > 0:
                result = get_nested_field(result, field)
            else:
                # Get an empty array from the cache
                result = self._getemptyarray(self.coldtypes[field])
        return result

    def read_coordinates(self, coords, field=None):
        """Get a set of rows given their indexes as a (record) array.

        This method works much like the :meth:`Table.read` method, but it uses
        a sequence (coords) of row indexes to select the wanted columns,
        instead of a column range.

        The selected rows are returned in an array or structured array of the
        current flavor.

        """

        self._g_check_open()
        result = self._read_coordinates(coords, field)
        return internal_to_flavor(result, self.flavor)

    def get_enum(self, colname):
        """Get the enumerated type associated with the named column.

        If the column named colname (a string) exists and is of an enumerated
        type, the corresponding Enum instance (see :ref:`EnumClassDescr`) is
        returned. If it is not of an enumerated type, a TypeError is raised. If
        the column does not exist, a KeyError is raised.

        """

        self._check_column(colname)

        try:
            return self._colenums[colname]
        except KeyError:
            raise TypeError(
                "column ``%s`` of table ``%s`` is not of an enumerated type"
                % (colname, self._v_pathname))

    def col(self, name):
        """Get a column from the table.

        If a column called name exists in the table, it is read and returned as
        a NumPy object. If it does not exist, a KeyError is raised.

        Examples
        --------

        ::

            narray = table.col('var2')

        That statement is equivalent to::

            narray = table.read(field='var2')

        Here you can see how this method can be used as a shorthand for the
        :meth:`Table.read` method.

        """

        return self.read(field=name)

    def __getitem__(self, key):
        """Get a row or a range of rows from the table.

        If key argument is an integer, the corresponding table row is returned
        as a record of the current flavor. If key is a slice, the range of rows
        determined by it is returned as a structured array of the current
        flavor.

        In addition, NumPy-style point selections are supported.  In
        particular, if key is a list of row coordinates, the set of rows
        determined by it is returned.  Furthermore, if key is an array of
        boolean values, only the coordinates where key is True are returned.
        Note that for the latter to work it is necessary that key list would
        contain exactly as many rows as the table has.

        Examples
        --------

        ::

            record = table[4]
            recarray = table[4:1000:2]
            recarray = table[[4,1000]]   # only retrieves rows 4 and 1000
            recarray = table[[True, False, ..., True]]

        Those statements are equivalent to::

            record = table.read(start=4)[0]
            recarray = table.read(start=4, stop=1000, step=2)
            recarray = table.read_coordinates([4,1000])
            recarray = table.read_coordinates([True, False, ..., True])

        Here, you can see how indexing can be used as a shorthand for the
        :meth:`Table.read` and :meth:`Table.read_coordinates` methods.

        """

        self._g_check_open()

        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= self.nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += self.nrows
            (start, stop, step) = self._process_range(key, key + 1, 1)
            return self.read(start, stop, step)[0]
        elif isinstance(key, slice):
            (start, stop, step) = self._process_range(
                key.start, key.stop, key.step)
            return self.read(start, stop, step)
        # Try with a boolean or point selection
        elif type(key) in (list, tuple) or isinstance(key, numpy.ndarray):
            return self._read_coordinates(key, None)
        else:
            raise IndexError("Invalid index or slice: %r" % (key,))

    def __setitem__(self, key, value):
        """Set a row or a range of rows in the table.

        It takes different actions depending on the type of the *key*
        parameter: if it is an integer, the corresponding table row is
        set to *value* (a record or sequence capable of being converted
        to the table structure).  If *key* is a slice, the row slice
        determined by it is set to *value* (a record array or sequence
        capable of being converted to the table structure).

        In addition, NumPy-style point selections are supported.  In
        particular, if key is a list of row coordinates, the set of rows
        determined by it is set to value.  Furthermore, if key is an array of
        boolean values, only the coordinates where key is True are set to
        values from value.  Note that for the latter to work it is necessary
        that key list would contain exactly as many rows as the table has.

        Examples
        --------

        ::

            # Modify just one existing row
            table[2] = [456,'db2',1.2]

            # Modify two existing rows
            rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
                                   formats='i4,a3,f8')
            table[1:30:2] = rows             # modify a table slice
            table[[1,3]] = rows              # only modifies rows 1 and 3
            table[[True,False,True]] = rows  # only modifies rows 0 and 2

        Which is equivalent to::

            table.modify_rows(start=2, rows=[456,'db2',1.2])
            rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
                                   formats='i4,a3,f8')
            table.modify_rows(start=1, stop=3, step=2, rows=rows)
            table.modify_coordinates([1,3,2], rows)
            table.modify_coordinates([True, False, True], rows)

        Here, you can see how indexing can be used as a shorthand for the
        :meth:`Table.modify_rows` and :meth:`Table.modify_coordinates`
        methods.

        """

        self._g_check_open()
        self._v_file._check_writable()

        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= self.nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += self.nrows
            return self.modify_rows(key, key + 1, 1, [value])
        elif isinstance(key, slice):
            (start, stop, step) = self._process_range(
                key.start, key.stop, key.step)
            return self.modify_rows(start, stop, step, value)
        # Try with a boolean or point selection
        elif type(key) in (list, tuple) or isinstance(key, numpy.ndarray):
            return self.modify_coordinates(key, value)
        else:
            raise IndexError("Invalid index or slice: %r" % (key,))

    def _save_buffered_rows(self, wbufRA, lenrows):
        """Append the `lenrows` rows buffered in `wbufRA` to disk and
        update the indexes after the flush."""

        self._open_append(wbufRA)
        self._append_records(lenrows)
        self._close_append()
        if self.indexed:
            self._unsaved_indexedrows += lenrows
            # The table caches for indexed queries are dirty now
            self._dirtycache = True
            if self.autoindex:
                # Flush the unindexed rows
                self.flush_rows_to_index(_lastrow=False)
            else:
                # All the columns are dirty now
                self._mark_columns_as_dirty(self.colpathnames)

    def append(self, rows):
        """Append a sequence of rows to the end of the table.

        The rows argument may be any object which can be converted to
        a structured array compliant with the table structure
        (otherwise, a ValueError is raised).  This includes NumPy
        structured arrays, lists of tuples or array records, and a
        string or Python buffer.

        Examples
        --------

        ::

            import tables as tb

            class Particle(tb.IsDescription):
                name = tb.StringCol(16, pos=1)      # 16-character String
                lati = tb.IntCol(pos=2)             # integer
                longi = tb.IntCol(pos=3)            # integer
                pressure = tb.Float32Col(pos=4)     # float (single-precision)
                temperature = tb.FloatCol(pos=5)    # double (double-precision)

            fileh = tb.open_file('test4.h5', mode='w')
            table = fileh.create_table(fileh.root, 'table', Particle,
                                       "A table")

            # Append several rows in only one call
            table.append([("Particle: 10", 10, 0, 10 * 10, 10**2),
                          ("Particle: 11", 11, -1, 11 * 11, 11**2),
                          ("Particle: 12", 12, -2, 12 * 12, 12**2)])
            fileh.close()

        """

        self._g_check_open()
        self._v_file._check_writable()

        if not self._chunked:
            raise HDF5ExtError(
                "You cannot append rows to a non-chunked table.", h5bt=False)

        # Try to convert the object into a recarray compliant with table
        try:
            iflavor = flavor_of(rows)
            if iflavor != 'python':
                rows = array_as_internal(rows, iflavor)
            # Works for Python structures and always copies the original,
            # so the resulting object is safe for in-place conversion.
            wbufRA = numpy.rec.array(rows, dtype=self._v_dtype)
        except Exception as exc:  # XXX
            raise ValueError("rows parameter cannot be converted into a "
                             "recarray object compliant with table '%s'. "
                             "The error was: <%s>" % (str(self), exc))
        lenrows = wbufRA.shape[0]
        # If the number of rows to append is zero, don't do anything else
        if lenrows > 0:
            # Save write buffer to disk
            self._save_buffered_rows(wbufRA, lenrows)

    def _conv_to_recarr(self, obj):
        """Try to convert the object into a recarray."""

        try:
            iflavor = flavor_of(obj)
            if iflavor != 'python':
                obj = array_as_internal(obj, iflavor)
            if hasattr(obj, "shape") and obj.shape == ():
                # To allow conversion of scalars (void type) into arrays.
                # See http://projects.scipy.org/scipy/numpy/ticket/315
                # for discussion on how to pass buffers to constructors
                # See also http://projects.scipy.org/scipy/numpy/ticket/348
                recarr = numpy.array([obj], dtype=self._v_dtype)
            else:
                # Works for Python structures and always copies the original,
                # so the resulting object is safe for in-place conversion.
                recarr = numpy.rec.array(obj, dtype=self._v_dtype)
        except Exception as exc:  # XXX
            raise ValueError("Object cannot be converted into a recarray "
                             "object compliant with table format '%s'. "
                             "The error was: <%s>" %
                             (self.description._v_nested_descr, exc))

        return recarr

    def modify_coordinates(self, coords, rows):
        """Modify a series of rows in positions specified in coords.

        The values in the selected rows will be modified with the data given in
        rows.  This method returns the number of rows modified.

        The possible values for the rows argument are the same as in
        :meth:`Table.append`.

        """

        if rows is None:      # Nothing to be done
            return SizeType(0)

        # Convert the coordinates to something expected by HDF5
        coords = self._point_selection(coords)

        lcoords = len(coords)
        if len(rows) < lcoords:
            raise ValueError("The value has not enough elements to fill-in "
                             "the specified range")

        # Convert rows into a recarray
        recarr = self._conv_to_recarr(rows)

        if len(coords) > 0:
            # Do the actual update of rows
            self._update_elements(lcoords, coords, recarr)

        # Redo the index if needed
        self._reindex(self.colpathnames)

        return SizeType(lcoords)

    def modify_rows(self, start=None, stop=None, step=None, rows=None):
        """Modify a series of rows in the slice [start:stop:step].

        The values in the selected rows will be modified with the data given in
        rows.  This method returns the number of rows modified.  Should the
        modification exceed the length of the table, an IndexError is raised
        before changing data.

        The possible values for the rows argument are the same as in
        :meth:`Table.append`.

        """

        if step is None:
            step = 1
        if rows is None:      # Nothing to be done
            return SizeType(0)
        if start is None:
            start = 0

        if start < 0:
            raise ValueError("'start' must have a positive value.")
        if step < 1:
            raise ValueError(
                "'step' must have a value greater or equal than 1.")
        if stop is None:
            # compute the stop value. start + len(rows)*step does not work
            stop = start + (len(rows) - 1) * step + 1

        (start, stop, step) = self._process_range(start, stop, step)
        if stop > self.nrows:
            raise IndexError("This modification will exceed the length of "
                             "the table. Giving up.")
        # Compute the number of rows to read.
        nrows = len(range(start, stop, step))
        if len(rows) != nrows:
            raise ValueError("The value has different elements than the "
                             "specified range")

        # Convert rows into a recarray
        recarr = self._conv_to_recarr(rows)

        lenrows = len(recarr)
        if start + lenrows > self.nrows:
            raise IndexError("This modification will exceed the length of the "
                             "table. Giving up.")

        # Do the actual update
        self._update_records(start, stop, step, recarr)

        # Redo the index if needed
        self._reindex(self.colpathnames)

        return SizeType(lenrows)

    def modify_column(self, start=None, stop=None, step=None,
                      column=None, colname=None):
        """Modify one single column in the row slice [start:stop:step].

        The colname argument specifies the name of the column in the
        table to be modified with the data given in column.  This
        method returns the number of rows modified.  Should the
        modification exceed the length of the table, an IndexError is
        raised before changing data.

        The *column* argument may be any object which can be converted
        to a (record) array compliant with the structure of the column
        to be modified (otherwise, a ValueError is raised).  This
        includes NumPy (record) arrays, lists of scalars, tuples or
        array records, and a string or Python buffer.
2361 2362 """ 2363 if step is None: 2364 step = 1 2365 if not isinstance(colname, str): 2366 raise TypeError("The 'colname' parameter must be a string.") 2367 self._v_file._check_writable() 2368 2369 if column is None: # Nothing to be done 2370 return SizeType(0) 2371 if start is None: 2372 start = 0 2373 2374 if start < 0: 2375 raise ValueError("'start' must have a positive value.") 2376 if step < 1: 2377 raise ValueError( 2378 "'step' must have a value greater or equal than 1.") 2379 # Get the column format to be modified: 2380 objcol = self._get_column_instance(colname) 2381 descr = [objcol._v_parent._v_nested_descr[objcol._v_pos]] 2382 # Try to convert the column object into a NumPy ndarray 2383 try: 2384 # If the column is a recarray (or kind of), convert into ndarray 2385 if hasattr(column, 'dtype') and column.dtype.kind == 'V': 2386 column = numpy.rec.array(column, dtype=descr).field(0) 2387 else: 2388 # Make sure the result is always a *copy* of the original, 2389 # so the resulting object is safe for in-place conversion. 2390 iflavor = flavor_of(column) 2391 column = array_as_internal(column, iflavor) 2392 except Exception as exc: # XXX 2393 raise ValueError("column parameter cannot be converted into a " 2394 "ndarray object compliant with specified column " 2395 "'%s'. The error was: <%s>" % (str(column), exc)) 2396 2397 # Get rid of single-dimensional dimensions 2398 column = column.squeeze() 2399 if column.shape == (): 2400 # Oops, stripped off to much dimensions 2401 column.shape = (1,) 2402 2403 if stop is None: 2404 # compute the stop value. start + len(rows)*step does not work 2405 stop = start + (len(column) - 1) * step + 1 2406 (start, stop, step) = self._process_range(start, stop, step) 2407 if stop > self.nrows: 2408 raise IndexError("This modification will exceed the length of " 2409 "the table. Giving up.") 2410 # Compute the number of rows to read. 
2411 nrows = len(range(start, stop, step)) 2412 if len(column) < nrows: 2413 raise ValueError("The value has not enough elements to fill-in " 2414 "the specified range") 2415 # Now, read the original values: 2416 mod_recarr = self._read(start, stop, step) 2417 # Modify the appropriate column in the original recarray 2418 mod_col = get_nested_field(mod_recarr, colname) 2419 mod_col[:] = column 2420 # save this modified rows in table 2421 self._update_records(start, stop, step, mod_recarr) 2422 # Redo the index if needed 2423 self._reindex([colname]) 2424 2425 return SizeType(nrows) 2426 2427 def modify_columns(self, start=None, stop=None, step=None, 2428 columns=None, names=None): 2429 """Modify a series of columns in the row slice [start:stop:step]. 2430 2431 The names argument specifies the names of the columns in the 2432 table to be modified with the data given in columns. This 2433 method returns the number of rows modified. Should the 2434 modification exceed the length of the table, an IndexError 2435 is raised before changing data. 2436 2437 The columns argument may be any object which can be converted 2438 to a structured array compliant with the structure of the 2439 columns to be modified (otherwise, a ValueError is raised). 2440 This includes NumPy structured arrays, lists of tuples or array 2441 records, and a string or Python buffer. 
2442 2443 """ 2444 if step is None: 2445 step = 1 2446 if type(names) not in (list, tuple): 2447 raise TypeError("The 'names' parameter must be a list of strings.") 2448 2449 if columns is None: # Nothing to be done 2450 return SizeType(0) 2451 if start is None: 2452 start = 0 2453 if start < 0: 2454 raise ValueError("'start' must have a positive value.") 2455 if step < 1: 2456 raise ValueError(("'step' must have a value greater or " 2457 "equal than 1.")) 2458 descr = [] 2459 for colname in names: 2460 objcol = self._get_column_instance(colname) 2461 descr.append(objcol._v_parent._v_nested_descr[objcol._v_pos]) 2462 # descr.append(objcol._v_parent._v_dtype[objcol._v_pos]) 2463 # Try to convert the columns object into a recarray 2464 try: 2465 # Make sure the result is always a *copy* of the original, 2466 # so the resulting object is safe for in-place conversion. 2467 iflavor = flavor_of(columns) 2468 if iflavor != 'python': 2469 columns = array_as_internal(columns, iflavor) 2470 recarray = numpy.rec.array(columns, dtype=descr) 2471 else: 2472 recarray = numpy.rec.fromarrays(columns, dtype=descr) 2473 except Exception as exc: # XXX 2474 raise ValueError("columns parameter cannot be converted into a " 2475 "recarray object compliant with table '%s'. " 2476 "The error was: <%s>" % (str(self), exc)) 2477 2478 if stop is None: 2479 # compute the stop value. start + len(rows)*step does not work 2480 stop = start + (len(recarray) - 1) * step + 1 2481 (start, stop, step) = self._process_range(start, stop, step) 2482 if stop > self.nrows: 2483 raise IndexError("This modification will exceed the length of " 2484 "the table. Giving up.") 2485 # Compute the number of rows to read. 
2486 nrows = len(range(start, stop, step)) 2487 if len(recarray) < nrows: 2488 raise ValueError("The value has not enough elements to fill-in " 2489 "the specified range") 2490 # Now, read the original values: 2491 mod_recarr = self._read(start, stop, step) 2492 # Modify the appropriate columns in the original recarray 2493 for i, name in enumerate(recarray.dtype.names): 2494 mod_col = get_nested_field(mod_recarr, names[i]) 2495 mod_col[:] = recarray[name].squeeze() 2496 # save this modified rows in table 2497 self._update_records(start, stop, step, mod_recarr) 2498 # Redo the index if needed 2499 self._reindex(names) 2500 2501 return SizeType(nrows) 2502 2503 def flush_rows_to_index(self, _lastrow=True): 2504 """Add remaining rows in buffers to non-dirty indexes. 2505 2506 This can be useful when you have chosen non-automatic indexing 2507 for the table (see the :attr:`Table.autoindex` property in 2508 :class:`Table`) and you want to update the indexes on it. 2509 2510 """ 2511 2512 rowsadded = 0 2513 if self.indexed: 2514 # Update the number of unsaved indexed rows 2515 start = self._indexedrows 2516 nrows = self._unsaved_indexedrows 2517 for (colname, colindexed) in self.colindexed.items(): 2518 if colindexed: 2519 col = self.cols._g_col(colname) 2520 if nrows > 0 and not col.index.dirty: 2521 rowsadded = self._add_rows_to_index( 2522 colname, start, nrows, _lastrow, update=True) 2523 self._unsaved_indexedrows -= rowsadded 2524 self._indexedrows += rowsadded 2525 return rowsadded 2526 2527 def _add_rows_to_index(self, colname, start, nrows, lastrow, update): 2528 """Add more elements to the existing index.""" 2529 2530 # This method really belongs to Column, but since it makes extensive 2531 # use of the table, it gets dangerous when closing the file, since the 2532 # column may be accessing a table which is being destroyed. 
        index = self.cols._g_col(colname).index
        slicesize = index.slicesize
        # The next loop does not rely on xrange so that it can
        # deal with long ints (i.e. more than 32-bit integers)
        # This allows to index columns with more than 2**31 rows
        # F. Alted 2005-05-09
        startLR = index.sorted.nrows * slicesize
        indexedrows = startLR - start
        stop = start + nrows - slicesize + 1
        while startLR < stop:
            index.append(
                [self._read(startLR, startLR + slicesize, 1, colname)],
                update=update)
            indexedrows += slicesize
            startLR += slicesize
        # index the remaining rows in last row
        if lastrow and startLR < self.nrows:
            index.append_last_row(
                [self._read(startLR, self.nrows, 1, colname)],
                update=update)
            indexedrows += self.nrows - startLR
        return indexedrows

    def remove_rows(self, start=None, stop=None, step=None):
        """Remove a range of rows in the table.

        If only start is supplied, that row and all following will be deleted.
        If a range is supplied, i.e. both the start and stop parameters are
        passed, all the rows in the range are removed.

        .. versionchanged:: 3.0
           The start, stop and step parameters now behave like in slice.

        .. seealso:: remove_row()

        Parameters
        ----------
        start : int
            Sets the starting row to be removed. It accepts negative values
            meaning that the count starts from the end.  A value of 0 means
            the first row.
        stop : int
            Sets the last row to be removed to stop-1, i.e. the end point is
            omitted (in the Python range() tradition).  Negative values are
            also accepted. If None all rows after start will be removed.
        step : int
            The step size between rows to remove.

            .. versionadded:: 3.0

        Examples
        --------

        Removing rows from 5 to 10 (excluded)::

            t.remove_rows(5, 10)

        Removing all rows starting from the 10th::

            t.remove_rows(10)

        Removing the 6th row::

            t.remove_rows(6, 7)

        .. note::

            removing a single row can be done using the specific
            :meth:`remove_row` method.

        """

        (start, stop, step) = self._process_range(start, stop, step)
        nrows = self._remove_rows(start, stop, step)
        # remove_rows is an invalidating index operation
        self._reindex(self.colpathnames)

        return SizeType(nrows)

    def remove_row(self, n):
        """Removes a row from the table.

        Parameters
        ----------
        n : int
            The index of the row to remove.


        .. versionadded:: 3.0

        Examples
        --------

        Remove row 15::

            table.remove_row(15)

        Which is equivalent to::

            table.remove_rows(15, 16)

        .. warning::

            This is not equivalent to::

                table.remove_rows(15)

        """

        self.remove_rows(start=n, stop=n + 1)

    def _g_update_dependent(self):
        super(Table, self)._g_update_dependent()

        # Update the new path in columns
        self.cols._g_update_table_location(self)

        # Update the new path in the Row instance, if cached.  Fixes #224.
        if 'row' in self.__dict__:
            self.__dict__['row'] = tableextension.Row(self)

    def _g_move(self, newparent, newname):
        """Move this node in the hierarchy.

        This overloads the Node._g_move() method.

        """

        itgpathname = _index_pathname_of(self)

        # First, move the table to the new location.
        super(Table, self)._g_move(newparent, newname)

        # Then move the associated index group (if any).
        try:
            itgroup = self._v_file._get_node(itgpathname)
        except NoSuchNodeError:
            pass
        else:
            newigroup = self._v_parent
            newiname = _index_name_of(self)
            itgroup._g_move(newigroup, newiname)

    def _g_remove(self, recursive=False, force=False):
        # Remove the associated index group (if any).
        itgpathname = _index_pathname_of(self)
        try:
            itgroup = self._v_file._get_node(itgpathname)
        except NoSuchNodeError:
            pass
        else:
            itgroup._f_remove(recursive=True)
            self.indexed = False   # there are indexes no more

        # Remove the leaf itself from the hierarchy.
        super(Table, self)._g_remove(recursive, force)

    def _set_column_indexing(self, colpathname, indexed):
        """Mark the referred column as indexed or non-indexed."""

        colindexed = self.colindexed
        isindexed, wasindexed = bool(indexed), colindexed[colpathname]
        if isindexed == wasindexed:
            return  # indexing state is unchanged

        # Changing the set of indexed columns invalidates the condition cache
        self._condition_cache.clear()
        colindexed[colpathname] = isindexed
        self.indexed = max(colindexed.values())  # this is an OR :)

    def _mark_columns_as_dirty(self, colnames):
        """Mark column indexes in `colnames` as dirty."""

        assert len(colnames) > 0
        if self.indexed:
            colindexed, cols = self.colindexed, self.cols
            # Mark the proper indexes as dirty
            for colname in colnames:
                if colindexed[colname]:
                    col = cols._g_col(colname)
                    col.index.dirty = True

    def _reindex(self, colnames):
        """Re-index columns in `colnames` if automatic indexing is true."""

        if self.indexed:
            colindexed, cols = self.colindexed, self.cols
            colstoindex = []
            # Mark the proper indexes as dirty
            for colname in colnames:
                if colindexed[colname]:
                    col = cols._g_col(colname)
                    col.index.dirty = True
                    colstoindex.append(colname)
            # Now, re-index the dirty ones
            if self.autoindex and colstoindex:
                self._do_reindex(dirty=True)
            # The table caches for indexed queries are dirty now
            self._dirtycache = True

    def _do_reindex(self, dirty):
        """Common code for `reindex()` and `reindex_dirty()`."""

        indexedrows = 0
        for (colname, colindexed) in self.colindexed.items():
            if colindexed:
                indexcol = self.cols._g_col(colname)
                indexedrows = indexcol._do_reindex(dirty)
        # Update counters in case some column has been updated
        if indexedrows > 0:
            self._indexedrows = indexedrows
            self._unsaved_indexedrows = self.nrows - indexedrows

        return SizeType(indexedrows)

    def reindex(self):
        """Recompute all the existing indexes in the table.

        This can be useful when you suspect that, for any reason, the
        index information for columns is no longer valid and want to
        rebuild the indexes on it.

        """

        self._do_reindex(dirty=False)

    def reindex_dirty(self):
        """Recompute the existing indexes in table, *if* they are dirty.

        This can be useful when you have set :attr:`Table.autoindex`
        (see :class:`Table`) to false for the table and you want to
        update the indexes after an invalidating index operation
        (:meth:`Table.remove_rows`, for example).
2766 2767 """ 2768 2769 self._do_reindex(dirty=True) 2770 2771 def _g_copy_rows(self, object, start, stop, step, sortby, checkCSI): 2772 "Copy rows from self to object" 2773 if sortby is None: 2774 self._g_copy_rows_optim(object, start, stop, step) 2775 return 2776 lenbuf = self.nrowsinbuf 2777 absstep = step 2778 if step < 0: 2779 absstep = -step 2780 start, stop = stop + 1, start + 1 2781 if sortby is not None: 2782 index = self._check_sortby_csi(sortby, checkCSI) 2783 for start2 in range(start, stop, absstep * lenbuf): 2784 stop2 = start2 + absstep * lenbuf 2785 if stop2 > stop: 2786 stop2 = stop 2787 # The next 'if' is not needed, but it doesn't bother either 2788 if sortby is None: 2789 rows = self[start2:stop2:step] 2790 else: 2791 coords = index[start2:stop2:step] 2792 rows = self.read_coordinates(coords) 2793 # Save the records on disk 2794 object.append(rows) 2795 object.flush() 2796 2797 def _g_copy_rows_optim(self, object, start, stop, step): 2798 """Copy rows from self to object (optimized version)""" 2799 2800 nrowsinbuf = self.nrowsinbuf 2801 object._open_append(self._v_iobuf) 2802 nrowsdest = object.nrows 2803 for start2 in range(start, stop, step * nrowsinbuf): 2804 # Save the records on disk 2805 stop2 = start2 + step * nrowsinbuf 2806 if stop2 > stop: 2807 stop2 = stop 2808 # Optimized version (it saves some conversions) 2809 nrows = ((stop2 - start2 - 1) // step) + 1 2810 self.row._fill_col(self._v_iobuf, start2, stop2, step, None) 2811 # The output buffer is created anew, 2812 # so the operation is safe to in-place conversion. 
2813 object._append_records(nrows) 2814 nrowsdest += nrows 2815 object._close_append() 2816 2817 def _g_prop_indexes(self, other): 2818 """Generate index in `other` table for every indexed column here.""" 2819 2820 oldcols, newcols = self.colinstances, other.colinstances 2821 for colname in newcols: 2822 if (isinstance(oldcols[colname], Column)): 2823 oldcolindexed = oldcols[colname].is_indexed 2824 if oldcolindexed: 2825 oldcolindex = oldcols[colname].index 2826 newcol = newcols[colname] 2827 newcol.create_index( 2828 kind=oldcolindex.kind, optlevel=oldcolindex.optlevel, 2829 filters=oldcolindex.filters, tmp_dir=None) 2830 2831 def _g_copy_with_stats(self, group, name, start, stop, step, 2832 title, filters, chunkshape, _log, **kwargs): 2833 """Private part of Leaf.copy() for each kind of leaf.""" 2834 2835 # Get the private args for the Table flavor of copy() 2836 sortby = kwargs.pop('sortby', None) 2837 propindexes = kwargs.pop('propindexes', False) 2838 checkCSI = kwargs.pop('checkCSI', False) 2839 # Compute the correct indices. 2840 (start, stop, step) = self._process_range_read( 2841 start, stop, step, warn_negstep=sortby is None) 2842 # And the number of final rows 2843 nrows = len(range(start, stop, step)) 2844 # Create the new table and copy the selected data. 2845 newtable = Table(group, name, self.description, title=title, 2846 filters=filters, expectedrows=nrows, 2847 chunkshape=chunkshape, 2848 _log=_log) 2849 self._g_copy_rows(newtable, start, stop, step, sortby, checkCSI) 2850 nbytes = newtable.nrows * newtable.rowsize 2851 # Generate equivalent indexes in the new table, if required. 2852 if propindexes and self.indexed: 2853 self._g_prop_indexes(newtable) 2854 return (newtable, nbytes) 2855 2856 # This overloading of copy is needed here in order to document 2857 # the additional keywords for the Table case. 
    def copy(self, newparent=None, newname=None, overwrite=False,
             createparents=False, **kwargs):
        """Copy this table and return the new one.

        This method has the behavior and keywords described in
        :meth:`Leaf.copy`.  Moreover, it recognises the following additional
        keyword arguments.

        Parameters
        ----------
        sortby
            If specified, and sortby corresponds to a column with an index,
            then the copy will be sorted by this index.  If you want to ensure
            a fully sorted order, the index must be a CSI one.  A reverse
            sorted copy can be achieved by specifying a negative value for the
            step keyword.  If sortby is omitted or None, the original table
            order is used.
        checkCSI
            If true and a CSI index does not exist for the sortby column, an
            error will be raised.  If false (the default), it does nothing.
            You can use this flag in order to explicitly check for the
            existence of a CSI index.
        propindexes
            If true, the existing indexes in the source table are propagated
            (created) to the new one.  If false (the default), the indexes are
            not propagated.

        """

        # The extra keywords are consumed later by _g_copy_with_stats(),
        # which Leaf.copy() dispatches to; this override exists only to
        # document them for the Table case.
        return super(Table, self).copy(
            newparent, newname, overwrite, createparents, **kwargs)

    def flush(self):
        """Flush the table buffers.

        Appends any rows still buffered in the Row accessor, pushes
        unindexed rows into the indexes (when automatic indexing is on)
        and finally re-indexes dirty columns before delegating to the
        parent class flush.
        """

        # Flush rows that remains to be appended.  The Row accessor is
        # created lazily, hence the membership test on __dict__.
        if 'row' in self.__dict__:
            self.row._flush_buffered_rows()
        if self.indexed and self.autoindex:
            # Flush any unindexed row
            rowsadded = self.flush_rows_to_index(_lastrow=True)
            assert rowsadded <= 0 or self._indexedrows == self.nrows, \
                ("internal error: the number of indexed rows (%d) "
                 "and rows in the table (%d) is not equal; "
                 "please report this to the authors."
                 % (self._indexedrows, self.nrows))
            if self._dirtyindexes:
                # Finally, re-index any dirty column
                self.reindex_dirty()

        super(Table, self).flush()

    def _g_pre_kill_hook(self):
        """Code to be called before killing the node.

        Does *not* flush: it only warns when unflushed buffers or dirty
        indexes would be lost, and drops the I/O buffers (see the
        historical note below for why flushing here is unsafe).
        """

        # Flush the buffers before to clean-up them
        # self.flush()
        # It seems that flushing during the __del__ phase is a sure receipt for
        # bringing all kind of problems:
        # 1. Illegal Instruction
        # 2. Malloc(): trying to call free() twice
        # 3. Bus Error
        # 4. Segmentation fault
        # So, the best would be doing *nothing* at all in this __del__ phase.
        # As a consequence, the I/O will not be cleaned until a call to
        # Table.flush() would be done.  This could lead to a potentially large
        # memory consumption.
        # NOTE: The user should make a call to Table.flush() whenever he has
        # finished working with his table.
        # I've added a Performance warning in order to compel the user to
        # call self.flush() before the table is being preempted.
        # F. Alted 2006-08-03
        if (('row' in self.__dict__ and self.row._get_unsaved_nrows() > 0) or
            (self.indexed and self.autoindex and
             (self._unsaved_indexedrows > 0 or self._dirtyindexes))):
            # NOTE(review): "ineficient" below is a typo, but this is a
            # user-visible runtime string so it is left untouched here.
            warnings.warn(("table ``%s`` is being preempted from alive nodes "
                           "without its buffers being flushed or with some "
                           "index being dirty.  This may lead to very "
                           "ineficient use of resources and even to fatal "
                           "errors in certain situations.  Please do a call "
                           "to the .flush() or .reindex_dirty() methods on "
                           "this table before start using other nodes.")
                          % (self._v_pathname), PerformanceWarning)
        # Get rid of the IO buffers (if they have been created at all)
        mydict = self.__dict__
        if '_v_iobuf' in mydict:
            del mydict['_v_iobuf']
        if '_v_wdflts' in mydict:
            del mydict['_v_wdflts']

    def _f_close(self, flush=True):
        """Close this table.

        Parameters
        ----------
        flush : bool
            If true (the default), flush pending buffers before closing.
        """

        if not self._v_isopen:
            return  # the node is already closed

        # .. note::
        #
        #   As long as ``Table`` objects access their indices on closing,
        #   ``File.close()`` will need to make *two separate passes*
        #   to first close ``Table`` objects and then ``Index`` hierarchies.
        #

        # Flush right now so the row object does not get in the middle.
        if flush:
            self.flush()

        # Some warnings can be issued after calling `self._g_set_location()`
        # in `self.__init__()`.  If warnings are turned into exceptions,
        # `self._g_post_init_hook` may not be called and `self.cols` not set.
        # One example of this is
        # ``test_create.createTestCase.test05_maxFieldsExceeded()``.
        cols = self.cols
        if cols is not None:
            cols._g_close()

        # Close myself as a leaf.
        super(Table, self)._f_close(False)

    def __repr__(self):
        """This provides column metainfo in addition to standard __str__"""

        if self.indexed:
            format = """\
%s
  description := %r
  byteorder := %r
  chunkshape := %r
  autoindex := %r
  colindexes := %r"""
            return format % (str(self), self.description, self.byteorder,
                             self.chunkshape, self.autoindex,
                             _ColIndexes(self.colindexes))
        else:
            return """\
%s
  description := %r
  byteorder := %r
  chunkshape := %r""" % \
                (str(self), self.description, self.byteorder, self.chunkshape)


class Cols(object):
    """Container for columns in a table or nested column.

    This class is used as an *accessor* to the columns in a table or nested
    column.  It supports the *natural naming* convention, so that you can
    access the different columns as attributes which lead to Column instances
    (for non-nested columns) or other Cols instances (for nested columns).

    For instance, if table.cols is a Cols instance with a column named col1
    under it, the later can be accessed as table.cols.col1.  If col1 is nested
    and contains a col2 column, this can be accessed as table.cols.col1.col2
    and so on.  Because of natural naming, the names of members start with
    special prefixes, like in the Group class (see :ref:`GroupClassDescr`).

    Like the Column class (see :ref:`ColumnClassDescr`), Cols supports item
    access to read and write ranges of values in the table or nested column.


    .. rubric:: Cols attributes

    .. attribute:: _v_colnames

        A list of the names of the columns hanging directly
        from the associated table or nested column.  The order of
        the names matches the order of their respective columns in
        the containing table.

    .. attribute:: _v_colpathnames

        A list of the pathnames of all the columns under the
        associated table or nested column (in preorder).  If it does
        not contain nested columns, this is exactly the same as the
        :attr:`Cols._v_colnames` attribute.

    .. attribute:: _v_desc

        The associated Description instance (see
        :ref:`DescriptionClassDescr`).

    """

    @property
    def _v_table(self):
        "The parent Table instance (see :ref:`TableClassDescr`)."
        # Re-fetched through the File on every access so that the
        # accessor stays valid even if the table node is reopened.
        return self._v__tableFile._get_node(self._v__tablePath)

    def __init__(self, table, desc):
        # Assign through __dict__ directly so that __setattr__-style
        # machinery (natural naming) is not involved during construction.
        myDict = self.__dict__
        myDict['_v__tableFile'] = table._v_file
        myDict['_v__tablePath'] = table._v_pathname
        myDict['_v_desc'] = desc
        myDict['_v_colnames'] = desc._v_names
        myDict['_v_colpathnames'] = table.description._v_pathnames
        # Put the column in the local dictionary
        for name in desc._v_names:
            # A name present in desc._v_types is a plain (leaf) column;
            # otherwise it is a nested column, wrapped in another Cols.
            if name in desc._v_types:
                myDict[name] = Column(table, name, desc)
            else:
                myDict[name] = Cols(table, desc._v_colobjects[name])

    def _g_update_table_location(self, table):
        """Updates the location information about the associated `table`."""

        myDict = self.__dict__
        myDict['_v__tableFile'] = table._v_file
        myDict['_v__tablePath'] = table._v_pathname

        # Update the locations in individual columns.
        for colname in self._v_colnames:
            myDict[colname]._g_update_table_location(table)

    def __len__(self):
        """Get the number of top level columns in table."""

        return len(self._v_colnames)

    def _f_col(self, colname):
        """Get an accessor to the column colname.

        This method returns a Column instance (see :ref:`ColumnClassDescr`) if
        the requested column is not nested, and a Cols instance (see
        :ref:`ColsClassDescr`) if it is.  You may use full column pathnames in
        colname.

        Calling cols._f_col('col1/col2') is equivalent to using cols.col1.col2.
        However, the first syntax is more intended for programmatic use.  It is
        also better if you want to access columns with names that are not valid
        Python identifiers.

        Raises
        ------
        TypeError
            If colname is not a string.
        KeyError
            If no column with that (path)name hangs from this accessor.
        """

        if not isinstance(colname, str):
            raise TypeError("Parameter can only be an string. You passed "
                            "object: %s" % colname)
        if ((colname.find('/') > -1 and
             colname not in self._v_colpathnames) and
                colname not in self._v_colnames):
            raise KeyError(("Cols accessor ``%s.cols%s`` does not have a "
                            "column named ``%s``")
                           % (self._v__tablePath, self._v_desc._v_pathname,
                              colname))

        return self._g_col(colname)

    def _g_col(self, colname):
        """Like `self._f_col()` but it does not check arguments."""

        # Get the Column or Description object
        # Walk the nested Cols instances component by component.
        inames = colname.split('/')
        cols = self
        for iname in inames:
            cols = cols.__dict__[iname]
        return cols

    def __getitem__(self, key):
        """Get a row or a range of rows from a table or nested column.

        If key argument is an integer, the corresponding nested type row is
        returned as a record of the current flavor.  If key is a slice, the
        range of rows determined by it is returned as a structured array of the
        current flavor.

        Examples
        --------

        ::

            record = table.cols[4]  # equivalent to table[4]
            recarray = table.cols.Info[4:1000:2]

        Those statements are equivalent to::

            nrecord = table.read(start=4)[0]
            nrecarray = table.read(start=4, stop=1000, step=2).field('Info')

        Here you can see how a mix of natural naming, indexing and slicing can
        be used as shorthands for the :meth:`Table.read` method.

        """

        table = self._v_table
        nrows = table.nrows
        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += nrows
            (start, stop, step) = table._process_range(key, key + 1, 1)
            colgroup = self._v_desc._v_pathname
            if colgroup == "":  # The root group
                return table.read(start, stop, step)[0]
            else:
                # For a nested column, read the full row and extract the
                # sub-record addressed by this accessor's pathname.
                crecord = table.read(start, stop, step)[0]
                return crecord[colgroup]
        elif isinstance(key, slice):
            (start, stop, step) = table._process_range(
                key.start, key.stop, key.step)
            colgroup = self._v_desc._v_pathname
            if colgroup == "":  # The root group
                return table.read(start, stop, step)
            else:
                crecarray = table.read(start, stop, step)
                if hasattr(crecarray, "field"):
                    return crecarray.field(colgroup)  # RecArray case
                else:
                    return get_nested_field(crecarray, colgroup)  # numpy case
        else:
            raise TypeError("invalid index or slice: %r" % (key,))

    def __setitem__(self, key, value):
        """Set a row or a range of rows in a table or nested column.

        If key argument is an integer, the corresponding row is set to
        value.  If key is a slice, the range of rows determined by it is set to
        value.

        Examples
        --------

        ::

            table.cols[4] = record
            table.cols.Info[4:1000:2] = recarray

        Those statements are equivalent to::

            table.modify_rows(4, rows=record)
            table.modify_column(4, 1000, 2, colname='Info', column=recarray)

        Here you can see how a mix of natural naming, indexing and slicing
        can be used as shorthands for the :meth:`Table.modify_rows` and
        :meth:`Table.modify_column` methods.

        """

        table = self._v_table
        nrows = table.nrows
        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += nrows
            (start, stop, step) = table._process_range(key, key + 1, 1)
        elif isinstance(key, slice):
            (start, stop, step) = table._process_range(
                key.start, key.stop, key.step)
        else:
            raise TypeError("invalid index or slice: %r" % (key,))

        # Actually modify the correct columns
        colgroup = self._v_desc._v_pathname
        if colgroup == "":  # The root group
            table.modify_rows(start, stop, step, rows=value)
        else:
            table.modify_column(
                start, stop, step, colname=colgroup, column=value)

    def _g_close(self):
        # First, close the columns (ie possible indices open)
        for col in self._v_colnames:
            colobj = self._g_col(col)
            if isinstance(colobj, Column):
                colobj.close()
                # Delete the reference to column
                del self.__dict__[col]
            else:
                # Nested column: recurse into the child Cols accessor.
                colobj._g_close()

        self.__dict__.clear()

    def __str__(self):
        """The string representation for this object."""

        # The pathname
        tablepathname = self._v__tablePath
        descpathname = self._v_desc._v_pathname
        if descpathname:
            descpathname = "." + descpathname
        # Get this class name
        classname = self.__class__.__name__
        # The number of columns
        ncols = len(self._v_colnames)
        return "%s.cols%s (%s), %s columns" % \
               (tablepathname, descpathname, classname, ncols)

    def __repr__(self):
        """A detailed string representation for this object."""

        out = str(self) + "\n"
        for name in self._v_colnames:
            # Get this class name
            classname = getattr(self, name).__class__.__name__
            # The type
            if name in self._v_desc._v_dtypes:
                tcol = self._v_desc._v_dtypes[name]
                # The shape for this column
                shape = (self._v_table.nrows,) + \
                    self._v_desc._v_dtypes[name].shape
            else:
                tcol = "Description"
                # Description doesn't have a shape currently
                shape = ()
            out += "  %s (%s%s, %s)" % (name, classname, shape, tcol) + "\n"
        return out


class Column(object):
    """Accessor for a non-nested column in a table.

    Each instance of this class is associated with one *non-nested* column of a
    table.  These instances are mainly used to read and write data from the
    table columns using item access (like the Cols class - see
    :ref:`ColsClassDescr`), but there are a few other associated methods to
    deal with indexes.

    .. rubric:: Column attributes

    .. attribute:: descr

        The Description (see :ref:`DescriptionClassDescr`) instance of the
        parent table or nested column.

    .. attribute:: name

        The name of the associated column.

    .. attribute:: pathname

        The complete pathname of the associated column (the same as
        Column.name if the column is not inside a nested column).

    Parameters
    ----------
    table
        The parent table instance
    name
        The name of the column that is associated with this object
    descr
        The parent description object

    """

    # Lazy read-only attributes
    # `````````````````````````
    @lazyattr
    def dtype(self):
        """The NumPy dtype that most closely matches this column."""

        return self.descr._v_dtypes[self.name].base  # Get rid of shape info

    @lazyattr
    def type(self):
        """The PyTables type of the column (a string)."""

        return self.descr._v_types[self.name]

    # Properties
    # ~~~~~~~~~~

    @property
    def table(self):
        """The parent Table instance (see :ref:`TableClassDescr`)."""
        return self._table_file._get_node(self._table_path)

    @property
    def index(self):
        """The Index instance (see :ref:`IndexClassDescr`) associated with this
        column (None if the column is not indexed)."""
        # `_index_pathname_of_column_` (note the trailing underscore) is
        # presumably the path-based variant announced near the top of this
        # module ("versions that work with just paths") -- verify it exists.
        indexPath = _index_pathname_of_column_(self._table_path, self.pathname)
        try:
            index = self._table_file._get_node(indexPath)
        except NodeError:
            index = None  # The column is not indexed
        return index

    @lazyattr
    def _itemtype(self):
        # Full dtype including shape info (unlike `dtype`, which strips it).
        return self.descr._v_dtypes[self.name]

    @property
    def shape(self):
        "The shape of this column."
        return (self.table.nrows,) + self.descr._v_dtypes[self.name].shape

    @property
    def is_indexed(self):
        "True if the column is indexed, false otherwise."
        if self.index is None:
            return False
        else:
            return True

    @property
    def maindim(self):
        """"The dimension along which iterators work.  Its value is 0 (i.e. the
        first dimension)."""
        return 0

    def __init__(self, table, name, descr):
        self._table_file = table._v_file
        self._table_path = table._v_pathname
        self.name = name
        """The name of the associated column."""
        self.pathname = descr._v_colobjects[name]._v_pathname
        """The complete pathname of the associated column (the same as
        Column.name if the column is not inside a nested column)."""
        self.descr = descr
        """The Description (see :ref:`DescriptionClassDescr`) instance of the
        parent table or nested column."""

    def _g_update_table_location(self, table):
        """Updates the location information about the associated `table`."""

        self._table_file = table._v_file
        self._table_path = table._v_pathname

    def __len__(self):
        """Get the number of elements in the column.

        This matches the length in rows of the parent table.

        """

        return self.table.nrows

    def __getitem__(self, key):
        """Get a row or a range of rows from a column.

        If key argument is an integer, the corresponding element in the column
        is returned as an object of the current flavor.  If key is a slice, the
        range of elements determined by it is returned as an array of the
        current flavor.

        Examples
        --------

        ::

            print("Column handlers:")
            for name in table.colnames:
                print(table.cols._f_col(name))
            print("Select table.cols.name[1]-->", table.cols.name[1])
            print("Select table.cols.name[1:2]-->", table.cols.name[1:2])
            print("Select table.cols.name[:]-->", table.cols.name[:])
            print("Select table.cols._f_col('name')[:]-->",
                  table.cols._f_col('name')[:])

        The output of this for a certain arbitrary table is::

            Column handlers:
            /table.cols.name (Column(), string, idx=None)
            /table.cols.lati (Column(), int32, idx=None)
            /table.cols.longi (Column(), int32, idx=None)
            /table.cols.vector (Column(2,), int32, idx=None)
            /table.cols.matrix2D (Column(2, 2), float64, idx=None)
            Select table.cols.name[1]--> Particle:     11
            Select table.cols.name[1:2]--> ['Particle:     11']
            Select table.cols.name[:]--> ['Particle:     10'
             'Particle:     11' 'Particle:     12'
             'Particle:     13' 'Particle:     14']
            Select table.cols._f_col('name')[:]--> ['Particle:     10'
             'Particle:     11' 'Particle:     12'
             'Particle:     13' 'Particle:     14']

        See the :file:`examples/table2.py` file for a more complete example.

        """

        table = self.table

        # Generalized key support not there yet, but at least allow
        # for a tuple with one single element (the main dimension).
        # (key,) --> key
        if isinstance(key, tuple) and len(key) == 1:
            key = key[0]

        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= table.nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += table.nrows
            (start, stop, step) = table._process_range(key, key + 1, 1)
            return table.read(start, stop, step, self.pathname)[0]
        elif isinstance(key, slice):
            (start, stop, step) = table._process_range(
                key.start, key.stop, key.step)
            return table.read(start, stop, step, self.pathname)
        else:
            raise TypeError(
                "'%s' key type is not valid in this context" % key)

    def __iter__(self):
        """Iterate through all items in the column."""

        table = self.table
        itemsize = self.dtype.itemsize
        # Size the read buffer from the configured I/O buffer size.
        nrowsinbuf = table._v_file.params['IO_BUFFER_SIZE'] // itemsize
        buf = numpy.empty((nrowsinbuf, ), self._itemtype)
        max_row = len(self)
        for start_row in range(0, len(self), nrowsinbuf):
            end_row = min(start_row + nrowsinbuf, max_row)
            # Read directly into a view of the reusable buffer.
            buf_slice = buf[0:end_row - start_row]
            table.read(start_row, end_row, 1, field=self.pathname,
                       out=buf_slice)
            for row in buf_slice:
                yield row

    def __setitem__(self, key, value):
        """Set a row or a range of rows in a column.

        If key argument is an integer, the corresponding element is set to
        value.  If key is a slice, the range of elements determined by it is
        set to value.

        Examples
        --------

        ::

            # Modify row 1
            table.cols.col1[1] = -1

            # Modify rows 1 and 3
            table.cols.col1[1::2] = [2,3]

        Which is equivalent to::

            # Modify row 1
            table.modify_columns(start=1, columns=[[-1]], names=['col1'])

            # Modify rows 1 and 3
            columns = numpy.rec.fromarrays([[2,3]], formats='i4')
            table.modify_columns(start=1, step=2, columns=columns,
                                 names=['col1'])

        """

        table = self.table
        table._v_file._check_writable()

        # Generalized key support not there yet, but at least allow
        # for a tuple with one single element (the main dimension).
        # (key,) --> key
        if isinstance(key, tuple) and len(key) == 1:
            key = key[0]

        if is_idx(key):
            key = operator.index(key)

            # Index out of range protection
            if key >= table.nrows:
                raise IndexError("Index out of range")
            if key < 0:
                # To support negative values
                key += table.nrows
            return table.modify_column(key, key + 1, 1,
                                       [[value]], self.pathname)
        elif isinstance(key, slice):
            (start, stop, step) = table._process_range(
                key.start, key.stop, key.step)
            return table.modify_column(start, stop, step,
                                       value, self.pathname)
        else:
            # NOTE(review): __getitem__ raises TypeError for the same
            # situation; kept as ValueError to preserve the existing
            # exception contract for callers.
            raise ValueError("Non-valid index or slice: %s" % key)

    def create_index(self, optlevel=6, kind="medium", filters=None,
                     tmp_dir=None, _blocksizes=None, _testmode=False,
                     _verbose=False):
        """Create an index for this column.

        .. warning::

            In some situations it is useful to get a completely sorted
            index (CSI).  For those cases, it is best to use the
            :meth:`Column.create_csindex` method instead.

        Parameters
        ----------
        optlevel : int
            The optimization level for building the index.  The levels ranges
            from 0 (no optimization) up to 9 (maximum optimization).  Higher
            levels of optimization mean better chances for reducing the entropy
            of the index at the price of using more CPU, memory and I/O
            resources for creating the index.
        kind : str
            The kind of the index to be built.  It can take the 'ultralight',
            'light', 'medium' or 'full' values.  Lighter kinds ('ultralight'
            and 'light') mean that the index takes less space on disk, but will
            perform queries slower.  Heavier kinds ('medium' and 'full') mean
            better chances for reducing the entropy of the index (increasing
            the query speed) at the price of using more disk space as well as
            more CPU, memory and I/O resources for creating the index.

            Note that selecting a full kind with an optlevel of 9 (the maximum)
            guarantees the creation of an index with zero entropy, that is, a
            completely sorted index (CSI) - provided that the number of rows in
            the table does not exceed the 2**48 figure (that is more than 100
            trillions of rows).  See :meth:`Column.create_csindex` method for a
            more direct way to create a CSI index.
        filters : Filters
            Specify the Filters instance used to compress the index.  If None,
            default index filters will be used (currently, zlib level 1 with
            shuffling).
        tmp_dir
            When kind is other than 'ultralight', a temporary file is created
            during the index build process.  You can use the tmp_dir argument
            to specify the directory for this temporary file.  The default is
            to create it in the same directory as the file containing the
            original table.

        Raises
        ------
        ValueError
            If kind, optlevel, tmp_dir or _blocksizes are invalid.
        """

        kinds = ['ultralight', 'light', 'medium', 'full']
        if kind not in kinds:
            raise ValueError("Kind must have any of these values: %s" % kinds)
        if (not isinstance(optlevel, int) or
                (optlevel < 0 or optlevel > 9)):
            raise ValueError("Optimization level must be an integer in the "
                             "range 0-9")
        if filters is None:
            filters = default_index_filters
        if tmp_dir is None:
            tmp_dir = os.path.dirname(self._table_file.filename)
        else:
            if not os.path.isdir(tmp_dir):
                raise ValueError("Temporary directory '%s' does not exist" %
                                 tmp_dir)
        if (_blocksizes is not None and
                (not isinstance(_blocksizes, tuple) or len(_blocksizes) != 4)):
            raise ValueError("_blocksizes must be a tuple with exactly 4 "
                             "elements")
        # The heavy lifting is done by the extension-level helper.
        idxrows = _column__create_index(self, optlevel, kind, filters,
                                        tmp_dir, _blocksizes, _verbose)
        return SizeType(idxrows)

    def create_csindex(self, filters=None, tmp_dir=None,
                       _blocksizes=None, _testmode=False, _verbose=False):
        """Create a completely sorted index (CSI) for this column.

        This method guarantees the creation of an index with zero entropy, that
        is, a completely sorted index (CSI) -- provided that the number of rows
        in the table does not exceed the 2**48 figure (that is more than 100
        trillions of rows).  A CSI index is needed for some table methods (like
        :meth:`Table.itersorted` or :meth:`Table.read_sorted`) in order to
        ensure completely sorted results.

        For the meaning of filters and tmp_dir arguments see
        :meth:`Column.create_index`.

        Notes
        -----
        This method is equivalent to
        Column.create_index(optlevel=9, kind='full', ...).

        """

        return self.create_index(
            kind='full', optlevel=9, filters=filters, tmp_dir=tmp_dir,
            _blocksizes=_blocksizes, _testmode=_testmode, _verbose=_verbose)

    def _do_reindex(self, dirty):
        """Common code for reindex() and reindex_dirty() codes."""

        index = self.index
        dodirty = True
        # NOTE(review): if the column is not indexed (index is None) and
        # `dirty` is true, the `index.dirty` access below raises
        # AttributeError before the `index is not None` guard is reached.
        # Left as-is in this documentation pass -- confirm intended behavior.
        if dirty and not index.dirty:
            dodirty = False
        if index is not None and dodirty:
            self._table_file._check_writable()
            # Get the old index parameters
            kind = index.kind
            optlevel = index.optlevel
            filters = index.filters
            # We *need* to tell the index that it is going to be undirty.
            # This is needed here so as to unnail() the condition cache.
            index.dirty = False
            # Delete the existing Index
            index._f_remove()
            # Create a new Index with the previous parameters
            return SizeType(self.create_index(
                kind=kind, optlevel=optlevel, filters=filters))
        else:
            return SizeType(0)  # The column is not intended for indexing

    def reindex(self):
        """Recompute the index associated with this column.

        This can be useful when you suspect that, for any reason,
        the index information is no longer valid and you want to rebuild it.

        This method does nothing if the column is not indexed.

        """

        self._do_reindex(dirty=False)

    def reindex_dirty(self):
        """Recompute the associated index only if it is dirty.

        This can be useful when you have set :attr:`Table.autoindex` to false
        for the table and you want to update the column's index after an
        invalidating index operation (like :meth:`Table.remove_rows`).

        This method does nothing if the column is not indexed.

        """

        self._do_reindex(dirty=True)

    def remove_index(self):
        """Remove the index associated with this column.

        This method does nothing if the column is not indexed.  The removed
        index can be created again by calling the :meth:`Column.create_index`
        method.

        """

        self._table_file._check_writable()

        # Remove the index if existing.
        if self.is_indexed:
            index = self.index
            index._f_remove()
            # Keep the table's per-column indexing bookkeeping in sync.
            self.table._set_column_indexing(self.pathname, False)

    def close(self):
        """Close this column."""

        self.__dict__.clear()

    def __str__(self):
        """The string representation for this object."""

        # The pathname
        tablepathname = self._table_path
        pathname = self.pathname.replace('/', '.')
        # Get this class name
        classname = self.__class__.__name__
        # The shape for this column
        shape = self.shape
        # The type
        tcol = self.descr._v_types[self.name]
        return "%s.cols.%s (%s%s, %s, idx=%s)" % \
               (tablepathname, pathname, classname, shape, tcol, self.index)

    def __repr__(self):
        """A detailed string representation for this object."""

        return str(self)


## Local Variables:
## mode: python
## py-indent-offset: 4
## tab-width: 4
## fill-column: 72
## End: