1# -*- coding: utf-8 -*- 2 3####################################################################### 4# 5# License: BSD 6# Created: June 08, 2004 7# Author: Francesc Alted - faltet@pytables.com 8# 9# $Id$ 10# 11######################################################################## 12 13"""Here is defined the Index class.""" 14 15 16import math 17import operator 18import os 19import os.path 20import sys 21import tempfile 22import warnings 23 24from time import time, perf_counter 25 26import numpy 27 28from .idxutils import (calc_chunksize, calcoptlevels, 29 get_reduction_level, nextafter, inftype) 30 31from . import indexesextension 32from .node import NotLoggedMixin 33from .atom import UIntAtom, Atom 34from .earray import EArray 35from .carray import CArray 36from .leaf import Filters 37from .indexes import CacheArray, LastRowArray, IndexArray 38from .group import Group 39from .path import join_path 40from .exceptions import PerformanceWarning 41from .utils import is_idx, idx2long, lazyattr 42from .utilsextension import (nan_aware_gt, nan_aware_ge, 43 nan_aware_lt, nan_aware_le, 44 bisect_left, bisect_right) 45from .lrucacheextension import ObjectCache 46 47 48 49# default version for INDEX objects 50# obversion = "1.0" # Version of indexes in PyTables 1.x series 51# obversion = "2.0" # Version of indexes in PyTables Pro 2.0 series 52obversion = "2.1" # Version of indexes in PyTables Pro 2.1 and up series, 53 # including the join 2.3 Std + Pro version 54 55 56debug = False 57# debug = True # Uncomment this for printing sizes purposes 58profile = False 59# profile = True # Uncomment for profiling 60if profile: 61 from .utils import show_stats 62 63 64# The default method for sorting 65# defsort = "quicksort" 66# Changing to mergesort to fix #441 67defsort = "mergesort" 68 69# Default policy for automatically updating indexes after a table 70# append operation, or automatically reindexing after an 71# index-invalidating operation like removing or modifying table rows. 72default_auto_index = True 73# Keep in sync with ``Table.autoindex`` docstring. 74 75# Default filters used to compress indexes. This is quite fast and 76# compression is pretty good. 77# Remember to keep these defaults in sync with the docstrings and UG. 78default_index_filters = Filters(complevel=1, complib='zlib', 79 shuffle=True, fletcher32=False) 80 81# Deprecated API 82defaultAutoIndex = default_auto_index 83defaultIndexFilters = default_index_filters 84 85# The list of types for which an optimised search in cython and C has 86# been implemented. Always add here the name of a new optimised type. 87opt_search_types = ("int8", "int16", "int32", "int64", 88 "uint8", "uint16", "uint32", "uint64", 89 "float32", "float64") 90 91# The upper limit for uint32 ints 92max32 = 2**32 93 94 95def _table_column_pathname_of_index(indexpathname): 96 names = indexpathname.split("/") 97 for i, name in enumerate(names): 98 if name.startswith('_i_'): 99 break 100 tablepathname = "/".join(names[:i]) + "/" + name[3:] 101 colpathname = "/".join(names[i + 1:]) 102 return (tablepathname, colpathname) 103 104 105class Index(NotLoggedMixin, Group, indexesextension.Index): 106 """Represents the index of a column in a table. 107 108 This class is used to keep the indexing information for columns in a Table 109 dataset (see :ref:`TableClassDescr`). It is actually a descendant of the 110 Group class (see :ref:`GroupClassDescr`), with some added functionality. An 111 Index is always associated with one and only one column in the table. 112 113 .. 
note:: 114 115 This class is mainly intended for internal use, but some of its 116 documented attributes and methods may be interesting for the 117 programmer. 118 119 Parameters 120 ---------- 121 parentnode 122 The parent :class:`Group` object. 123 124 .. versionchanged:: 3.0 125 Renamed from *parentNode* to *parentnode*. 126 127 name : str 128 The name of this node in its parent group. 129 atom : Atom 130 An Atom object representing the shape and type of the atomic objects to 131 be saved. Only scalar atoms are supported. 132 title 133 Sets a TITLE attribute of the Index entity. 134 kind 135 The desired kind for this index. The 'full' kind specifies a complete 136 track of the row position (64-bit), while the 'medium', 'light' or 137 'ultralight' kinds only specify in which chunk the row is (using 138 32-bit, 16-bit and 8-bit respectively). 139 optlevel 140 The desired optimization level for this index. 141 filters : Filters 142 An instance of the Filters class that provides information about the 143 desired I/O filters to be applied during the life of this object. 144 tmp_dir 145 The directory for the temporary files. 146 expectedrows 147 Represents an user estimate about the number of row slices that will be 148 added to the growable dimension in the IndexArray object. 149 byteorder 150 The byteorder of the index datasets *on-disk*. 151 blocksizes 152 The four main sizes of the compound blocks in index datasets (a low 153 level parameter). 154 155 """ 156 157 _c_classid = 'INDEX' 158 159 160 @property 161 def kind(self): 162 "The kind of this index." 163 return {1: 'ultralight', 2: 'light', 164 4: 'medium', 8: 'full'}[self.indsize] 165 166 @property 167 def filters(self): 168 """Filter properties for this index - see Filters in 169 :ref:`FiltersClassDescr`.""" 170 return self._v_filters 171 172 @property 173 def dirty(self): 174 """Whether the index is dirty or not. 175 Dirty indexes are out of sync with column data, so they exist but they 176 are not usable. 177 """ 178 179 # If there is no ``DIRTY`` attribute, index should be clean. 180 return getattr(self._v_attrs, 'DIRTY', False) 181 182 @dirty.setter 183 def dirty(self, dirty): 184 wasdirty, isdirty = self.dirty, bool(dirty) 185 self._v_attrs.DIRTY = dirty 186 # If an *actual* change in dirtiness happens, 187 # notify the condition cache by setting or removing a nail. 188 conditioncache = self.table._condition_cache 189 if not wasdirty and isdirty: 190 conditioncache.nail() 191 if wasdirty and not isdirty: 192 conditioncache.unnail() 193 194 @property 195 def column(self): 196 """The Column (see :ref:`ColumnClassDescr`) instance for the indexed 197 column.""" 198 199 tablepath, columnpath = _table_column_pathname_of_index( 200 self._v_pathname) 201 table = self._v_file._get_node(tablepath) 202 column = table.cols._g_col(columnpath) 203 return column 204 205 @property 206 def table(self): 207 """Accessor for the `Table` object of this index.""" 208 tablepath, columnpath = _table_column_pathname_of_index(self._v_pathname) 209 table = self._v_file._get_node(tablepath) 210 return table 211 212 @property 213 def nblockssuperblock(self): 214 "The number of blocks in a superblock." 215 return self.superblocksize // self.blocksize 216 217 @property 218 def nslicesblock(self): 219 "The number of slices in a block." 220 return self.blocksize // self.slicesize 221 222 @property 223 def nchunkslice(self): 224 "The number of chunks in a slice." 
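        # (Illustrative reminder, not from the original sources: an index is
        # laid out as superblocks > blocks > slices > chunks, so a slice of,
        # say, 131072 rows with a chunksize of 1024 holds 128 chunks.)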
225 return self.slicesize // self.chunksize 226 227 @property 228 def nsuperblocks(self): 229 "The total number of superblocks in index." 230 # Last row should not be considered as a superblock 231 nelements = self.nelements - self.nelementsILR 232 nblocks = nelements // self.superblocksize 233 if nelements % self.blocksize > 0: 234 nblocks += 1 235 return nblocks 236 237 @property 238 def nblocks(self): 239 "The total number of blocks in index." 240 # Last row should not be considered as a block 241 nelements = self.nelements - self.nelementsILR 242 nblocks = nelements // self.blocksize 243 if nelements % self.blocksize > 0: 244 nblocks += 1 245 return nblocks 246 247 @property 248 def nslices(self): 249 "The number of complete slices in index." 250 return self.nelements // self.slicesize 251 252 @property 253 def nchunks(self): 254 "The number of complete chunks in index." 255 return self.nelements // self.chunksize 256 257 @property 258 def shape(self): 259 "The shape of this index (in slices and elements)." 260 return (self.nrows, self.slicesize) 261 262 @property 263 def temp_required(self): 264 "Whether a temporary file for indexes is required or not." 265 return self.indsize > 1 and self.optlevel > 0 and self.table.nrows > self.slicesize 266 267 @property 268 def want_complete_sort(self): 269 "Whether we should try to build a completely sorted index or not." 270 return self.indsize == 8 and self.optlevel == 9 271 272 @property 273 def is_csi(self): 274 """Whether the index is completely sorted or not. 275 276 .. versionchanged:: 3.0 277 The *is_CSI* property has been renamed into *is_csi*. 278 279 """ 280 281 if self.nelements == 0: 282 # An index with 0 indexed elements is not a CSI one (by definition) 283 return False 284 if self.indsize < 8: 285 # An index that is not full cannot be completely sorted 286 return False 287 # Try with the 'is_csi' attribute 288 if 'is_csi' in self._v_attrs: 289 return self._v_attrs.is_csi 290 # If not, then compute the overlaps manually 291 # (the attribute 'is_csi' will be set there) 292 self.compute_overlaps(self, None, False) 293 return self.noverlaps == 0 294 295 @lazyattr 296 def nrowsinchunk(self): 297 """The number of rows that fits in a *table* chunk.""" 298 299 return self.table.chunkshape[0] 300 301 @lazyattr 302 def lbucket(self): 303 """Return the length of a bucket based index type.""" 304 305 # Avoid to set a too large lbucket size (mainly useful for tests) 306 lbucket = min(self.nrowsinchunk, self.chunksize) 307 if self.indsize == 1: 308 # For ultra-light, we will never have to keep track of a 309 # bucket outside of a slice. 310 maxnb = 2**8 311 if self.slicesize > maxnb * lbucket: 312 lbucket = int(math.ceil(float(self.slicesize) / maxnb)) 313 elif self.indsize == 2: 314 # For light, we will never have to keep track of a 315 # bucket outside of a block. 
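            # Worked example with illustrative figures (not from the original
            # sources): with blocksize = 2**24 rows and an initial lbucket of
            # 128, the 2**16 representable buckets only cover 2**16 * 128 =
            # 2**23 rows, so lbucket is enlarged below to
            # ceil(2**24 / 2**16) = 256 rows per bucket.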
316 maxnb = 2**16 317 if self.blocksize > maxnb * lbucket: 318 lbucket = int(math.ceil(float(self.blocksize) / maxnb)) 319 else: 320 # For medium and full indexes there should not be a need to 321 # increase lbucket 322 pass 323 return lbucket 324 325 def __init__(self, parentnode, name, 326 atom=None, title="", 327 kind=None, 328 optlevel=None, 329 filters=None, 330 tmp_dir=None, 331 expectedrows=0, 332 byteorder=None, 333 blocksizes=None, 334 new=True): 335 336 self._v_version = None 337 """The object version of this index.""" 338 self.optlevel = optlevel 339 """The optimization level for this index.""" 340 self.tmp_dir = tmp_dir 341 """The directory for the temporary files.""" 342 self.expectedrows = expectedrows 343 """The expected number of items of index arrays.""" 344 if byteorder in ["little", "big"]: 345 self.byteorder = byteorder 346 else: 347 self.byteorder = sys.byteorder 348 """The byteorder of the index datasets.""" 349 if atom is not None: 350 self.dtype = atom.dtype.base 351 self.type = atom.type 352 """The datatypes to be stored by the sorted index array.""" 353 ############### Important note ########################### 354 # The datatypes saved as index values are NumPy native 355 # types, so we get rid of type metainfo like Time* or Enum* 356 # that belongs to HDF5 types (actually, this metainfo is 357 # not needed for sorting and looking-up purposes). 358 ########################################################## 359 indsize = { 360 'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind] 361 assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!" 362 self.indsize = indsize 363 """The itemsize for the indices part of the index.""" 364 365 self.nrows = None 366 """The total number of slices in the index.""" 367 self.nelements = None 368 """The number of currently indexed rows for this column.""" 369 self.blocksizes = blocksizes 370 """The four main sizes of the compound blocks (if specified).""" 371 self.dirtycache = True 372 """Dirty cache (for ranges, bounds & sorted) flag.""" 373 self.superblocksize = None 374 """Size of the superblock for this index.""" 375 self.blocksize = None 376 """Size of the block for this index.""" 377 self.slicesize = None 378 """Size of the slice for this index.""" 379 self.chunksize = None 380 """Size of the chunk for this index.""" 381 self.tmpfilename = None 382 """Filename for temporary bounds.""" 383 self.opt_search_types = opt_search_types 384 """The types for which and optimized search has been implemented.""" 385 self.noverlaps = -1 386 """The number of overlaps in an index. 0 means a completely 387 sorted index. -1 means that this number is not computed yet.""" 388 self.tprof = 0 389 """Time counter for benchmarking purposes.""" 390 391 from .file import open_file 392 self._openFile = open_file 393 """The `open_file()` function, to avoid a circular import.""" 394 395 super(Index, self).__init__(parentnode, name, title, new, filters) 396 397 def _g_post_init_hook(self): 398 if self._v_new: 399 # The version for newly created indexes 400 self._v_version = obversion 401 super(Index, self)._g_post_init_hook() 402 403 # Index arrays must only be created for new indexes 404 if not self._v_new: 405 idxversion = self._v_version 406 # Set-up some variables from info on disk and return 407 attrs = self._v_attrs 408 # Coerce NumPy scalars to Python scalars in order 409 # to avoid undesired upcasting operations. 
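            # (Illustrative aside, not from the original sources: arithmetic
            # mixing NumPy scalar attributes with plain Python ints can
            # silently widen the result -- numpy.uint64(1) + 1 has
            # historically produced a float64 -- so the sizes below are
            # converted with int() first.)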
410 self.superblocksize = int(attrs.superblocksize) 411 self.blocksize = int(attrs.blocksize) 412 self.slicesize = int(attrs.slicesize) 413 self.chunksize = int(attrs.chunksize) 414 self.blocksizes = (self.superblocksize, self.blocksize, 415 self.slicesize, self.chunksize) 416 self.optlevel = int(attrs.optlevel) 417 sorted = self.sorted 418 indices = self.indices 419 self.dtype = sorted.atom.dtype 420 self.type = sorted.atom.type 421 self.indsize = indices.atom.itemsize 422 # Some sanity checks for slicesize, chunksize and indsize 423 assert self.slicesize == indices.shape[1], "Wrong slicesize" 424 assert self.chunksize == indices._v_chunkshape[ 425 1], "Wrong chunksize" 426 assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize" 427 if idxversion > "2.0": 428 self.reduction = int(attrs.reduction) 429 nelementsSLR = int(self.sortedLR.attrs.nelements) 430 nelementsILR = int(self.indicesLR.attrs.nelements) 431 else: 432 self.reduction = 1 433 nelementsILR = self.indicesLR[-1] 434 nelementsSLR = nelementsILR 435 self.nrows = sorted.nrows 436 self.nelements = self.nrows * self.slicesize + nelementsILR 437 self.nelementsSLR = nelementsSLR 438 self.nelementsILR = nelementsILR 439 if nelementsILR > 0: 440 self.nrows += 1 441 # Get the bounds as a cache (this has to remain here!) 442 rchunksize = self.chunksize // self.reduction 443 nboundsLR = (nelementsSLR - 1) // rchunksize 444 if nboundsLR < 0: 445 nboundsLR = 0 # correction for -1 bounds 446 nboundsLR += 2 # bounds + begin + end 447 # All bounds values (+begin + end) are at the end of sortedLR 448 self.bebounds = self.sortedLR[ 449 nelementsSLR:nelementsSLR + nboundsLR] 450 return 451 452 # The index is new. Initialize the values 453 self.nrows = 0 454 self.nelements = 0 455 self.nelementsSLR = 0 456 self.nelementsILR = 0 457 458 # The atom 459 atom = Atom.from_dtype(self.dtype) 460 461 # The filters 462 filters = self.filters 463 464 # Compute the superblocksize, blocksize, slicesize and chunksize values 465 # (in case these parameters haven't been passed to the constructor) 466 if self.blocksizes is None: 467 self.blocksizes = calc_chunksize( 468 self.expectedrows, self.optlevel, self.indsize) 469 (self.superblocksize, self.blocksize, 470 self.slicesize, self.chunksize) = self.blocksizes 471 if debug: 472 print("blocksizes:", self.blocksizes) 473 # Compute the reduction level 474 self.reduction = get_reduction_level( 475 self.indsize, self.optlevel, self.slicesize, self.chunksize) 476 rchunksize = self.chunksize // self.reduction 477 rslicesize = self.slicesize // self.reduction 478 479 # Save them on disk as attributes 480 self._v_attrs.superblocksize = numpy.uint64(self.superblocksize) 481 self._v_attrs.blocksize = numpy.uint64(self.blocksize) 482 self._v_attrs.slicesize = numpy.uint32(self.slicesize) 483 self._v_attrs.chunksize = numpy.uint32(self.chunksize) 484 # Save the optlevel as well 485 self._v_attrs.optlevel = self.optlevel 486 # Save the reduction level 487 self._v_attrs.reduction = self.reduction 488 489 # Create the IndexArray for sorted values 490 sorted = IndexArray(self, 'sorted', atom, "Sorted Values", 491 filters, self.byteorder) 492 493 # Create the IndexArray for index values 494 IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize), 495 "Number of chunk in table", filters, self.byteorder) 496 497 # Create the cache for range values (1st order cache) 498 CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters, 499 self.expectedrows // self.slicesize, 500 byteorder=self.byteorder) 501 # median ranges 
502 EArray(self, 'mranges', atom, (0,), "Median ranges", filters, 503 byteorder=self.byteorder, _log=False) 504 505 # Create the cache for boundary values (2nd order cache) 506 nbounds_inslice = (rslicesize - 1) // rchunksize 507 CacheArray(self, 'bounds', atom, (0, nbounds_inslice), 508 "Boundary Values", filters, self.nchunks, 509 (1, nbounds_inslice), byteorder=self.byteorder) 510 511 # begin, end & median bounds (only for numerical types) 512 EArray(self, 'abounds', atom, (0,), "Start bounds", filters, 513 byteorder=self.byteorder, _log=False) 514 EArray(self, 'zbounds', atom, (0,), "End bounds", filters, 515 byteorder=self.byteorder, _log=False) 516 EArray(self, 'mbounds', atom, (0,), "Median bounds", filters, 517 byteorder=self.byteorder, _log=False) 518 519 # Create the Array for last (sorted) row values + bounds 520 shape = (rslicesize + 2 + nbounds_inslice,) 521 sortedLR = LastRowArray(self, 'sortedLR', atom, shape, 522 "Last Row sorted values + bounds", 523 filters, (rchunksize,), 524 byteorder=self.byteorder) 525 526 # Create the Array for the number of chunk in last row 527 shape = (self.slicesize,) # enough for indexes and length 528 indicesLR = LastRowArray(self, 'indicesLR', 529 UIntAtom(itemsize=self.indsize), 530 shape, "Last Row indices", 531 filters, (self.chunksize,), 532 byteorder=self.byteorder) 533 534 # The number of elements in LR will be initialized here 535 sortedLR.attrs.nelements = 0 536 indicesLR.attrs.nelements = 0 537 538 # All bounds values (+begin + end) are uninitialized in creation time 539 self.bebounds = None 540 541 # The starts and lengths initialization 542 self.starts = numpy.empty(shape=self.nrows, dtype=numpy.int32) 543 """Where the values fulfiling conditions starts for every slice.""" 544 self.lengths = numpy.empty(shape=self.nrows, dtype=numpy.int32) 545 """Lengths of the values fulfilling conditions for every slice.""" 546 547 # Finally, create a temporary file for indexes if needed 548 if self.temp_required: 549 self.create_temp() 550 551 552 def initial_append(self, xarr, nrow, reduction): 553 """Compute an initial indices arrays for data to be indexed.""" 554 555 if profile: 556 tref = time() 557 if profile: 558 show_stats("Entering initial_append", tref) 559 arr = xarr.pop() 560 indsize = self.indsize 561 slicesize = self.slicesize 562 nelementsILR = self.nelementsILR 563 if profile: 564 show_stats("Before creating idx", tref) 565 if indsize == 8: 566 idx = numpy.arange(0, len(arr), dtype="uint64") + nrow * slicesize 567 elif indsize == 4: 568 # For medium (32-bit) all the rows in tables should be 569 # directly reachable. But as len(arr) < 2**31, we can 570 # choose uint32 for representing indices. In this way, we 571 # consume far less memory during the keysort process. The 572 # offset will be added in self.final_idx32() later on. 573 # 574 # This optimization also prevents the values in LR to 575 # participate in the ``swap_chunks`` process, and this is 576 # the main reason to not allow the medium indexes to create 577 # completely sorted indexes. However, I don't find this to 578 # be a big limitation, as probably fully indexes are much 579 # more suitable for producing completely sorted indexes 580 # because in this case the indices part is usable for 581 # getting the reverse indices of the index, and I forsee 582 # this to be a common requirement in many operations (for 583 # example, in table sorts). 584 # 585 # F. 
Alted 2008-09-15 586 idx = numpy.arange(0, len(arr), dtype="uint32") 587 else: 588 idx = numpy.empty(len(arr), "uint%d" % (indsize * 8)) 589 lbucket = self.lbucket 590 # Fill the idx with the bucket indices 591 offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket)) 592 idx[0:offset] = 0 593 for i in range(offset, slicesize, lbucket): 594 idx[i:i + lbucket] = (i + lbucket - 1) // lbucket 595 if indsize == 2: 596 # Add a second offset in this case 597 # First normalize the number of rows 598 offset2 = (nrow % self.nslicesblock) * slicesize // lbucket 599 idx += offset2 600 # Add the last row at the beginning of arr & idx (if needed) 601 if (indsize == 8 and nelementsILR > 0): 602 # It is possible that the values in LR are already sorted. 603 # Fetch them and override existing values in arr and idx. 604 assert len(arr) > nelementsILR 605 self.read_slice_lr(self.sortedLR, arr[:nelementsILR]) 606 self.read_slice_lr(self.indicesLR, idx[:nelementsILR]) 607 # In-place sorting 608 if profile: 609 show_stats("Before keysort", tref) 610 indexesextension.keysort(arr, idx) 611 larr = arr[-1] 612 if reduction > 1: 613 # It's important to do a copy() here in order to ensure that 614 # sorted._append() will receive a contiguous array. 615 if profile: 616 show_stats("Before reduction", tref) 617 reduc = arr[::reduction].copy() 618 if profile: 619 show_stats("After reduction", tref) 620 arr = reduc 621 if profile: 622 show_stats("After arr <-- reduc", tref) 623 # A completely sorted index is not longer possible after an 624 # append of an index with already one slice. 625 if nrow > 0: 626 self._v_attrs.is_csi = False 627 if profile: 628 show_stats("Exiting initial_append", tref) 629 return larr, arr, idx 630 631 def final_idx32(self, idx, offset): 632 """Perform final operations in 32-bit indices.""" 633 634 if profile: 635 tref = time() 636 if profile: 637 show_stats("Entering final_idx32", tref) 638 # Do an upcast first in order to add the offset. 639 idx = idx.astype('uint64') 640 idx += offset 641 # The next partition is valid up to table sizes of 642 # 2**30 * 2**18 = 2**48 bytes, that is, 256 Tera-elements, 643 # which should be a safe figure, at least for a while. 
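        # Rough illustration with hypothetical numbers (not from the original
        # sources): a row offset of 10**12 divided by an lbucket of 2**18
        # gives a bucket number of roughly 3.8e6, comfortably below the
        # 2**32 limit of the uint32 downcast performed below.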
644 idx //= self.lbucket 645 # After the division, we can downsize the indexes to 'uint32' 646 idx = idx.astype('uint32') 647 if profile: 648 show_stats("Exiting final_idx32", tref) 649 return idx 650 651 def append(self, xarr, update=False): 652 """Append the array to the index objects.""" 653 654 if profile: 655 tref = time() 656 if profile: 657 show_stats("Entering append", tref) 658 if not update and self.temp_required: 659 where = self.tmp 660 # The reduction will take place *after* the optimization process 661 reduction = 1 662 else: 663 where = self 664 reduction = self.reduction 665 sorted = where.sorted 666 indices = where.indices 667 ranges = where.ranges 668 mranges = where.mranges 669 bounds = where.bounds 670 mbounds = where.mbounds 671 abounds = where.abounds 672 zbounds = where.zbounds 673 sortedLR = where.sortedLR 674 indicesLR = where.indicesLR 675 nrows = sorted.nrows # before sorted.append() 676 larr, arr, idx = self.initial_append(xarr, nrows, reduction) 677 # Save the sorted array 678 sorted.append(arr.reshape(1, arr.size)) 679 cs = self.chunksize // reduction 680 ncs = self.nchunkslice 681 # Save ranges & bounds 682 ranges.append([[arr[0], larr]]) 683 bounds.append([arr[cs::cs]]) 684 abounds.append(arr[0::cs]) 685 zbounds.append(arr[cs - 1::cs]) 686 # Compute the medians 687 smedian = arr[cs // 2::cs] 688 mbounds.append(smedian) 689 mranges.append([smedian[ncs // 2]]) 690 if profile: 691 show_stats("Before deleting arr & smedian", tref) 692 del arr, smedian # delete references 693 if profile: 694 show_stats("After deleting arr & smedian", tref) 695 # Now that arr is gone, we can upcast the indices and add the offset 696 if self.indsize == 4: 697 idx = self.final_idx32(idx, nrows * self.slicesize) 698 indices.append(idx.reshape(1, idx.size)) 699 if profile: 700 show_stats("Before deleting idx", tref) 701 del idx 702 # Update counters after a successful append 703 self.nrows = nrows + 1 704 self.nelements = self.nrows * self.slicesize 705 self.nelementsSLR = 0 # reset the counter of the last row index to 0 706 self.nelementsILR = 0 # reset the counter of the last row index to 0 707 # The number of elements will be saved as an attribute. 708 # This is necessary in case the LR arrays can remember its values 709 # after a possible node preemtion/reload. 
710 sortedLR.attrs.nelements = self.nelementsSLR 711 indicesLR.attrs.nelements = self.nelementsILR 712 self.dirtycache = True # the cache is dirty now 713 if profile: 714 show_stats("Exiting append", tref) 715 716 def append_last_row(self, xarr, update=False): 717 """Append the array to the last row index objects.""" 718 719 if profile: 720 tref = time() 721 if profile: 722 show_stats("Entering appendLR", tref) 723 # compute the elements in the last row sorted & bounds array 724 nrows = self.nslices 725 if not update and self.temp_required: 726 where = self.tmp 727 # The reduction will take place *after* the optimization process 728 reduction = 1 729 else: 730 where = self 731 reduction = self.reduction 732 indicesLR = where.indicesLR 733 sortedLR = where.sortedLR 734 larr, arr, idx = self.initial_append(xarr, nrows, reduction) 735 nelementsSLR = len(arr) 736 nelementsILR = len(idx) 737 # Build the cache of bounds 738 rchunksize = self.chunksize // reduction 739 self.bebounds = numpy.concatenate((arr[::rchunksize], [larr])) 740 # The number of elements will be saved as an attribute 741 sortedLR.attrs.nelements = nelementsSLR 742 indicesLR.attrs.nelements = nelementsILR 743 # Save the number of elements, bounds and sorted values 744 # at the end of the sorted array 745 offset2 = len(self.bebounds) 746 sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds 747 sortedLR[:nelementsSLR] = arr 748 del arr 749 # Now that arr is gone, we can upcast the indices and add the offset 750 if self.indsize == 4: 751 idx = self.final_idx32(idx, nrows * self.slicesize) 752 # Save the reverse index array 753 indicesLR[:len(idx)] = idx 754 del idx 755 # Update counters after a successful append 756 self.nrows = nrows + 1 757 self.nelements = nrows * self.slicesize + nelementsILR 758 self.nelementsILR = nelementsILR 759 self.nelementsSLR = nelementsSLR 760 self.dirtycache = True # the cache is dirty now 761 if profile: 762 show_stats("Exiting appendLR", tref) 763 764 765 def optimize(self, verbose=False): 766 """Optimize an index so as to allow faster searches. 767 768 verbose 769 If True, messages about the progress of the 770 optimization process are printed out. 771 772 """ 773 774 if not self.temp_required: 775 return 776 777 if verbose: 778 self.verbose = True 779 else: 780 self.verbose = debug 781 782 # Initialize last_tover and last_nover 783 self.last_tover = 0 784 self.last_nover = 0 785 786 # Compute the correct optimizations for current optim level 787 opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize) 788 optmedian, optstarts, optstops, optfull = opts 789 790 if debug: 791 print("optvalues:", opts) 792 793 self.create_temp2() 794 # Start the optimization process 795 while True: 796 if optfull: 797 for niter in range(optfull): 798 if self.swap('chunks', 'median'): 799 break 800 if self.nblocks > 1: 801 # Swap slices only in the case that we have 802 # several blocks 803 if self.swap('slices', 'median'): 804 break 805 if self.swap('chunks', 'median'): 806 break 807 if self.swap('chunks', 'start'): 808 break 809 if self.swap('chunks', 'stop'): 810 break 811 else: 812 if optmedian: 813 if self.swap('chunks', 'median'): 814 break 815 if optstarts: 816 if self.swap('chunks', 'start'): 817 break 818 if optstops: 819 if self.swap('chunks', 'stop'): 820 break 821 break # If we reach this, exit the loop 822 823 # Check if we require a complete sort. 
Important: this step 824 # should be carried out *after* the optimization process has 825 # been completed (this is to guarantee that the complete sort 826 # does not take too much memory). 827 if self.want_complete_sort: 828 if self.noverlaps > 0: 829 self.do_complete_sort() 830 # Check that we have effectively achieved the complete sort 831 if self.noverlaps > 0: 832 warnings.warn( 833 "OPSI was not able to achieve a completely sorted index." 834 " Please report this to the authors.", UserWarning) 835 836 # Close and delete the temporal optimization index file 837 self.cleanup_temp() 838 return 839 840 def do_complete_sort(self): 841 """Bring an already optimized index into a complete sorted state.""" 842 843 if self.verbose: 844 t1 = time() 845 c1 = perf_counter() 846 ss = self.slicesize 847 tmp = self.tmp 848 ranges = tmp.ranges[:] 849 nslices = self.nslices 850 851 nelementsLR = self.nelementsILR 852 if nelementsLR > 0: 853 # Add the ranges corresponding to the last row 854 rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]]) 855 ranges = numpy.concatenate((ranges, [rangeslr])) 856 nslices += 1 857 858 sorted = tmp.sorted 859 indices = tmp.indices 860 sortedLR = tmp.sortedLR 861 indicesLR = tmp.indicesLR 862 sremain = numpy.array([], dtype=self.dtype) 863 iremain = numpy.array([], dtype='u%d' % self.indsize) 864 starts = numpy.zeros(shape=nslices, dtype=numpy.int_) 865 for i in range(nslices): 866 # Find the overlapping elements for slice i 867 sover = numpy.array([], dtype=self.dtype) 868 iover = numpy.array([], dtype='u%d' % self.indsize) 869 prev_end = ranges[i, 1] 870 for j in range(i + 1, nslices): 871 stj = starts[j] 872 if ((j < self.nslices and stj == ss) or 873 (j == self.nslices and stj == nelementsLR)): 874 # This slice has been already dealt with 875 continue 876 if j < self.nslices: 877 assert stj < ss, \ 878 "Two slices cannot overlap completely at this stage!" 879 next_beg = sorted[j, stj] 880 else: 881 assert stj < nelementsLR, \ 882 "Two slices cannot overlap completely at this stage!" 
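                    # The last-row pseudo-slice keeps its values in sortedLR
                    # rather than in the regular 'sorted' array, hence the
                    # separate access path below.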
883 next_beg = sortedLR[stj] 884 next_end = ranges[j, 1] 885 if prev_end > next_end: 886 # Complete overlapping case 887 if j < self.nslices: 888 sover = numpy.concatenate((sover, sorted[j, stj:])) 889 iover = numpy.concatenate((iover, indices[j, stj:])) 890 starts[j] = ss 891 else: 892 n = nelementsLR 893 sover = numpy.concatenate((sover, sortedLR[stj:n])) 894 iover = numpy.concatenate((iover, indicesLR[stj:n])) 895 starts[j] = nelementsLR 896 elif prev_end > next_beg: 897 idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj) 898 if j < self.nslices: 899 sover = numpy.concatenate((sover, sorted[j, stj:idx])) 900 iover = numpy.concatenate((iover, indices[j, stj:idx])) 901 else: 902 sover = numpy.concatenate((sover, sortedLR[stj:idx])) 903 iover = numpy.concatenate((iover, indicesLR[stj:idx])) 904 starts[j] = idx 905 # Build the extended slices to sort out 906 if i < self.nslices: 907 ssorted = numpy.concatenate( 908 (sremain, sorted[i, starts[i]:], sover)) 909 sindices = numpy.concatenate( 910 (iremain, indices[i, starts[i]:], iover)) 911 else: 912 ssorted = numpy.concatenate( 913 (sremain, sortedLR[starts[i]:nelementsLR], sover)) 914 sindices = numpy.concatenate( 915 (iremain, indicesLR[starts[i]:nelementsLR], iover)) 916 # Sort the extended slices 917 indexesextension.keysort(ssorted, sindices) 918 # Save the first elements of extended slices in the slice i 919 if i < self.nslices: 920 sorted[i] = ssorted[:ss] 921 indices[i] = sindices[:ss] 922 # Update caches for this slice 923 self.update_caches(i, ssorted[:ss]) 924 # Save the remaining values in a separate array 925 send = len(sover) + len(sremain) 926 sremain = ssorted[ss:ss + send] 927 iremain = sindices[ss:ss + send] 928 else: 929 # Still some elements remain for the last row 930 n = len(ssorted) 931 assert n == nelementsLR 932 send = 0 933 sortedLR[:n] = ssorted 934 indicesLR[:n] = sindices 935 # Update the caches for last row 936 sortedlr = sortedLR[:nelementsLR] 937 bebounds = numpy.concatenate( 938 (sortedlr[::self.chunksize], [sortedlr[-1]])) 939 sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds 940 self.bebounds = bebounds 941 942 # Verify that we have dealt with all the remaining values 943 assert send == 0 944 945 # Compute the overlaps in order to verify that we have achieved 946 # a complete sort. This has to be executed always (and not only 947 # in verbose mode!). 948 self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose) 949 if self.verbose: 950 t = round(time() - t1, 4) 951 c = round(perf_counter() - c1, 4) 952 print("time: %s. clock: %s" % (t, c)) 953 954 def swap(self, what, mode=None): 955 """Swap chunks or slices using a certain bounds reference.""" 956 957 # Thresholds for avoiding continuing the optimization 958 # thnover = 4 * self.slicesize # minimum number of overlapping 959 # # elements 960 thnover = 40 961 thmult = 0.1 # minimum ratio of multiplicity (a 10%) 962 thtover = 0.01 # minimum overlaping index for slices (a 1%) 963 964 if self.verbose: 965 t1 = time() 966 c1 = perf_counter() 967 if what == "chunks": 968 self.swap_chunks(mode) 969 elif what == "slices": 970 self.swap_slices(mode) 971 if mode: 972 message = "swap_%s(%s)" % (what, mode) 973 else: 974 message = "swap_%s" % (what,) 975 (nover, mult, tover) = self.compute_overlaps( 976 self.tmp, message, self.verbose) 977 rmult = len(mult.nonzero()[0]) / float(len(mult)) 978 if self.verbose: 979 t = round(time() - t1, 4) 980 c = round(perf_counter() - c1, 4) 981 print("time: %s. 
clock: %s" % (t, c)) 982 # Check that entropy is actually decreasing 983 if what == "chunks" and self.last_tover > 0. and self.last_nover > 0: 984 tover_var = (self.last_tover - tover) / self.last_tover 985 nover_var = (self.last_nover - nover) / self.last_nover 986 if tover_var < 0.05 and nover_var < 0.05: 987 # Less than a 5% of improvement is too few 988 return True 989 self.last_tover = tover 990 self.last_nover = nover 991 # Check if some threshold has met 992 if nover < thnover: 993 return True 994 if rmult < thmult: 995 return True 996 # Additional check for the overlap ratio 997 if tover >= 0. and tover < thtover: 998 return True 999 return False 1000 1001 def create_temp(self): 1002 """Create some temporary objects for slice sorting purposes.""" 1003 1004 # The index will be dirty during the index optimization process 1005 self.dirty = True 1006 # Build the name of the temporary file 1007 fd, self.tmpfilename = tempfile.mkstemp( 1008 ".tmp", "pytables-", self.tmp_dir) 1009 # Close the file descriptor so as to avoid leaks 1010 os.close(fd) 1011 # Create the proper PyTables file 1012 self.tmpfile = self._openFile(self.tmpfilename, "w") 1013 self.tmp = tmp = self.tmpfile.root 1014 cs = self.chunksize 1015 ss = self.slicesize 1016 filters = self.filters 1017 # temporary sorted & indices arrays 1018 shape = (0, ss) 1019 atom = Atom.from_dtype(self.dtype) 1020 EArray(tmp, 'sorted', atom, shape, 1021 "Temporary sorted", filters, chunkshape=(1, cs)) 1022 EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape, 1023 "Temporary indices", filters, chunkshape=(1, cs)) 1024 # temporary bounds 1025 nbounds_inslice = (ss - 1) // cs 1026 shape = (0, nbounds_inslice) 1027 EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds", 1028 filters, chunkshape=(cs, nbounds_inslice)) 1029 shape = (0,) 1030 EArray(tmp, 'abounds', atom, shape, "Temp start bounds", 1031 filters, chunkshape=(cs,)) 1032 EArray(tmp, 'zbounds', atom, shape, "Temp end bounds", 1033 filters, chunkshape=(cs,)) 1034 EArray(tmp, 'mbounds', atom, shape, "Median bounds", 1035 filters, chunkshape=(cs,)) 1036 # temporary ranges 1037 EArray(tmp, 'ranges', atom, (0, 2), 1038 "Temporary range values", filters, chunkshape=(cs, 2)) 1039 EArray(tmp, 'mranges', atom, (0,), 1040 "Median ranges", filters, chunkshape=(cs,)) 1041 # temporary last row (sorted) 1042 shape = (ss + 2 + nbounds_inslice,) 1043 CArray(tmp, 'sortedLR', atom, shape, 1044 "Temp Last Row sorted values + bounds", 1045 filters, chunkshape=(cs,)) 1046 # temporary last row (indices) 1047 shape = (ss,) 1048 CArray(tmp, 'indicesLR', 1049 UIntAtom(itemsize=self.indsize), 1050 shape, "Temp Last Row indices", 1051 filters, chunkshape=(cs,)) 1052 1053 def create_temp2(self): 1054 """Create some temporary objects for slice sorting purposes.""" 1055 1056 # The algorithms for doing the swap can be optimized so that 1057 # one should be necessary to create temporaries for keeping just 1058 # the contents of a single superblock. 1059 # F. 
Alted 2007-01-03 1060 cs = self.chunksize 1061 ss = self.slicesize 1062 filters = self.filters 1063 # temporary sorted & indices arrays 1064 shape = (self.nslices, ss) 1065 atom = Atom.from_dtype(self.dtype) 1066 tmp = self.tmp 1067 CArray(tmp, 'sorted2', atom, shape, 1068 "Temporary sorted 2", filters, chunkshape=(1, cs)) 1069 CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape, 1070 "Temporary indices 2", filters, chunkshape=(1, cs)) 1071 # temporary bounds 1072 nbounds_inslice = (ss - 1) // cs 1073 shape = (self.nslices, nbounds_inslice) 1074 CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2", 1075 filters, chunkshape=(cs, nbounds_inslice)) 1076 shape = (self.nchunks,) 1077 CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2", 1078 filters, chunkshape=(cs,)) 1079 CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2", 1080 filters, chunkshape=(cs,)) 1081 CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2", 1082 filters, chunkshape=(cs,)) 1083 # temporary ranges 1084 CArray(tmp, 'ranges2', atom, (self.nslices, 2), 1085 "Temporary range values 2", filters, chunkshape=(cs, 2)) 1086 CArray(tmp, 'mranges2', atom, (self.nslices,), 1087 "Median ranges 2", filters, chunkshape=(cs,)) 1088 1089 def cleanup_temp(self): 1090 """Copy the data and delete the temporaries for sorting purposes.""" 1091 1092 if self.verbose: 1093 print("Copying temporary data...") 1094 # tmp -> index 1095 reduction = self.reduction 1096 cs = self.chunksize // reduction 1097 ncs = self.nchunkslice 1098 tmp = self.tmp 1099 for i in range(self.nslices): 1100 # Copy sorted & indices slices 1101 sorted = tmp.sorted[i][::reduction].copy() 1102 self.sorted.append(sorted.reshape(1, sorted.size)) 1103 # Compute ranges 1104 self.ranges.append([[sorted[0], sorted[-1]]]) 1105 # Compute chunk bounds 1106 self.bounds.append([sorted[cs::cs]]) 1107 # Compute start, stop & median bounds and ranges 1108 self.abounds.append(sorted[0::cs]) 1109 self.zbounds.append(sorted[cs - 1::cs]) 1110 smedian = sorted[cs // 2::cs] 1111 self.mbounds.append(smedian) 1112 self.mranges.append([smedian[ncs // 2]]) 1113 del sorted, smedian # delete references 1114 # Now that sorted is gone, we can copy the indices 1115 indices = tmp.indices[i] 1116 self.indices.append(indices.reshape(1, indices.size)) 1117 1118 # Now it is the last row turn (if needed) 1119 if self.nelementsSLR > 0: 1120 # First, the sorted values 1121 sortedLR = self.sortedLR 1122 indicesLR = self.indicesLR 1123 nelementsLR = self.nelementsILR 1124 sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy() 1125 nelementsSLR = len(sortedlr) 1126 sortedLR[:nelementsSLR] = sortedlr 1127 # Now, the bounds 1128 self.bebounds = numpy.concatenate((sortedlr[::cs], [sortedlr[-1]])) 1129 offset2 = len(self.bebounds) 1130 sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds 1131 # Finally, the indices 1132 indicesLR[:] = tmp.indicesLR[:] 1133 # Update the number of (reduced) sorted elements 1134 self.nelementsSLR = nelementsSLR 1135 # The number of elements will be saved as an attribute 1136 self.sortedLR.attrs.nelements = self.nelementsSLR 1137 self.indicesLR.attrs.nelements = self.nelementsILR 1138 1139 if self.verbose: 1140 print("Deleting temporaries...") 1141 self.tmp = None 1142 self.tmpfile.close() 1143 os.remove(self.tmpfilename) 1144 self.tmpfilename = None 1145 1146 # The optimization process has finished, and the index is ok now 1147 self.dirty = False 1148 # ...but the memory data cache is dirty now 1149 self.dirtycache = True 1150 1151 def 
get_neworder(self, neworder, src_disk, tmp_disk, 1152 lastrow, nslices, offset, dtype): 1153 """Get sorted & indices values in new order.""" 1154 1155 cs = self.chunksize 1156 ncs = ncs2 = self.nchunkslice 1157 self_nslices = self.nslices 1158 tmp = numpy.empty(shape=self.slicesize, dtype=dtype) 1159 for i in range(nslices): 1160 ns = offset + i 1161 if ns == self_nslices: 1162 # The number of complete chunks in the last row 1163 ncs2 = self.nelementsILR // cs 1164 # Get slices in new order 1165 for j in range(ncs2): 1166 idx = neworder[i * ncs + j] 1167 ins = idx // ncs 1168 inc = (idx - ins * ncs) * cs 1169 ins += offset 1170 nc = j * cs 1171 if ins == self_nslices: 1172 tmp[nc:nc + cs] = lastrow[inc:inc + cs] 1173 else: 1174 tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs] 1175 if ns == self_nslices: 1176 # The number of complete chunks in the last row 1177 lastrow[:ncs2 * cs] = tmp[:ncs2 * cs] 1178 # The elements in the last chunk of the last row will 1179 # participate in the global reordering later on, during 1180 # the phase of sorting of *two* slices at a time 1181 # (including the last row slice, see 1182 # self.reorder_slices()). The caches for last row will 1183 # be updated in self.reorder_slices() too. 1184 # F. Altet 2008-08-25 1185 else: 1186 tmp_disk[ns] = tmp 1187 1188 def swap_chunks(self, mode="median"): 1189 """Swap & reorder the different chunks in a block.""" 1190 1191 boundsnames = { 1192 'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'} 1193 tmp = self.tmp 1194 sorted = tmp.sorted 1195 indices = tmp.indices 1196 tmp_sorted = tmp.sorted2 1197 tmp_indices = tmp.indices2 1198 sortedLR = tmp.sortedLR 1199 indicesLR = tmp.indicesLR 1200 cs = self.chunksize 1201 ncs = self.nchunkslice 1202 nsb = self.nslicesblock 1203 ncb = ncs * nsb 1204 ncb2 = ncb 1205 boundsobj = tmp._f_get_child(boundsnames[mode]) 1206 can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0) 1207 for nblock in range(self.nblocks): 1208 # Protection for last block having less chunks than ncb 1209 remainingchunks = self.nchunks - nblock * ncb 1210 if remainingchunks < ncb: 1211 ncb2 = remainingchunks 1212 if ncb2 <= 1: 1213 # if only zero or one chunks remains we are done 1214 break 1215 nslices = ncb2 // ncs 1216 bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2] 1217 # Do this only if lastrow elements can cross block boundaries 1218 if (nblock == self.nblocks - 1 and # last block 1219 can_cross_bbounds): 1220 nslices += 1 1221 ul = self.nelementsILR // cs 1222 bounds = numpy.concatenate((bounds, self.bebounds[:ul])) 1223 sbounds_idx = bounds.argsort(kind=defsort) 1224 offset = int(nblock * nsb) 1225 # Swap sorted and indices following the new order 1226 self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR, 1227 nslices, offset, self.dtype) 1228 self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR, 1229 nslices, offset, 'u%d' % self.indsize) 1230 # Reorder completely the index at slice level 1231 self.reorder_slices(tmp=True) 1232 1233 def read_slice(self, where, nslice, buffer, start=0): 1234 """Read a slice from the `where` dataset and put it in `buffer`.""" 1235 1236 # Create the buffers for specifying the coordinates 1237 self.startl = numpy.array([nslice, start], numpy.uint64) 1238 self.stopl = numpy.array([nslice + 1, start + buffer.size], 1239 numpy.uint64) 1240 self.stepl = numpy.ones(shape=2, dtype=numpy.uint64) 1241 where._g_read_slice(self.startl, self.stopl, self.stepl, buffer) 1242 1243 def write_slice(self, where, nslice, buffer, start=0): 1244 """Write 
a `slice` to the `where` dataset with the `buffer` data.""" 1245 1246 self.startl = numpy.array([nslice, start], numpy.uint64) 1247 self.stopl = numpy.array([nslice + 1, start + buffer.size], 1248 numpy.uint64) 1249 self.stepl = numpy.ones(shape=2, dtype=numpy.uint64) 1250 countl = self.stopl - self.startl # (1, self.slicesize) 1251 where._g_write_slice(self.startl, self.stepl, countl, buffer) 1252 1253 # Read version for LastRow 1254 def read_slice_lr(self, where, buffer, start=0): 1255 """Read a slice from the `where` dataset and put it in `buffer`.""" 1256 1257 startl = numpy.array([start], dtype=numpy.uint64) 1258 stopl = numpy.array([start + buffer.size], dtype=numpy.uint64) 1259 stepl = numpy.array([1], dtype=numpy.uint64) 1260 where._g_read_slice(startl, stopl, stepl, buffer) 1261 1262 # Write version for LastRow 1263 def write_sliceLR(self, where, buffer, start=0): 1264 """Write a slice from the `where` dataset with the `buffer` data.""" 1265 1266 startl = numpy.array([start], dtype=numpy.uint64) 1267 countl = numpy.array([start + buffer.size], dtype=numpy.uint64) 1268 stepl = numpy.array([1], dtype=numpy.uint64) 1269 where._g_write_slice(startl, stepl, countl, buffer) 1270 1271 1272 def reorder_slice(self, nslice, sorted, indices, ssorted, sindices, 1273 tmp_sorted, tmp_indices): 1274 """Copy & reorder the slice in source to final destination.""" 1275 1276 ss = self.slicesize 1277 # Load the second part in buffers 1278 self.read_slice(tmp_sorted, nslice, ssorted[ss:]) 1279 self.read_slice(tmp_indices, nslice, sindices[ss:]) 1280 indexesextension.keysort(ssorted, sindices) 1281 # Write the first part of the buffers to the regular leaves 1282 self.write_slice(sorted, nslice - 1, ssorted[:ss]) 1283 self.write_slice(indices, nslice - 1, sindices[:ss]) 1284 # Update caches 1285 self.update_caches(nslice - 1, ssorted[:ss]) 1286 # Shift the slice in the end to the beginning 1287 ssorted[:ss] = ssorted[ss:] 1288 sindices[:ss] = sindices[ss:] 1289 1290 def update_caches(self, nslice, ssorted): 1291 """Update the caches for faster lookups.""" 1292 1293 cs = self.chunksize 1294 ncs = self.nchunkslice 1295 tmp = self.tmp 1296 # update first & second cache bounds (ranges & bounds) 1297 tmp.ranges[nslice] = ssorted[[0, -1]] 1298 tmp.bounds[nslice] = ssorted[cs::cs] 1299 # update start & stop bounds 1300 tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs] 1301 tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs] 1302 # update median bounds 1303 smedian = ssorted[cs // 2::cs] 1304 tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian 1305 tmp.mranges[nslice] = smedian[ncs // 2] 1306 1307 def reorder_slices(self, tmp): 1308 """Reorder completely the index at slice level. 1309 1310 This method has to maintain the locality of elements in the 1311 ambit of ``blocks``, i.e. an element of a ``block`` cannot be 1312 sent to another ``block`` during this reordering. This is 1313 *critical* for ``light`` indexes to be able to use this. 1314 1315 This version of reorder_slices is optimized in that *two* 1316 complete slices are taken at a time (including the last row 1317 slice) so as to sort them. Then, each new slice that is read is 1318 put at the end of this two-slice buffer, while the previous one 1319 is moved to the beginning of the buffer. This is in order to 1320 better reduce the entropy of the regular part (i.e. all except 1321 the last row) of the index. 
1322 1323 A secondary effect of this is that it takes at least *twice* of 1324 memory than a previous version of reorder_slices() that only 1325 reorders on a slice-by-slice basis. However, as this is more 1326 efficient than the old version, one can configure the slicesize 1327 to be smaller, so the memory consumption is barely similar. 1328 1329 """ 1330 1331 tmp = self.tmp 1332 sorted = tmp.sorted 1333 indices = tmp.indices 1334 if tmp: 1335 tmp_sorted = tmp.sorted2 1336 tmp_indices = tmp.indices2 1337 else: 1338 tmp_sorted = tmp.sorted 1339 tmp_indices = tmp.indices 1340 cs = self.chunksize 1341 ss = self.slicesize 1342 nsb = self.blocksize // self.slicesize 1343 nslices = self.nslices 1344 nblocks = self.nblocks 1345 nelementsLR = self.nelementsILR 1346 # Create the buffer for reordering 2 slices at a time 1347 ssorted = numpy.empty(shape=ss * 2, dtype=self.dtype) 1348 sindices = numpy.empty(shape=ss * 2, 1349 dtype=numpy.dtype('u%d' % self.indsize)) 1350 1351 if self.indsize == 8: 1352 # Bootstrap the process for reordering 1353 # Read the first slice in buffers 1354 self.read_slice(tmp_sorted, 0, ssorted[:ss]) 1355 self.read_slice(tmp_indices, 0, sindices[:ss]) 1356 1357 nslice = 0 # Just in case the loop behind executes nothing 1358 # Loop over the remainding slices in block 1359 for nslice in range(1, sorted.nrows): 1360 self.reorder_slice(nslice, sorted, indices, 1361 ssorted, sindices, 1362 tmp_sorted, tmp_indices) 1363 1364 # End the process (enrolling the lastrow if necessary) 1365 if nelementsLR > 0: 1366 sortedLR = self.tmp.sortedLR 1367 indicesLR = self.tmp.indicesLR 1368 # Shrink the ssorted and sindices arrays to the minimum 1369 ssorted2 = ssorted[:ss + nelementsLR] 1370 sortedlr = ssorted2[ss:] 1371 sindices2 = sindices[:ss + nelementsLR] 1372 indiceslr = sindices2[ss:] 1373 # Read the last row info in the second part of the buffer 1374 self.read_slice_lr(sortedLR, sortedlr) 1375 self.read_slice_lr(indicesLR, indiceslr) 1376 indexesextension.keysort(ssorted2, sindices2) 1377 # Write the second part of the buffers to the lastrow indices 1378 self.write_sliceLR(sortedLR, sortedlr) 1379 self.write_sliceLR(indicesLR, indiceslr) 1380 # Update the caches for last row 1381 bebounds = numpy.concatenate((sortedlr[::cs], [sortedlr[-1]])) 1382 sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds 1383 self.bebounds = bebounds 1384 # Write the first part of the buffers to the regular leaves 1385 self.write_slice(sorted, nslice, ssorted[:ss]) 1386 self.write_slice(indices, nslice, sindices[:ss]) 1387 # Update caches for this slice 1388 self.update_caches(nslice, ssorted[:ss]) 1389 else: 1390 # Iterate over each block. No data should cross block 1391 # boundaries to avoid adressing problems with short indices. 
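            # ("Short" indices here are the 1- and 2-byte kinds: their bucket
            # numbers can only address positions inside a single slice or
            # block, so letting rows migrate across block boundaries would
            # make those bucket numbers ambiguous.)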
1392 for nb in range(nblocks): 1393 # Bootstrap the process for reordering 1394 # Read the first slice in buffers 1395 nrow = nb * nsb 1396 self.read_slice(tmp_sorted, nrow, ssorted[:ss]) 1397 self.read_slice(tmp_indices, nrow, sindices[:ss]) 1398 1399 # Loop over the remainding slices in block 1400 lrb = nrow + nsb 1401 if lrb > nslices: 1402 lrb = nslices 1403 nslice = nrow # Just in case the loop behind executes nothing 1404 for nslice in range(nrow + 1, lrb): 1405 self.reorder_slice(nslice, sorted, indices, 1406 ssorted, sindices, 1407 tmp_sorted, tmp_indices) 1408 1409 # Write the first part of the buffers to the regular leaves 1410 self.write_slice(sorted, nslice, ssorted[:ss]) 1411 self.write_slice(indices, nslice, sindices[:ss]) 1412 # Update caches for this slice 1413 self.update_caches(nslice, ssorted[:ss]) 1414 1415 def swap_slices(self, mode="median"): 1416 """Swap slices in a superblock.""" 1417 1418 tmp = self.tmp 1419 sorted = tmp.sorted 1420 indices = tmp.indices 1421 tmp_sorted = tmp.sorted2 1422 tmp_indices = tmp.indices2 1423 ncs = self.nchunkslice 1424 nss = self.superblocksize // self.slicesize 1425 nss2 = nss 1426 for sblock in range(self.nsuperblocks): 1427 # Protection for last superblock having less slices than nss 1428 remainingslices = self.nslices - sblock * nss 1429 if remainingslices < nss: 1430 nss2 = remainingslices 1431 if nss2 <= 1: 1432 break 1433 if mode == "start": 1434 ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0] 1435 elif mode == "stop": 1436 ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1] 1437 elif mode == "median": 1438 ranges = tmp.mranges[sblock * nss:sblock * nss + nss2] 1439 sranges_idx = ranges.argsort(kind=defsort) 1440 # Don't swap the superblock at all if one doesn't need to 1441 ndiff = (sranges_idx != numpy.arange(nss2)).sum() / 2 1442 if ndiff * 50 < nss2: 1443 # The number of slices to rearrange is less than 2.5%, 1444 # so skip the reordering of this superblock 1445 # (too expensive for such a little improvement) 1446 if self.verbose: 1447 print("skipping reordering of superblock ->", sblock) 1448 continue 1449 ns = sblock * nss2 1450 # Swap sorted and indices slices following the new order 1451 for i in range(nss2): 1452 idx = sranges_idx[i] 1453 # Swap sorted & indices slices 1454 oi = ns + i 1455 oidx = ns + idx 1456 tmp_sorted[oi] = sorted[oidx] 1457 tmp_indices[oi] = indices[oidx] 1458 # Swap start, stop & median ranges 1459 tmp.ranges2[oi] = tmp.ranges[oidx] 1460 tmp.mranges2[oi] = tmp.mranges[oidx] 1461 # Swap chunk bounds 1462 tmp.bounds2[oi] = tmp.bounds[oidx] 1463 # Swap start, stop & median bounds 1464 j = oi * ncs 1465 jn = (oi + 1) * ncs 1466 xj = oidx * ncs 1467 xjn = (oidx + 1) * ncs 1468 tmp.abounds2[j:jn] = tmp.abounds[xj:xjn] 1469 tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn] 1470 tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn] 1471 # tmp -> originals 1472 for i in range(nss2): 1473 # Copy sorted & indices slices 1474 oi = ns + i 1475 sorted[oi] = tmp_sorted[oi] 1476 indices[oi] = tmp_indices[oi] 1477 # Copy start, stop & median ranges 1478 tmp.ranges[oi] = tmp.ranges2[oi] 1479 tmp.mranges[oi] = tmp.mranges2[oi] 1480 # Copy chunk bounds 1481 tmp.bounds[oi] = tmp.bounds2[oi] 1482 # Copy start, stop & median bounds 1483 j = oi * ncs 1484 jn = (oi + 1) * ncs 1485 tmp.abounds[j:jn] = tmp.abounds2[j:jn] 1486 tmp.zbounds[j:jn] = tmp.zbounds2[j:jn] 1487 tmp.mbounds[j:jn] = tmp.mbounds2[j:jn] 1488 1489 def search_item_lt(self, where, item, nslice, limits, start=0): 1490 """Search a single item in a specific sorted 
slice.""" 1491 1492 # This method will only works under the assumtion that item 1493 # *is to be found* in the nslice. 1494 assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1]) 1495 cs = self.chunksize 1496 ss = self.slicesize 1497 nelementsLR = self.nelementsILR 1498 bstart = start // cs 1499 1500 # Find the chunk 1501 if nslice < self.nslices: 1502 nchunk = bisect_left(where.bounds[nslice], item, bstart) 1503 else: 1504 # We need to subtract 1 chunk here because bebounds 1505 # has a leading value 1506 nchunk = bisect_left(self.bebounds, item, bstart) - 1 1507 assert nchunk >= 0 1508 1509 # Find the element in chunk 1510 pos = nchunk * cs 1511 if nslice < self.nslices: 1512 pos += bisect_left(where.sorted[nslice, pos:pos + cs], item) 1513 assert pos <= ss 1514 else: 1515 end = pos + cs 1516 if end > nelementsLR: 1517 end = nelementsLR 1518 pos += bisect_left(self.sortedLR[pos:end], item) 1519 assert pos <= nelementsLR 1520 assert pos > 0 1521 return pos 1522 1523 def compute_overlaps_finegrain(self, where, message, verbose): 1524 """Compute some statistics about overlaping of slices in index. 1525 1526 It returns the following info: 1527 1528 noverlaps : int 1529 The total number of elements that overlaps in index. 1530 multiplicity : array of int 1531 The number of times that a concrete slice overlaps with any other. 1532 toverlap : float 1533 An ovelap index: the sum of the values in segment slices that 1534 overlaps divided by the entire range of values. This index is only 1535 computed for numerical types. 1536 1537 """ 1538 1539 ss = self.slicesize 1540 ranges = where.ranges[:] 1541 sorted = where.sorted 1542 sortedLR = where.sortedLR 1543 nslices = self.nslices 1544 nelementsLR = self.nelementsILR 1545 if nelementsLR > 0: 1546 # Add the ranges corresponding to the last row 1547 rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]]) 1548 ranges = numpy.concatenate((ranges, [rangeslr])) 1549 nslices += 1 1550 soverlap = 0. 1551 toverlap = -1. 1552 multiplicity = numpy.zeros(shape=nslices, dtype="int_") 1553 overlaps = multiplicity.copy() 1554 starts = multiplicity.copy() 1555 for i in range(nslices): 1556 prev_end = ranges[i, 1] 1557 for j in range(i + 1, nslices): 1558 stj = starts[j] 1559 assert stj <= ss 1560 if stj == ss: 1561 # This slice has already been counted 1562 continue 1563 if j < self.nslices: 1564 next_beg = sorted[j, stj] 1565 else: 1566 next_beg = sortedLR[stj] 1567 next_end = ranges[j, 1] 1568 if prev_end > next_end: 1569 # Complete overlapping case 1570 multiplicity[j - i] += 1 1571 if j < self.nslices: 1572 overlaps[i] += ss - stj 1573 starts[j] = ss # a sentinel 1574 else: 1575 overlaps[i] += nelementsLR - stj 1576 starts[j] = nelementsLR # a sentinel 1577 elif prev_end > next_beg: 1578 multiplicity[j - i] += 1 1579 idx = self.search_item_lt( 1580 where, prev_end, j, ranges[j], stj) 1581 nelem = idx - stj 1582 overlaps[i] += nelem 1583 starts[j] = idx 1584 if self.type != "string": 1585 # Convert ranges into floats in order to allow 1586 # doing operations with them without overflows 1587 soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) 1588 1589 # Return the overlap as the ratio between overlaps and entire range 1590 if self.type != "string": 1591 erange = float(ranges[-1, 1]) - float(ranges[0, 0]) 1592 # Check that there is an effective range of values 1593 # Beware, erange can be negative in situations where 1594 # the values are suffering overflow. 
This can happen 1595 # specially on big signed integer values (on overflows, 1596 # the end value will become negative!). 1597 # Also, there is no way to compute overlap ratios for 1598 # non-numerical types. So, be careful and always check 1599 # that toverlap has a positive value (it must have been 1600 # initialized to -1. before) before using it. 1601 # F. Alted 2007-01-19 1602 if erange > 0: 1603 toverlap = soverlap / erange 1604 if verbose and message != "init": 1605 print("toverlap (%s):" % message, toverlap) 1606 print("multiplicity:\n", multiplicity, multiplicity.sum()) 1607 print("overlaps:\n", overlaps, overlaps.sum()) 1608 noverlaps = overlaps.sum() 1609 # For full indexes, set the 'is_csi' flag 1610 if self.indsize == 8 and self._v_file._iswritable(): 1611 self._v_attrs.is_csi = (noverlaps == 0) 1612 # Save the number of overlaps for future references 1613 self.noverlaps = noverlaps 1614 return (noverlaps, multiplicity, toverlap) 1615 1616 def compute_overlaps(self, where, message, verbose): 1617 """Compute some statistics about overlaping of slices in index. 1618 1619 It returns the following info: 1620 1621 noverlaps : int 1622 The total number of slices that overlaps in index. 1623 multiplicity : array of int 1624 The number of times that a concrete slice overlaps with any other. 1625 toverlap : float 1626 An ovelap index: the sum of the values in segment slices that 1627 overlaps divided by the entire range of values. This index is only 1628 computed for numerical types. 1629 1630 """ 1631 1632 ranges = where.ranges[:] 1633 nslices = self.nslices 1634 if self.nelementsILR > 0: 1635 # Add the ranges corresponding to the last row 1636 rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]]) 1637 ranges = numpy.concatenate((ranges, [rangeslr])) 1638 nslices += 1 1639 noverlaps = 0 1640 soverlap = 0. 1641 toverlap = -1. 1642 multiplicity = numpy.zeros(shape=nslices, dtype="int_") 1643 for i in range(nslices): 1644 for j in range(i + 1, nslices): 1645 if ranges[i, 1] > ranges[j, 0]: 1646 noverlaps += 1 1647 multiplicity[j - i] += 1 1648 if self.type != "string": 1649 # Convert ranges into floats in order to allow 1650 # doing operations with them without overflows 1651 soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) 1652 1653 # Return the overlap as the ratio between overlaps and entire range 1654 if self.type != "string": 1655 erange = float(ranges[-1, 1]) - float(ranges[0, 0]) 1656 # Check that there is an effective range of values 1657 # Beware, erange can be negative in situations where 1658 # the values are suffering overflow. This can happen 1659 # specially on big signed integer values (on overflows, 1660 # the end value will become negative!). 1661 # Also, there is no way to compute overlap ratios for 1662 # non-numerical types. So, be careful and always check 1663 # that toverlap has a positive value (it must have been 1664 # initialized to -1. before) before using it. 1665 # F. 
Altet 2007-01-19 1666 if erange > 0: 1667 toverlap = soverlap / erange 1668 if verbose: 1669 print("overlaps (%s):" % message, noverlaps, toverlap) 1670 print(multiplicity) 1671 # For full indexes, set the 'is_csi' flag 1672 if self.indsize == 8 and self._v_file._iswritable(): 1673 self._v_attrs.is_csi = (noverlaps == 0) 1674 # Save the number of overlaps for future references 1675 self.noverlaps = noverlaps 1676 return (noverlaps, multiplicity, toverlap) 1677 1678 def read_sorted_indices(self, what, start, stop, step): 1679 """Return the sorted or indices values in the specified range.""" 1680 (start, stop, step) = self._process_range(start, stop, step) 1681 if start >= stop: 1682 return numpy.empty(0, self.dtype) 1683 # Correction for negative values of step (reverse indices) 1684 if step < 0: 1685 tmp = start 1686 start = self.nelements - stop 1687 stop = self.nelements - tmp 1688 if what == "sorted": 1689 values = self.sorted 1690 valuesLR = self.sortedLR 1691 buffer_ = numpy.empty(stop - start, dtype=self.dtype) 1692 else: 1693 values = self.indices 1694 valuesLR = self.indicesLR 1695 buffer_ = numpy.empty(stop - start, dtype="u%d" % self.indsize) 1696 ss = self.slicesize 1697 nrow_start = start // ss 1698 istart = start % ss 1699 nrow_stop = stop // ss 1700 tlen = stop - start 1701 bstart = 0 1702 ilen = 0 1703 for nrow in range(nrow_start, nrow_stop + 1): 1704 blen = ss - istart 1705 if ilen + blen > tlen: 1706 blen = tlen - ilen 1707 if blen <= 0: 1708 break 1709 if nrow < self.nslices: 1710 self.read_slice( 1711 values, nrow, buffer_[bstart:bstart + blen], istart) 1712 else: 1713 self.read_slice_lr( 1714 valuesLR, buffer_[bstart:bstart + blen], istart) 1715 istart = 0 1716 bstart += blen 1717 ilen += blen 1718 return buffer_[::step] 1719 1720 def read_sorted(self, start=None, stop=None, step=None): 1721 """Return the sorted values of index in the specified range. 1722 1723 The meaning of the start, stop and step arguments is the same as in 1724 :meth:`Table.read_sorted`. 1725 1726 """ 1727 1728 return self.read_sorted_indices('sorted', start, stop, step) 1729 1730 1731 def read_indices(self, start=None, stop=None, step=None): 1732 """Return the indices values of index in the specified range. 1733 1734 The meaning of the start, stop and step arguments is the same as in 1735 :meth:`Table.read_sorted`. 1736 1737 """ 1738 1739 return self.read_sorted_indices('indices', start, stop, step) 1740 1741 1742 def _process_range(self, start, stop, step): 1743 """Get a range specifc for the index usage.""" 1744 1745 if start is not None and stop is None: 1746 # Special case for the behaviour of PyTables iterators 1747 stop = idx2long(start + 1) 1748 if start is None: 1749 start = 0 1750 else: 1751 start = idx2long(start) 1752 if stop is None: 1753 stop = idx2long(self.nelements) 1754 else: 1755 stop = idx2long(stop) 1756 if step is None: 1757 step = 1 1758 else: 1759 step = idx2long(step) 1760 return (start, stop, step) 1761 1762 1763 def __getitem__(self, key): 1764 """Return the indices values of index in the specified range. 1765 1766 If key argument is an integer, the corresponding index is returned. If 1767 key is a slice, the range of indices determined by it is returned. A 1768 negative value of step in slice is supported, meaning that the results 1769 will be returned in reverse order. 1770 1771 This method is equivalent to :meth:`Index.read_indices`. 
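
        For instance, the following sketch (assuming ``index`` is the Index
        instance of an indexed column, e.g. ``table.cols.x.index`` for a
        hypothetical column ``x``) retrieves row numbers stored in the
        index::

            row = index[0]       # a single position gives one row number
            rows = index[2:10]   # a slice gives an array of row numbers
            last = index[-1]     # negative positions count from the end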

        """

        if is_idx(key):
            key = operator.index(key)

            if key < 0:
                # To support negative values
                key += self.nelements
            return self.read_indices(key, key + 1, 1)[0]
        elif isinstance(key, slice):
            return self.read_indices(key.start, key.stop, key.step)

    def __len__(self):
        return self.nelements

    def restorecache(self):
        "Clean the limits cache and resize the starts and lengths arrays."

        params = self._v_file.params
        # The sorted IndexArray is absolutely required to be in memory
        # at the same time as the Index instance, so create a strong
        # reference to it.  We are not introducing leaks because the
        # strong reference will disappear when this Index instance is
        # closed.
        self._sorted = self.sorted
        self.sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'],
                                              params['BOUNDS_MAX_SIZE'],
                                              'non-opt types bounds')
        """A cache for the bounds (2nd hash) data.  Only used for
        searches on non-optimized types."""
        self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'],
                                          params['LIMBOUNDS_MAX_SIZE'],
                                          'bounding limits')
        """A cache for bounding limits."""
        self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'],
                                         params['SORTEDLR_MAX_SIZE'],
                                         'last row chunks')
        """A cache for the last row chunks.  Only used for searches in
        the last row, and mainly useful for small indexes."""
        self.starts = numpy.empty(shape=self.nrows, dtype=numpy.int32)
        self.lengths = numpy.empty(shape=self.nrows, dtype=numpy.int32)
        self.sorted._init_sorted_slice(self)
        self.dirtycache = False

    def search(self, item):
        """Do a binary search in this index for an item."""

        if profile:
            tref = time()
            show_stats("Entering search", tref)

        if self.dirtycache:
            self.restorecache()

        # An empty item, or a left limit larger than the right one, can
        # never match any record, so we avoid further computation
        # (including looking up the limits cache).
        if not item or item[0] > item[1]:
            self.starts[:] = 0
            self.lengths[:] = 0
            return 0

        tlen = 0
        # Check whether the item tuple is in the limits cache or not
        nslot = self.limboundscache.getslot(item)
        if nslot >= 0:
            startlengths = self.limboundscache.getitem(nslot)
            # Reset the lengths array (not necessary for starts)
            self.lengths[:] = 0
            # Now, set the interesting rows
            for (nrow, start, length) in startlengths:
                self.starts[nrow] = start
                self.lengths[nrow] = length
                tlen += length
            return tlen
        # The item is not in cache.  Do the real lookup.
        sorted = self.sorted
        if self.nslices > 0:
            if self.type in self.opt_search_types:
                # The branches below are optimizations.  However, they hide
                # the CPU consumption of these functions from Python
                # profiles.  You may want to deactivate them while profiling.
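                # Each branch dispatches to a type-specialized binary search
                # implemented in Cython (see the indexesextension module);
                # all of them return the number of hits in the regular slices.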
                if self.type == "int32":
                    tlen = sorted._search_bin_na_i(*item)
                elif self.type == "int64":
                    tlen = sorted._search_bin_na_ll(*item)
                elif self.type == "float16":
                    tlen = sorted._search_bin_na_e(*item)
                elif self.type == "float32":
                    tlen = sorted._search_bin_na_f(*item)
                elif self.type == "float64":
                    tlen = sorted._search_bin_na_d(*item)
                elif self.type == "float96":
                    tlen = sorted._search_bin_na_g(*item)
                elif self.type == "float128":
                    tlen = sorted._search_bin_na_g(*item)
                elif self.type == "uint32":
                    tlen = sorted._search_bin_na_ui(*item)
                elif self.type == "uint64":
                    tlen = sorted._search_bin_na_ull(*item)
                elif self.type == "int8":
                    tlen = sorted._search_bin_na_b(*item)
                elif self.type == "int16":
                    tlen = sorted._search_bin_na_s(*item)
                elif self.type == "uint8":
                    tlen = sorted._search_bin_na_ub(*item)
                elif self.type == "uint16":
                    tlen = sorted._search_bin_na_us(*item)
                else:
                    assert False, "This can't happen!"
            else:
                tlen = self.search_scalar(item, sorted)
        # Get possible remaining values in the last row
        if self.nelementsSLR > 0:
            # Look for more hits in the last row
            (start, stop) = self.search_last_row(item)
            self.starts[-1] = start
            self.lengths[-1] = stop - start
            tlen += stop - start

        if self.limboundscache.couldenablecache():
            # Get a startlengths tuple and save it in cache.
            # This is quite slow, but it is a good way to compress
            # the bounds info.  Moreover, .couldenablecache() does a
            # good job of avoiding this computation when it is not
            # really necessary.
            startlengths = []
            for nrow, length in enumerate(self.lengths):
                if length > 0:
                    startlengths.append((nrow, self.starts[nrow], length))
            # Compute the size of the recarray (approximately).
            # The +1 at the end is important to avoid 0 lengths
            # (remember, the object headers take some space).
            size = len(startlengths) * 8 * 2 + 1
            # Put this startlengths list in cache
            self.limboundscache.setitem(item, startlengths, size)

        if profile:
            show_stats("Exiting search", tref)
        return tlen

    # This is a scalar version of search.  It works with strings as well.
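    # It is used as a fallback when no type-specialized search is available:
    # it loops over the slices in Python, calling the generic _search_bin()
    # of the sorted IndexArray once per slice.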
    def search_scalar(self, item, sorted):
        """Do a binary search in this index for an item."""

        tlen = 0
        # Do the lookup for values fulfilling the conditions
        for i in range(self.nslices):
            (start, stop) = sorted._search_bin(i, item)
            self.starts[i] = start
            self.lengths[i] = stop - start
            tlen += stop - start
        return tlen

    def search_last_row(self, item):
        # Variable initialization
        item1, item2 = item
        bebounds = self.bebounds
        b0, b1 = bebounds[0], bebounds[-1]
        bounds = bebounds[1:-1]
        itemsize = self.dtype.itemsize
        sortedLRcache = self.sortedLRcache
        hi = self.nelementsSLR               # maximum number of elements
        rchunksize = self.chunksize // self.reduction

        nchunk = -1
        # Look up item1
        if nan_aware_gt(item1, b0):
            if nan_aware_le(item1, b1):
                # Search for the appropriate chunk in the bounds cache
                nchunk = bisect_left(bounds, item1)
                # Look up this chunk in the cache
                nslot = sortedLRcache.getslot(nchunk)
                if nslot >= 0:
                    chunk = sortedLRcache.getitem(nslot)
                else:
                    begin = rchunksize * nchunk
                    end = rchunksize * (nchunk + 1)
                    if end > hi:
                        end = hi
                    # Read the chunk from disk
                    chunk = self.sortedLR._read_sorted_slice(
                        self.sorted, begin, end)
                    # Put it in the cache.  It is important to *copy*
                    # the buffer, as it is reused in future reads!
                    sortedLRcache.setitem(nchunk, chunk.copy(),
                                          (end - begin) * itemsize)
                start = bisect_left(chunk, item1)
                start += rchunksize * nchunk
            else:
                start = hi
        else:
            start = 0
        # Look up item2
        if nan_aware_ge(item2, b0):
            if nan_aware_lt(item2, b1):
                # Search for the appropriate chunk in the bounds cache
                nchunk2 = bisect_right(bounds, item2)
                if nchunk2 != nchunk:
                    # Look up this chunk in the cache
                    nslot = sortedLRcache.getslot(nchunk2)
                    if nslot >= 0:
                        chunk = sortedLRcache.getitem(nslot)
                    else:
                        begin = rchunksize * nchunk2
                        end = rchunksize * (nchunk2 + 1)
                        if end > hi:
                            end = hi
                        # Read the chunk from disk
                        chunk = self.sortedLR._read_sorted_slice(
                            self.sorted, begin, end)
                        # Put it in the cache.  It is important to *copy*
                        # the buffer, as it is reused in future reads!
                        # See bug #60 in xot.carabos.com
                        sortedLRcache.setitem(nchunk2, chunk.copy(),
                                              (end - begin) * itemsize)
                stop = bisect_right(chunk, item2)
                stop += rchunksize * nchunk2
            else:
                stop = hi
        else:
            stop = 0
        return (start, stop)

    def get_chunkmap(self):
        """Compute a map with the interesting chunks in index."""

        if profile:
            tref = time()
            show_stats("Entering get_chunkmap", tref)
        ss = self.slicesize
        nsb = self.nslicesblock
        nslices = self.nslices
        lbucket = self.lbucket
        indsize = self.indsize
        bucketsinblock = float(self.blocksize) / lbucket
        nchunks = int(math.ceil(float(self.nelements) / lbucket))
        chunkmap = numpy.zeros(shape=nchunks, dtype="bool")
        reduction = self.reduction
        starts = (self.starts - 1) * reduction + 1
        stops = (self.starts + self.lengths) * reduction
        starts[starts < 0] = 0    # All negative values are set to zero
        indices = self.indices
        for nslice in range(self.nrows):
            start = starts[nslice]
            stop = stops[nslice]
            if stop > start:
                idx = numpy.empty(shape=stop - start, dtype='u%d' % indsize)
                if nslice < nslices:
                    indices._read_index_slice(nslice, start, stop, idx)
                else:
                    self.indicesLR._read_index_slice(start, stop, idx)
                if indsize == 8:
                    idx //= lbucket
                elif indsize == 2:
                    # The chunkmap size can never be larger than 'int_'
                    idx = idx.astype("int_")
                    offset = int((nslice // nsb) * bucketsinblock)
                    idx += offset
                elif indsize == 1:
                    # The chunkmap size can never be larger than 'int_'
                    idx = idx.astype("int_")
                    offset = (nslice * ss) // lbucket
                    idx += offset
                chunkmap[idx] = True
        # The case lbucket < nrowsinchunk should only happen in tests
        nrowsinchunk = self.nrowsinchunk
        if lbucket != nrowsinchunk:
            # Map the 'coarse grain' chunkmap onto the 'true' chunkmap
            nelements = self.nelements
            tnchunks = int(math.ceil(float(nelements) / nrowsinchunk))
            tchunkmap = numpy.zeros(shape=tnchunks, dtype="bool")
            ratio = float(lbucket) / nrowsinchunk
            idx = chunkmap.nonzero()[0]
            starts = (idx * ratio).astype('int_')
            stops = numpy.ceil((idx + 1) * ratio).astype('int_')
            for i in range(len(idx)):
                tchunkmap[starts[i]:stops[i]] = True
            chunkmap = tchunkmap
        if profile:
            show_stats("Exiting get_chunkmap", tref)
        return chunkmap

    def get_lookup_range(self, ops, limits):
        assert len(ops) in [1, 2]
        assert len(limits) in [1, 2]
        assert len(ops) == len(limits)

        column = self.column
        coldtype = column.dtype.base
        itemsize = coldtype.itemsize

        if len(limits) == 1:
            assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt']
            limit = limits[0]
            op = ops[0]
            if op == 'lt':
                range_ = (inftype(coldtype, itemsize, sign=-1),
                          nextafter(limit, -1, coldtype, itemsize))
            elif op == 'le':
                range_ = (inftype(coldtype, itemsize, sign=-1),
                          limit)
            elif op == 'gt':
                range_ = (nextafter(limit, +1, coldtype, itemsize),
                          inftype(coldtype, itemsize, sign=+1))
            elif op == 'ge':
                range_ = (limit,
                          inftype(coldtype, itemsize, sign=+1))
            elif op == 'eq':
                range_ = (limit, limit)

        elif len(limits) == 2:
            assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le')

            lower, upper = limits
            if lower > upper:
                # ``a <[=] x <[=] b`` is always false if ``a > b``.
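                # Signal the impossible interval with an empty range.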
2098 return () 2099 2100 if ops == ('gt', 'lt'): # lower < col < upper 2101 range_ = (nextafter(lower, +1, coldtype, itemsize), 2102 nextafter(upper, -1, coldtype, itemsize)) 2103 elif ops == ('ge', 'lt'): # lower <= col < upper 2104 range_ = (lower, nextafter(upper, -1, coldtype, itemsize)) 2105 elif ops == ('gt', 'le'): # lower < col <= upper 2106 range_ = (nextafter(lower, +1, coldtype, itemsize), upper) 2107 elif ops == ('ge', 'le'): # lower <= col <= upper 2108 range_ = (lower, upper) 2109 2110 return range_ 2111 2112 2113 def _f_remove(self, recursive=False): 2114 """Remove this Index object.""" 2115 2116 # Index removal is always recursive, 2117 # no matter what `recursive` says. 2118 super(Index, self)._f_remove(True) 2119 2120 def __str__(self): 2121 """This provides a more compact representation than __repr__""" 2122 2123 # The filters 2124 filters = "" 2125 if self.filters.complevel: 2126 if self.filters.shuffle: 2127 filters += ", shuffle" 2128 if self.filters.bitshuffle: 2129 filters += ", bitshuffle" 2130 filters += ", %s(%s)" % (self.filters.complib, 2131 self.filters.complevel) 2132 return "Index(%s, %s%s).is_csi=%s" % \ 2133 (self.optlevel, self.kind, filters, self.is_csi) 2134 2135 def __repr__(self): 2136 """This provides more metainfo than standard __repr__""" 2137 2138 cpathname = self.table._v_pathname + ".cols." + self.column.pathname 2139 retstr = """%s (Index for column %s) 2140 optlevel := %s 2141 kind := %s 2142 filters := %s 2143 is_csi := %s 2144 nelements := %s 2145 chunksize := %s 2146 slicesize := %s 2147 blocksize := %s 2148 superblocksize := %s 2149 dirty := %s 2150 byteorder := %r""" % (self._v_pathname, cpathname, 2151 self.optlevel, self.kind, 2152 self.filters, self.is_csi, 2153 self.nelements, 2154 self.chunksize, self.slicesize, 2155 self.blocksize, self.superblocksize, 2156 self.dirty, self.byteorder) 2157 retstr += "\n sorted := %s" % self.sorted 2158 retstr += "\n indices := %s" % self.indices 2159 retstr += "\n ranges := %s" % self.ranges 2160 retstr += "\n bounds := %s" % self.bounds 2161 retstr += "\n sortedLR := %s" % self.sortedLR 2162 retstr += "\n indicesLR := %s" % self.indicesLR 2163 return retstr 2164 2165 2166class IndexesDescG(NotLoggedMixin, Group): 2167 _c_classid = 'DINDEX' 2168 2169 2170 def _g_width_warning(self): 2171 warnings.warn( 2172 "the number of indexed columns on a single description group " 2173 "is exceeding the recommended maximum (%d); " 2174 "be ready to see PyTables asking for *lots* of memory " 2175 "and possibly slow I/O" % self._v_max_group_width, 2176 PerformanceWarning) 2177 2178 2179 2180class IndexesTableG(NotLoggedMixin, Group): 2181 _c_classid = 'TINDEX' 2182 2183 2184 @property 2185 def auto(self): 2186 if 'AUTO_INDEX' not in self._v_attrs: 2187 return default_auto_index 2188 return self._v_attrs.AUTO_INDEX 2189 2190 @auto.setter 2191 def auto(self, auto): 2192 self._v_attrs.AUTO_INDEX = bool(auto) 2193 2194 @auto.deleter 2195 def auto(self): 2196 del self._v_attrs.AUTO_INDEX 2197 2198 def _g_width_warning(self): 2199 warnings.warn( 2200 "the number of indexed columns on a single table " 2201 "is exceeding the recommended maximum (%d); " 2202 "be ready to see PyTables asking for *lots* of memory " 2203 "and possibly slow I/O" % self._v_max_group_width, 2204 PerformanceWarning) 2205 2206 2207 def _g_check_name(self, name): 2208 if not name.startswith('_i_'): 2209 raise ValueError( 2210 "names of index groups must start with ``_i_``: %s" % name) 2211 2212 2213 @property 2214 def table(self): 2215 "Accessor 
for the `Table` object of this `IndexesTableG` container." 2216 names = self._v_pathname.split("/") 2217 tablename = names.pop()[3:] # "_i_" is at the beginning 2218 parentpathname = "/".join(names) 2219 tablepathname = join_path(parentpathname, tablename) 2220 table = self._v_file._get_node(tablepathname) 2221 return table 2222 2223 2224class OldIndex(NotLoggedMixin, Group): 2225 """This is meant to hide indexes of PyTables 1.x files.""" 2226 2227 _c_classid = 'CINDEX' 2228 2229 2230 2231## Local Variables: 2232## mode: python 2233## py-indent-offset: 4 2234## tab-width: 4 2235## fill-column: 72 2236## End: 2237