1# -*- coding: utf-8 -*-
2
3#######################################################################
4#
5# License: BSD
6# Created: June 08, 2004
7# Author: Francesc Alted - faltet@pytables.com
8#
9# $Id$
10#
11########################################################################
12
13"""Here is defined the Index class."""
14
15
16import math
17import operator
18import os
19import os.path
20import sys
21import tempfile
22import warnings
23
24from time import time, perf_counter
25
26import numpy
27
from .idxutils import (calc_chunksize, calcoptlevels,
                       get_reduction_level, nextafter, inftype)
30
31from . import indexesextension
32from .node import NotLoggedMixin
33from .atom import UIntAtom, Atom
34from .earray import EArray
35from .carray import CArray
36from .leaf import Filters
37from .indexes import CacheArray, LastRowArray, IndexArray
38from .group import Group
39from .path import join_path
40from .exceptions import PerformanceWarning
41from .utils import is_idx, idx2long, lazyattr
from .utilsextension import (nan_aware_gt, nan_aware_ge,
                             nan_aware_lt, nan_aware_le,
                             bisect_left, bisect_right)
45from .lrucacheextension import ObjectCache
46
47
49# default version for INDEX objects
50# obversion = "1.0"    # Version of indexes in PyTables 1.x series
51# obversion = "2.0"    # Version of indexes in PyTables Pro 2.0 series
52obversion = "2.1"     # Version of indexes in PyTables Pro 2.1 and up series,
53                      # including the join 2.3 Std + Pro version
54
55
56debug = False
# debug = True  # Uncomment this to print size information
58profile = False
59# profile = True  # Uncomment for profiling
60if profile:
61    from .utils import show_stats
62
63
64# The default method for sorting
65# defsort = "quicksort"
66# Changing to mergesort to fix #441
67defsort = "mergesort"
68
69# Default policy for automatically updating indexes after a table
70# append operation, or automatically reindexing after an
71# index-invalidating operation like removing or modifying table rows.
72default_auto_index = True
73# Keep in sync with ``Table.autoindex`` docstring.
74
75# Default filters used to compress indexes.  This is quite fast and
76# compression is pretty good.
77# Remember to keep these defaults in sync with the docstrings and UG.
78default_index_filters = Filters(complevel=1, complib='zlib',
79                                shuffle=True, fletcher32=False)
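
# For reference, user code can override these defaults per index when
# creating it through ``Column.create_index()``; a minimal sketch
# (``mycol`` is a hypothetical column name):
#
#   table.cols.mycol.create_index(
#       filters=Filters(complevel=5, complib='blosc'))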
80
81# Deprecated API
82defaultAutoIndex = default_auto_index
83defaultIndexFilters = default_index_filters
84
# The list of types for which an optimized search in Cython and C has
# been implemented. Always add the name of any newly optimized type here.
87opt_search_types = ("int8", "int16", "int32", "int64",
88                    "uint8", "uint16", "uint32", "uint64",
89                    "float32", "float64")
90
# The exclusive upper limit for uint32 integers
92max32 = 2**32
93
94
95def _table_column_pathname_of_index(indexpathname):
96    names = indexpathname.split("/")
97    for i, name in enumerate(names):
98        if name.startswith('_i_'):
99            break
100    tablepathname = "/".join(names[:i]) + "/" + name[3:]
101    colpathname = "/".join(names[i + 1:])
102    return (tablepathname, colpathname)
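
# A sketch of the pathname layout inverted above (hypothetical paths):
# the index group for column 'mycol' of table '/group/mytable' is stored
# under '/group/_i_mytable', so:
#
#   >>> _table_column_pathname_of_index('/group/_i_mytable/mycol')
#   ('/group/mytable', 'mycol')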
103
104
105class Index(NotLoggedMixin, Group, indexesextension.Index):
106    """Represents the index of a column in a table.
107
108    This class is used to keep the indexing information for columns in a Table
109    dataset (see :ref:`TableClassDescr`). It is actually a descendant of the
110    Group class (see :ref:`GroupClassDescr`), with some added functionality. An
111    Index is always associated with one and only one column in the table.
112
113    .. note::
114
115        This class is mainly intended for internal use, but some of its
116        documented attributes and methods may be interesting for the
117        programmer.
118
119    Parameters
120    ----------
121    parentnode
122        The parent :class:`Group` object.
123
124        .. versionchanged:: 3.0
125           Renamed from *parentNode* to *parentnode*.
126
127    name : str
128        The name of this node in its parent group.
129    atom : Atom
130        An Atom object representing the shape and type of the atomic objects to
131        be saved. Only scalar atoms are supported.
132    title
133        Sets a TITLE attribute of the Index entity.
134    kind
135        The desired kind for this index.  The 'full' kind specifies a complete
136        track of the row position (64-bit), while the 'medium', 'light' or
137        'ultralight' kinds only specify in which chunk the row is (using
138        32-bit, 16-bit and 8-bit respectively).
139    optlevel
140        The desired optimization level for this index.
141    filters : Filters
142        An instance of the Filters class that provides information about the
143        desired I/O filters to be applied during the life of this object.
144    tmp_dir
145        The directory for the temporary files.
146    expectedrows
        A user estimate of the number of row slices that will be
148        added to the growable dimension in the IndexArray object.
149    byteorder
150        The byteorder of the index datasets *on-disk*.
151    blocksizes
152        The four main sizes of the compound blocks in index datasets (a low
153        level parameter).
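
    Examples
    --------
    An Index is normally reached through its column rather than built
    directly; a minimal sketch (``mycol`` is a hypothetical column
    name)::

        idx = table.cols.mycol.index
        if idx is not None and not idx.dirty:
            print(idx.kind, idx.optlevel, idx.nelements)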
154
155    """
156
157    _c_classid = 'INDEX'
158
160    @property
161    def kind(self):
162        "The kind of this index."
163        return {1: 'ultralight', 2: 'light',
164                4: 'medium', 8: 'full'}[self.indsize]
165
166    @property
167    def filters(self):
168        """Filter properties for this index - see Filters in
169        :ref:`FiltersClassDescr`."""
170        return self._v_filters
171
172    @property
173    def dirty(self):
174       """Whether the index is dirty or not.
175       Dirty indexes are out of sync with column data, so they exist but they
176       are not usable.
177       """
178
179       # If there is no ``DIRTY`` attribute, index should be clean.
180       return getattr(self._v_attrs, 'DIRTY', False)
181
182    @dirty.setter
183    def dirty(self, dirty):
184        wasdirty, isdirty = self.dirty, bool(dirty)
185        self._v_attrs.DIRTY = dirty
186        # If an *actual* change in dirtiness happens,
187        # notify the condition cache by setting or removing a nail.
188        conditioncache = self.table._condition_cache
189        if not wasdirty and isdirty:
190            conditioncache.nail()
191        if wasdirty and not isdirty:
192            conditioncache.unnail()
193
194    @property
195    def column(self):
196        """The Column (see :ref:`ColumnClassDescr`) instance for the indexed
197        column."""
198
199        tablepath, columnpath = _table_column_pathname_of_index(
200            self._v_pathname)
201        table = self._v_file._get_node(tablepath)
202        column = table.cols._g_col(columnpath)
203        return column
204
205    @property
206    def table(self):
207        """Accessor for the `Table` object of this index."""
        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
209        table = self._v_file._get_node(tablepath)
210        return table
211
212    @property
213    def nblockssuperblock(self):
214        "The number of blocks in a superblock."
215        return self.superblocksize // self.blocksize
216
217    @property
218    def nslicesblock(self):
219        "The number of slices in a block."
220        return self.blocksize // self.slicesize
221
222    @property
223    def nchunkslice(self):
224        "The number of chunks in a slice."
225        return self.slicesize // self.chunksize
226
227    @property
228    def nsuperblocks(self):
229        "The total number of superblocks in index."
230        # Last row should not be considered as a superblock
231        nelements = self.nelements - self.nelementsILR
232        nblocks = nelements // self.superblocksize
        if nelements % self.superblocksize > 0:
234            nblocks += 1
235        return nblocks
236
237    @property
238    def nblocks(self):
239        "The total number of blocks in index."
240        # Last row should not be considered as a block
241        nelements = self.nelements - self.nelementsILR
242        nblocks = nelements // self.blocksize
243        if nelements % self.blocksize > 0:
244            nblocks += 1
245        return nblocks
246
247    @property
248    def nslices(self):
249        "The number of complete slices in index."
250        return self.nelements // self.slicesize
251
252    @property
253    def nchunks(self):
254        "The number of complete chunks in index."
255        return self.nelements // self.chunksize
256
257    @property
258    def shape(self):
259        "The shape of this index (in slices and elements)."
260        return (self.nrows, self.slicesize)
261
262    @property
263    def temp_required(self):
264        "Whether a temporary file for indexes is required or not."
        return (self.indsize > 1 and self.optlevel > 0 and
                self.table.nrows > self.slicesize)
266
267    @property
268    def want_complete_sort(self):
269        "Whether we should try to build a completely sorted index or not."
270        return self.indsize == 8 and self.optlevel == 9
271
272    @property
273    def is_csi(self):
274        """Whether the index is completely sorted or not.
275
276        .. versionchanged:: 3.0
277           The *is_CSI* property has been renamed into *is_csi*.
278
279        """
280
281        if self.nelements == 0:
282            # An index with 0 indexed elements is not a CSI one (by definition)
283            return False
284        if self.indsize < 8:
285            # An index that is not full cannot be completely sorted
286            return False
287        # Try with the 'is_csi' attribute
288        if 'is_csi' in self._v_attrs:
289            return self._v_attrs.is_csi
290        # If not, then compute the overlaps manually
291        # (the attribute 'is_csi' will be set there)
292        self.compute_overlaps(self, None, False)
293        return self.noverlaps == 0
294
295    @lazyattr
296    def nrowsinchunk(self):
297        """The number of rows that fits in a *table* chunk."""
298
299        return self.table.chunkshape[0]
300
301    @lazyattr
302    def lbucket(self):
303        """Return the length of a bucket based index type."""
304
        # Avoid setting too large an lbucket size (mainly useful for tests)
306        lbucket = min(self.nrowsinchunk, self.chunksize)
307        if self.indsize == 1:
308            # For ultra-light, we will never have to keep track of a
309            # bucket outside of a slice.
310            maxnb = 2**8
311            if self.slicesize > maxnb * lbucket:
312                lbucket = int(math.ceil(float(self.slicesize) / maxnb))
313        elif self.indsize == 2:
314            # For light, we will never have to keep track of a
315            # bucket outside of a block.
316            maxnb = 2**16
317            if self.blocksize > maxnb * lbucket:
318                lbucket = int(math.ceil(float(self.blocksize) / maxnb))
319        else:
320            # For medium and full indexes there should not be a need to
321            # increase lbucket
322            pass
323        return lbucket
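
    # A rough illustration of the bucket-length rule above, with made-up
    # numbers: for nrowsinchunk = 1000 and chunksize = 4096 the starting
    # lbucket is min(1000, 4096) = 1000; for an ultralight index
    # (indsize == 1) with slicesize = 2**20, since 2**20 > 256 * 1000,
    # lbucket grows to ceil(2**20 / 256) = 4096 so that every bucket of
    # a slice stays addressable with 8 bits.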
324
325    def __init__(self, parentnode, name,
326                 atom=None, title="",
327                 kind=None,
328                 optlevel=None,
329                 filters=None,
330                 tmp_dir=None,
331                 expectedrows=0,
332                 byteorder=None,
333                 blocksizes=None,
334                 new=True):
335
336        self._v_version = None
337        """The object version of this index."""
338        self.optlevel = optlevel
339        """The optimization level for this index."""
340        self.tmp_dir = tmp_dir
341        """The directory for the temporary files."""
342        self.expectedrows = expectedrows
343        """The expected number of items of index arrays."""
344        if byteorder in ["little", "big"]:
345            self.byteorder = byteorder
346        else:
347            self.byteorder = sys.byteorder
348        """The byteorder of the index datasets."""
349        if atom is not None:
350            self.dtype = atom.dtype.base
351            self.type = atom.type
352            """The datatypes to be stored by the sorted index array."""
353            ############### Important note ###########################
354            # The datatypes saved as index values are NumPy native
355            # types, so we get rid of type metainfo like Time* or Enum*
356            # that belongs to HDF5 types (actually, this metainfo is
357            # not needed for sorting and looking-up purposes).
358            ##########################################################
359            indsize = {
360                'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind]
361            assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!"
362            self.indsize = indsize
363            """The itemsize for the indices part of the index."""
364
365        self.nrows = None
366        """The total number of slices in the index."""
367        self.nelements = None
368        """The number of currently indexed rows for this column."""
369        self.blocksizes = blocksizes
370        """The four main sizes of the compound blocks (if specified)."""
371        self.dirtycache = True
372        """Dirty cache (for ranges, bounds & sorted) flag."""
373        self.superblocksize = None
374        """Size of the superblock for this index."""
375        self.blocksize = None
376        """Size of the block for this index."""
377        self.slicesize = None
378        """Size of the slice for this index."""
379        self.chunksize = None
380        """Size of the chunk for this index."""
381        self.tmpfilename = None
382        """Filename for temporary bounds."""
383        self.opt_search_types = opt_search_types
384        """The types for which and optimized search has been implemented."""
385        self.noverlaps = -1
386        """The number of overlaps in an index.  0 means a completely
387        sorted index. -1 means that this number is not computed yet."""
388        self.tprof = 0
389        """Time counter for benchmarking purposes."""
390
391        from .file import open_file
392        self._openFile = open_file
393        """The `open_file()` function, to avoid a circular import."""
394
395        super(Index, self).__init__(parentnode, name, title, new, filters)
396
397    def _g_post_init_hook(self):
398        if self._v_new:
399            # The version for newly created indexes
400            self._v_version = obversion
401        super(Index, self)._g_post_init_hook()
402
403        # Index arrays must only be created for new indexes
404        if not self._v_new:
405            idxversion = self._v_version
406            # Set-up some variables from info on disk and return
407            attrs = self._v_attrs
408            # Coerce NumPy scalars to Python scalars in order
409            # to avoid undesired upcasting operations.
410            self.superblocksize = int(attrs.superblocksize)
411            self.blocksize = int(attrs.blocksize)
412            self.slicesize = int(attrs.slicesize)
413            self.chunksize = int(attrs.chunksize)
414            self.blocksizes = (self.superblocksize, self.blocksize,
415                               self.slicesize, self.chunksize)
416            self.optlevel = int(attrs.optlevel)
417            sorted = self.sorted
418            indices = self.indices
419            self.dtype = sorted.atom.dtype
420            self.type = sorted.atom.type
421            self.indsize = indices.atom.itemsize
422            # Some sanity checks for slicesize, chunksize and indsize
            assert self.slicesize == indices.shape[1], "Wrong slicesize"
            assert self.chunksize == indices._v_chunkshape[1], \
                "Wrong chunksize"
426            assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize"
427            if idxversion > "2.0":
428                self.reduction = int(attrs.reduction)
429                nelementsSLR = int(self.sortedLR.attrs.nelements)
430                nelementsILR = int(self.indicesLR.attrs.nelements)
431            else:
432                self.reduction = 1
433                nelementsILR = self.indicesLR[-1]
434                nelementsSLR = nelementsILR
435            self.nrows = sorted.nrows
436            self.nelements = self.nrows * self.slicesize + nelementsILR
437            self.nelementsSLR = nelementsSLR
438            self.nelementsILR = nelementsILR
439            if nelementsILR > 0:
440                self.nrows += 1
441            # Get the bounds as a cache (this has to remain here!)
442            rchunksize = self.chunksize // self.reduction
443            nboundsLR = (nelementsSLR - 1) // rchunksize
444            if nboundsLR < 0:
445                nboundsLR = 0  # correction for -1 bounds
446            nboundsLR += 2  # bounds + begin + end
447            # All bounds values (+begin + end) are at the end of sortedLR
448            self.bebounds = self.sortedLR[
449                nelementsSLR:nelementsSLR + nboundsLR]
450            return
451
452        # The index is new. Initialize the values
453        self.nrows = 0
454        self.nelements = 0
455        self.nelementsSLR = 0
456        self.nelementsILR = 0
457
458        # The atom
459        atom = Atom.from_dtype(self.dtype)
460
461        # The filters
462        filters = self.filters
463
464        # Compute the superblocksize, blocksize, slicesize and chunksize values
465        # (in case these parameters haven't been passed to the constructor)
466        if self.blocksizes is None:
467            self.blocksizes = calc_chunksize(
468                self.expectedrows, self.optlevel, self.indsize)
469        (self.superblocksize, self.blocksize,
470         self.slicesize, self.chunksize) = self.blocksizes
471        if debug:
472            print("blocksizes:", self.blocksizes)
473        # Compute the reduction level
474        self.reduction = get_reduction_level(
475            self.indsize, self.optlevel, self.slicesize, self.chunksize)
476        rchunksize = self.chunksize // self.reduction
477        rslicesize = self.slicesize // self.reduction
478
479        # Save them on disk as attributes
480        self._v_attrs.superblocksize = numpy.uint64(self.superblocksize)
481        self._v_attrs.blocksize = numpy.uint64(self.blocksize)
482        self._v_attrs.slicesize = numpy.uint32(self.slicesize)
483        self._v_attrs.chunksize = numpy.uint32(self.chunksize)
484        # Save the optlevel as well
485        self._v_attrs.optlevel = self.optlevel
486        # Save the reduction level
487        self._v_attrs.reduction = self.reduction
488
489        # Create the IndexArray for sorted values
490        sorted = IndexArray(self, 'sorted', atom, "Sorted Values",
491                            filters, self.byteorder)
492
493        # Create the IndexArray for index values
494        IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize),
495                   "Number of chunk in table", filters, self.byteorder)
496
497        # Create the cache for range values  (1st order cache)
498        CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters,
499                   self.expectedrows // self.slicesize,
500                   byteorder=self.byteorder)
501        # median ranges
502        EArray(self, 'mranges', atom, (0,), "Median ranges", filters,
503               byteorder=self.byteorder, _log=False)
504
505        # Create the cache for boundary values (2nd order cache)
506        nbounds_inslice = (rslicesize - 1) // rchunksize
507        CacheArray(self, 'bounds', atom, (0, nbounds_inslice),
508                   "Boundary Values", filters, self.nchunks,
509                   (1, nbounds_inslice), byteorder=self.byteorder)
510
511        # begin, end & median bounds (only for numerical types)
512        EArray(self, 'abounds', atom, (0,), "Start bounds", filters,
513               byteorder=self.byteorder, _log=False)
514        EArray(self, 'zbounds', atom, (0,), "End bounds", filters,
515               byteorder=self.byteorder, _log=False)
516        EArray(self, 'mbounds', atom, (0,), "Median bounds", filters,
517               byteorder=self.byteorder, _log=False)
518
519        # Create the Array for last (sorted) row values + bounds
520        shape = (rslicesize + 2 + nbounds_inslice,)
521        sortedLR = LastRowArray(self, 'sortedLR', atom, shape,
522                                "Last Row sorted values + bounds",
523                                filters, (rchunksize,),
524                                byteorder=self.byteorder)
525
        # Create the Array for the number of chunks in last row
527        shape = (self.slicesize,)     # enough for indexes and length
528        indicesLR = LastRowArray(self, 'indicesLR',
529                                 UIntAtom(itemsize=self.indsize),
530                                 shape, "Last Row indices",
531                                 filters, (self.chunksize,),
532                                 byteorder=self.byteorder)
533
534        # The number of elements in LR will be initialized here
535        sortedLR.attrs.nelements = 0
536        indicesLR.attrs.nelements = 0
537
        # All bounds values (+begin + end) are uninitialized at creation time
539        self.bebounds = None
540
541        # The starts and lengths initialization
542        self.starts = numpy.empty(shape=self.nrows, dtype=numpy.int32)
543        """Where the values fulfiling conditions starts for every slice."""
544        self.lengths = numpy.empty(shape=self.nrows, dtype=numpy.int32)
545        """Lengths of the values fulfilling conditions for every slice."""
546
547        # Finally, create a temporary file for indexes if needed
548        if self.temp_required:
549            self.create_temp()
550
552    def initial_append(self, xarr, nrow, reduction):
553        """Compute an initial indices arrays for data to be indexed."""
554
555        if profile:
556            tref = time()
557        if profile:
558            show_stats("Entering initial_append", tref)
559        arr = xarr.pop()
560        indsize = self.indsize
561        slicesize = self.slicesize
562        nelementsILR = self.nelementsILR
563        if profile:
564            show_stats("Before creating idx", tref)
565        if indsize == 8:
566            idx = numpy.arange(0, len(arr), dtype="uint64") + nrow * slicesize
567        elif indsize == 4:
568            # For medium (32-bit) all the rows in tables should be
569            # directly reachable.  But as len(arr) < 2**31, we can
570            # choose uint32 for representing indices.  In this way, we
571            # consume far less memory during the keysort process.  The
572            # offset will be added in self.final_idx32() later on.
573            #
574            # This optimization also prevents the values in LR to
575            # participate in the ``swap_chunks`` process, and this is
576            # the main reason to not allow the medium indexes to create
577            # completely sorted indexes.  However, I don't find this to
            # be a big limitation, as full indexes are probably much
            # more suitable for producing completely sorted indexes
            # because in this case the indices part is usable for
            # getting the reverse indices of the index, and I foresee
582            # this to be a common requirement in many operations (for
583            # example, in table sorts).
584            #
585            # F. Alted 2008-09-15
586            idx = numpy.arange(0, len(arr), dtype="uint32")
587        else:
588            idx = numpy.empty(len(arr), "uint%d" % (indsize * 8))
589            lbucket = self.lbucket
590            # Fill the idx with the bucket indices
591            offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket))
592            idx[0:offset] = 0
593            for i in range(offset, slicesize, lbucket):
594                idx[i:i + lbucket] = (i + lbucket - 1) // lbucket
595            if indsize == 2:
596                # Add a second offset in this case
597                # First normalize the number of rows
598                offset2 = (nrow % self.nslicesblock) * slicesize // lbucket
599                idx += offset2
600        # Add the last row at the beginning of arr & idx (if needed)
601        if (indsize == 8 and nelementsILR > 0):
602            # It is possible that the values in LR are already sorted.
603            # Fetch them and override existing values in arr and idx.
604            assert len(arr) > nelementsILR
605            self.read_slice_lr(self.sortedLR, arr[:nelementsILR])
606            self.read_slice_lr(self.indicesLR, idx[:nelementsILR])
607        # In-place sorting
608        if profile:
609            show_stats("Before keysort", tref)
610        indexesextension.keysort(arr, idx)
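        # Note: keysort() sorts ``arr`` in place and applies the same
        # permutation to ``idx``.  A (slower) pure-NumPy sketch of the
        # equivalent operation would be:
        #
        #   perm = arr.argsort(kind='mergesort')
        #   arr[:] = arr[perm]; idx[:] = idx[perm]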
611        larr = arr[-1]
612        if reduction > 1:
613            # It's important to do a copy() here in order to ensure that
614            # sorted._append() will receive a contiguous array.
615            if profile:
616                show_stats("Before reduction", tref)
617            reduc = arr[::reduction].copy()
618            if profile:
619                show_stats("After reduction", tref)
620            arr = reduc
621            if profile:
622                show_stats("After arr <-- reduc", tref)
        # A completely sorted index is no longer possible after
        # appending to an index that already has at least one slice.
625        if nrow > 0:
626            self._v_attrs.is_csi = False
627        if profile:
628            show_stats("Exiting initial_append", tref)
629        return larr, arr, idx
630
631    def final_idx32(self, idx, offset):
632        """Perform final operations in 32-bit indices."""
633
634        if profile:
635            tref = time()
636        if profile:
637            show_stats("Entering final_idx32", tref)
638        # Do an upcast first in order to add the offset.
639        idx = idx.astype('uint64')
640        idx += offset
641        # The next partition is valid up to table sizes of
642        # 2**30 * 2**18 = 2**48 bytes, that is, 256 Tera-elements,
643        # which should be a safe figure, at least for a while.
644        idx //= self.lbucket
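        # Worked example with made-up sizes: offset = 2**20 and
        # lbucket = 4096 turn a local index value of 5000 into
        # (5000 + 2**20) // 4096 == 257, its global bucket number.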
645        # After the division, we can downsize the indexes to 'uint32'
646        idx = idx.astype('uint32')
647        if profile:
648            show_stats("Exiting final_idx32", tref)
649        return idx
650
651    def append(self, xarr, update=False):
652        """Append the array to the index objects."""
653
654        if profile:
655            tref = time()
656        if profile:
657            show_stats("Entering append", tref)
658        if not update and self.temp_required:
659            where = self.tmp
660            # The reduction will take place *after* the optimization process
661            reduction = 1
662        else:
663            where = self
664            reduction = self.reduction
665        sorted = where.sorted
666        indices = where.indices
667        ranges = where.ranges
668        mranges = where.mranges
669        bounds = where.bounds
670        mbounds = where.mbounds
671        abounds = where.abounds
672        zbounds = where.zbounds
673        sortedLR = where.sortedLR
674        indicesLR = where.indicesLR
675        nrows = sorted.nrows  # before sorted.append()
676        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
677        # Save the sorted array
678        sorted.append(arr.reshape(1, arr.size))
679        cs = self.chunksize // reduction
680        ncs = self.nchunkslice
681        # Save ranges & bounds
682        ranges.append([[arr[0], larr]])
683        bounds.append([arr[cs::cs]])
684        abounds.append(arr[0::cs])
685        zbounds.append(arr[cs - 1::cs])
686        # Compute the medians
687        smedian = arr[cs // 2::cs]
688        mbounds.append(smedian)
689        mranges.append([smedian[ncs // 2]])
690        if profile:
691            show_stats("Before deleting arr & smedian", tref)
692        del arr, smedian   # delete references
693        if profile:
694            show_stats("After deleting arr & smedian", tref)
695        # Now that arr is gone, we can upcast the indices and add the offset
696        if self.indsize == 4:
697            idx = self.final_idx32(idx, nrows * self.slicesize)
698        indices.append(idx.reshape(1, idx.size))
699        if profile:
700            show_stats("Before deleting idx", tref)
701        del idx
702        # Update counters after a successful append
703        self.nrows = nrows + 1
704        self.nelements = self.nrows * self.slicesize
705        self.nelementsSLR = 0  # reset the counter of the last row index to 0
706        self.nelementsILR = 0  # reset the counter of the last row index to 0
707        # The number of elements will be saved as an attribute.
        # This is necessary so that the LR arrays can remember their
        # values after a possible node preemption/reload.
710        sortedLR.attrs.nelements = self.nelementsSLR
711        indicesLR.attrs.nelements = self.nelementsILR
712        self.dirtycache = True   # the cache is dirty now
713        if profile:
714            show_stats("Exiting append", tref)
715
716    def append_last_row(self, xarr, update=False):
717        """Append the array to the last row index objects."""
718
719        if profile:
720            tref = time()
721        if profile:
722            show_stats("Entering appendLR", tref)
723        # compute the elements in the last row sorted & bounds array
724        nrows = self.nslices
725        if not update and self.temp_required:
726            where = self.tmp
727            # The reduction will take place *after* the optimization process
728            reduction = 1
729        else:
730            where = self
731            reduction = self.reduction
732        indicesLR = where.indicesLR
733        sortedLR = where.sortedLR
734        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
735        nelementsSLR = len(arr)
736        nelementsILR = len(idx)
737        # Build the cache of bounds
738        rchunksize = self.chunksize // reduction
739        self.bebounds = numpy.concatenate((arr[::rchunksize], [larr]))
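        # bebounds layout sketch: the first value of every (reduced)
        # chunk followed by the very last sorted value; e.g. for
        # rchunksize = 4 and arr = [0..9] it is [0, 4, 8] + [9].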
740        # The number of elements will be saved as an attribute
741        sortedLR.attrs.nelements = nelementsSLR
742        indicesLR.attrs.nelements = nelementsILR
743        # Save the number of elements, bounds and sorted values
744        # at the end of the sorted array
745        offset2 = len(self.bebounds)
746        sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
747        sortedLR[:nelementsSLR] = arr
748        del arr
749        # Now that arr is gone, we can upcast the indices and add the offset
750        if self.indsize == 4:
751            idx = self.final_idx32(idx, nrows * self.slicesize)
752        # Save the reverse index array
753        indicesLR[:len(idx)] = idx
754        del idx
755        # Update counters after a successful append
756        self.nrows = nrows + 1
757        self.nelements = nrows * self.slicesize + nelementsILR
758        self.nelementsILR = nelementsILR
759        self.nelementsSLR = nelementsSLR
760        self.dirtycache = True   # the cache is dirty now
761        if profile:
762            show_stats("Exiting appendLR", tref)
763
765    def optimize(self, verbose=False):
766        """Optimize an index so as to allow faster searches.
767
768        verbose
769            If True, messages about the progress of the
770            optimization process are printed out.
771
772        """
773
774        if not self.temp_required:
775            return
776
777        if verbose:
778            self.verbose = True
779        else:
780            self.verbose = debug
781
782        # Initialize last_tover and last_nover
783        self.last_tover = 0
784        self.last_nover = 0
785
786        # Compute the correct optimizations for current optim level
787        opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize)
788        optmedian, optstarts, optstops, optfull = opts
789
790        if debug:
791            print("optvalues:", opts)
792
793        self.create_temp2()
794        # Start the optimization process
795        while True:
796            if optfull:
797                for niter in range(optfull):
798                    if self.swap('chunks', 'median'):
799                        break
800                    if self.nblocks > 1:
801                        # Swap slices only in the case that we have
802                        # several blocks
803                        if self.swap('slices', 'median'):
804                            break
805                        if self.swap('chunks', 'median'):
806                            break
807                    if self.swap('chunks', 'start'):
808                        break
809                    if self.swap('chunks', 'stop'):
810                        break
811            else:
812                if optmedian:
813                    if self.swap('chunks', 'median'):
814                        break
815                if optstarts:
816                    if self.swap('chunks', 'start'):
817                        break
818                if optstops:
819                    if self.swap('chunks', 'stop'):
820                        break
821            break  # If we reach this, exit the loop
822
823        # Check if we require a complete sort.  Important: this step
824        # should be carried out *after* the optimization process has
825        # been completed (this is to guarantee that the complete sort
826        # does not take too much memory).
827        if self.want_complete_sort:
828            if self.noverlaps > 0:
829                self.do_complete_sort()
830            # Check that we have effectively achieved the complete sort
831            if self.noverlaps > 0:
832                warnings.warn(
833                    "OPSI was not able to achieve a completely sorted index."
834                    "  Please report this to the authors.", UserWarning)
835
        # Close and delete the temporary optimization index file
837        self.cleanup_temp()
838        return
839
840    def do_complete_sort(self):
841        """Bring an already optimized index into a complete sorted state."""
842
843        if self.verbose:
844            t1 = time()
845            c1 = perf_counter()
846        ss = self.slicesize
847        tmp = self.tmp
848        ranges = tmp.ranges[:]
849        nslices = self.nslices
850
851        nelementsLR = self.nelementsILR
852        if nelementsLR > 0:
853            # Add the ranges corresponding to the last row
854            rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]])
855            ranges = numpy.concatenate((ranges, [rangeslr]))
856            nslices += 1
857
858        sorted = tmp.sorted
859        indices = tmp.indices
860        sortedLR = tmp.sortedLR
861        indicesLR = tmp.indicesLR
862        sremain = numpy.array([], dtype=self.dtype)
863        iremain = numpy.array([], dtype='u%d' % self.indsize)
864        starts = numpy.zeros(shape=nslices, dtype=numpy.int_)
865        for i in range(nslices):
866            # Find the overlapping elements for slice i
867            sover = numpy.array([], dtype=self.dtype)
868            iover = numpy.array([], dtype='u%d' % self.indsize)
869            prev_end = ranges[i, 1]
870            for j in range(i + 1, nslices):
871                stj = starts[j]
872                if ((j < self.nslices and stj == ss) or
873                        (j == self.nslices and stj == nelementsLR)):
874                    # This slice has been already dealt with
875                    continue
876                if j < self.nslices:
877                    assert stj < ss, \
878                        "Two slices cannot overlap completely at this stage!"
879                    next_beg = sorted[j, stj]
880                else:
881                    assert stj < nelementsLR, \
882                        "Two slices cannot overlap completely at this stage!"
883                    next_beg = sortedLR[stj]
884                next_end = ranges[j, 1]
885                if prev_end > next_end:
886                    # Complete overlapping case
887                    if j < self.nslices:
888                        sover = numpy.concatenate((sover, sorted[j, stj:]))
889                        iover = numpy.concatenate((iover, indices[j, stj:]))
890                        starts[j] = ss
891                    else:
892                        n = nelementsLR
893                        sover = numpy.concatenate((sover, sortedLR[stj:n]))
894                        iover = numpy.concatenate((iover, indicesLR[stj:n]))
895                        starts[j] = nelementsLR
896                elif prev_end > next_beg:
897                    idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj)
898                    if j < self.nslices:
899                        sover = numpy.concatenate((sover, sorted[j, stj:idx]))
900                        iover = numpy.concatenate((iover, indices[j, stj:idx]))
901                    else:
902                        sover = numpy.concatenate((sover, sortedLR[stj:idx]))
903                        iover = numpy.concatenate((iover, indicesLR[stj:idx]))
904                    starts[j] = idx
905            # Build the extended slices to sort out
906            if i < self.nslices:
907                ssorted = numpy.concatenate(
908                    (sremain, sorted[i, starts[i]:], sover))
909                sindices = numpy.concatenate(
910                    (iremain, indices[i, starts[i]:], iover))
911            else:
912                ssorted = numpy.concatenate(
913                    (sremain, sortedLR[starts[i]:nelementsLR], sover))
914                sindices = numpy.concatenate(
915                    (iremain, indicesLR[starts[i]:nelementsLR], iover))
916            # Sort the extended slices
917            indexesextension.keysort(ssorted, sindices)
918            # Save the first elements of extended slices in the slice i
919            if i < self.nslices:
920                sorted[i] = ssorted[:ss]
921                indices[i] = sindices[:ss]
922                # Update caches for this slice
923                self.update_caches(i, ssorted[:ss])
924                # Save the remaining values in a separate array
925                send = len(sover) + len(sremain)
926                sremain = ssorted[ss:ss + send]
927                iremain = sindices[ss:ss + send]
928            else:
929                # Still some elements remain for the last row
930                n = len(ssorted)
931                assert n == nelementsLR
932                send = 0
933                sortedLR[:n] = ssorted
934                indicesLR[:n] = sindices
935                # Update the caches for last row
936                sortedlr = sortedLR[:nelementsLR]
937                bebounds = numpy.concatenate(
938                    (sortedlr[::self.chunksize], [sortedlr[-1]]))
939                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
940                self.bebounds = bebounds
941
942        # Verify that we have dealt with all the remaining values
943        assert send == 0
944
945        # Compute the overlaps in order to verify that we have achieved
946        # a complete sort.  This has to be executed always (and not only
947        # in verbose mode!).
948        self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose)
949        if self.verbose:
950            t = round(time() - t1, 4)
951            c = round(perf_counter() - c1, 4)
952            print("time: %s. clock: %s" % (t, c))
953
954    def swap(self, what, mode=None):
955        """Swap chunks or slices using a certain bounds reference."""
956
957        # Thresholds for avoiding continuing the optimization
958        # thnover = 4 * self.slicesize  # minimum number of overlapping
959        #                               # elements
960        thnover = 40
        thmult = 0.1      # minimum ratio of multiplicity (10%)
        thtover = 0.01    # minimum overlapping index for slices (1%)
963
964        if self.verbose:
965            t1 = time()
966            c1 = perf_counter()
967        if what == "chunks":
968            self.swap_chunks(mode)
969        elif what == "slices":
970            self.swap_slices(mode)
971        if mode:
972            message = "swap_%s(%s)" % (what, mode)
973        else:
974            message = "swap_%s" % (what,)
975        (nover, mult, tover) = self.compute_overlaps(
976            self.tmp, message, self.verbose)
977        rmult = len(mult.nonzero()[0]) / float(len(mult))
978        if self.verbose:
979            t = round(time() - t1, 4)
980            c = round(perf_counter() - c1, 4)
981            print("time: %s. clock: %s" % (t, c))
982        # Check that entropy is actually decreasing
983        if what == "chunks" and self.last_tover > 0. and self.last_nover > 0:
984            tover_var = (self.last_tover - tover) / self.last_tover
985            nover_var = (self.last_nover - nover) / self.last_nover
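            # e.g. dropping from last_nover = 100 to nover = 97 is only
            # a 3% improvement; if tover improved similarly little,
            # optimization stops here.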
986            if tover_var < 0.05 and nover_var < 0.05:
987                # Less than a 5% of improvement is too few
988                return True
989        self.last_tover = tover
990        self.last_nover = nover
        # Check if some threshold has been met
992        if nover < thnover:
993            return True
994        if rmult < thmult:
995            return True
996        # Additional check for the overlap ratio
997        if tover >= 0. and tover < thtover:
998            return True
999        return False
1000
1001    def create_temp(self):
1002        """Create some temporary objects for slice sorting purposes."""
1003
1004        # The index will be dirty during the index optimization process
1005        self.dirty = True
1006        # Build the name of the temporary file
1007        fd, self.tmpfilename = tempfile.mkstemp(
1008            ".tmp", "pytables-", self.tmp_dir)
1009        # Close the file descriptor so as to avoid leaks
1010        os.close(fd)
1011        # Create the proper PyTables file
1012        self.tmpfile = self._openFile(self.tmpfilename, "w")
1013        self.tmp = tmp = self.tmpfile.root
1014        cs = self.chunksize
1015        ss = self.slicesize
1016        filters = self.filters
1017        # temporary sorted & indices arrays
1018        shape = (0, ss)
1019        atom = Atom.from_dtype(self.dtype)
1020        EArray(tmp, 'sorted', atom, shape,
1021               "Temporary sorted", filters, chunkshape=(1, cs))
1022        EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape,
1023               "Temporary indices", filters, chunkshape=(1, cs))
1024        # temporary bounds
1025        nbounds_inslice = (ss - 1) // cs
1026        shape = (0, nbounds_inslice)
1027        EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds",
1028               filters, chunkshape=(cs, nbounds_inslice))
1029        shape = (0,)
1030        EArray(tmp, 'abounds', atom, shape, "Temp start bounds",
1031               filters, chunkshape=(cs,))
1032        EArray(tmp, 'zbounds', atom, shape, "Temp end bounds",
1033               filters, chunkshape=(cs,))
1034        EArray(tmp, 'mbounds', atom, shape, "Median bounds",
1035               filters, chunkshape=(cs,))
1036        # temporary ranges
1037        EArray(tmp, 'ranges', atom, (0, 2),
1038               "Temporary range values", filters, chunkshape=(cs, 2))
1039        EArray(tmp, 'mranges', atom, (0,),
1040               "Median ranges", filters, chunkshape=(cs,))
1041        # temporary last row (sorted)
1042        shape = (ss + 2 + nbounds_inslice,)
1043        CArray(tmp, 'sortedLR', atom, shape,
1044               "Temp Last Row sorted values + bounds",
1045               filters, chunkshape=(cs,))
1046        # temporary last row (indices)
1047        shape = (ss,)
1048        CArray(tmp, 'indicesLR',
1049               UIntAtom(itemsize=self.indsize),
1050               shape, "Temp Last Row indices",
1051               filters, chunkshape=(cs,))
1052
1053    def create_temp2(self):
1054        """Create some temporary objects for slice sorting purposes."""
1055
        # The algorithms for doing the swap could be optimized so that
        # it would only be necessary to create temporaries big enough
        # to keep the contents of a single superblock.
1059        # F. Alted 2007-01-03
1060        cs = self.chunksize
1061        ss = self.slicesize
1062        filters = self.filters
1063        # temporary sorted & indices arrays
1064        shape = (self.nslices, ss)
1065        atom = Atom.from_dtype(self.dtype)
1066        tmp = self.tmp
1067        CArray(tmp, 'sorted2', atom, shape,
1068               "Temporary sorted 2", filters, chunkshape=(1, cs))
1069        CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape,
1070               "Temporary indices 2", filters, chunkshape=(1, cs))
1071        # temporary bounds
1072        nbounds_inslice = (ss - 1) // cs
1073        shape = (self.nslices, nbounds_inslice)
1074        CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2",
1075               filters, chunkshape=(cs, nbounds_inslice))
1076        shape = (self.nchunks,)
1077        CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2",
1078               filters, chunkshape=(cs,))
1079        CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2",
1080               filters, chunkshape=(cs,))
1081        CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2",
1082               filters, chunkshape=(cs,))
1083        # temporary ranges
1084        CArray(tmp, 'ranges2', atom, (self.nslices, 2),
1085               "Temporary range values 2", filters, chunkshape=(cs, 2))
1086        CArray(tmp, 'mranges2', atom, (self.nslices,),
1087               "Median ranges 2", filters, chunkshape=(cs,))
1088
1089    def cleanup_temp(self):
1090        """Copy the data and delete the temporaries for sorting purposes."""
1091
1092        if self.verbose:
1093            print("Copying temporary data...")
1094        # tmp -> index
1095        reduction = self.reduction
1096        cs = self.chunksize // reduction
1097        ncs = self.nchunkslice
1098        tmp = self.tmp
1099        for i in range(self.nslices):
1100            # Copy sorted & indices slices
1101            sorted = tmp.sorted[i][::reduction].copy()
1102            self.sorted.append(sorted.reshape(1, sorted.size))
1103            # Compute ranges
1104            self.ranges.append([[sorted[0], sorted[-1]]])
1105            # Compute chunk bounds
1106            self.bounds.append([sorted[cs::cs]])
1107            # Compute start, stop & median bounds and ranges
1108            self.abounds.append(sorted[0::cs])
1109            self.zbounds.append(sorted[cs - 1::cs])
1110            smedian = sorted[cs // 2::cs]
1111            self.mbounds.append(smedian)
1112            self.mranges.append([smedian[ncs // 2]])
1113            del sorted, smedian   # delete references
1114            # Now that sorted is gone, we can copy the indices
1115            indices = tmp.indices[i]
1116            self.indices.append(indices.reshape(1, indices.size))
1117
        # Now it is the last row's turn (if needed)
1119        if self.nelementsSLR > 0:
1120            # First, the sorted values
1121            sortedLR = self.sortedLR
1122            indicesLR = self.indicesLR
1123            nelementsLR = self.nelementsILR
1124            sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy()
1125            nelementsSLR = len(sortedlr)
1126            sortedLR[:nelementsSLR] = sortedlr
1127            # Now, the bounds
1128            self.bebounds = numpy.concatenate((sortedlr[::cs], [sortedlr[-1]]))
1129            offset2 = len(self.bebounds)
1130            sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
1131            # Finally, the indices
1132            indicesLR[:] = tmp.indicesLR[:]
1133            # Update the number of (reduced) sorted elements
1134            self.nelementsSLR = nelementsSLR
1135        # The number of elements will be saved as an attribute
1136        self.sortedLR.attrs.nelements = self.nelementsSLR
1137        self.indicesLR.attrs.nelements = self.nelementsILR
1138
1139        if self.verbose:
1140            print("Deleting temporaries...")
1141        self.tmp = None
1142        self.tmpfile.close()
1143        os.remove(self.tmpfilename)
1144        self.tmpfilename = None
1145
1146        # The optimization process has finished, and the index is ok now
1147        self.dirty = False
1148        # ...but the memory data cache is dirty now
1149        self.dirtycache = True
1150
1151    def get_neworder(self, neworder, src_disk, tmp_disk,
1152                     lastrow, nslices, offset, dtype):
1153        """Get sorted & indices values in new order."""
1154
1155        cs = self.chunksize
1156        ncs = ncs2 = self.nchunkslice
1157        self_nslices = self.nslices
1158        tmp = numpy.empty(shape=self.slicesize, dtype=dtype)
1159        for i in range(nslices):
1160            ns = offset + i
1161            if ns == self_nslices:
1162                # The number of complete chunks in the last row
1163                ncs2 = self.nelementsILR // cs
1164            # Get slices in new order
1165            for j in range(ncs2):
1166                idx = neworder[i * ncs + j]
1167                ins = idx // ncs
1168                inc = (idx - ins * ncs) * cs
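                # (e.g. with ncs = 8 and cs = 1024: global chunk
                # idx = 19 decomposes into relative slice
                # ins = 19 // 8 = 2, before adding the offset, and
                # element offset inc = (19 - 16) * 1024 = 3072)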
1169                ins += offset
1170                nc = j * cs
1171                if ins == self_nslices:
1172                    tmp[nc:nc + cs] = lastrow[inc:inc + cs]
1173                else:
1174                    tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs]
1175            if ns == self_nslices:
                # Copy the reordered complete chunks back to the last row
1177                lastrow[:ncs2 * cs] = tmp[:ncs2 * cs]
1178                # The elements in the last chunk of the last row will
1179                # participate in the global reordering later on, during
1180                # the phase of sorting of *two* slices at a time
1181                # (including the last row slice, see
1182                # self.reorder_slices()).  The caches for last row will
1183                # be updated in self.reorder_slices() too.
1184                # F. Altet 2008-08-25
1185            else:
1186                tmp_disk[ns] = tmp
1187
1188    def swap_chunks(self, mode="median"):
1189        """Swap & reorder the different chunks in a block."""
1190
1191        boundsnames = {
1192            'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'}
1193        tmp = self.tmp
1194        sorted = tmp.sorted
1195        indices = tmp.indices
1196        tmp_sorted = tmp.sorted2
1197        tmp_indices = tmp.indices2
1198        sortedLR = tmp.sortedLR
1199        indicesLR = tmp.indicesLR
1200        cs = self.chunksize
1201        ncs = self.nchunkslice
1202        nsb = self.nslicesblock
1203        ncb = ncs * nsb
1204        ncb2 = ncb
1205        boundsobj = tmp._f_get_child(boundsnames[mode])
1206        can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0)
1207        for nblock in range(self.nblocks):
1208            # Protection for last block having less chunks than ncb
1209            remainingchunks = self.nchunks - nblock * ncb
1210            if remainingchunks < ncb:
1211                ncb2 = remainingchunks
1212            if ncb2 <= 1:
                # if zero or one chunk remains, we are done
1214                break
1215            nslices = ncb2 // ncs
1216            bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2]
1217            # Do this only if lastrow elements can cross block boundaries
1218            if (nblock == self.nblocks - 1 and  # last block
1219                    can_cross_bbounds):
1220                nslices += 1
1221                ul = self.nelementsILR // cs
1222                bounds = numpy.concatenate((bounds, self.bebounds[:ul]))
1223            sbounds_idx = bounds.argsort(kind=defsort)
1224            offset = int(nblock * nsb)
1225            # Swap sorted and indices following the new order
1226            self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR,
1227                              nslices, offset, self.dtype)
1228            self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR,
1229                              nslices, offset, 'u%d' % self.indsize)
        # Completely reorder the index at the slice level
1231        self.reorder_slices(tmp=True)
1232
1233    def read_slice(self, where, nslice, buffer, start=0):
1234        """Read a slice from the `where` dataset and put it in `buffer`."""
1235
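        # Roughly equivalent to the fancy-indexing read
        # ``buffer[:] = where[nslice, start:start + buffer.size]``, but
        # it reuses ``buffer`` through the low-level hyperslab call.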
1236        # Create the buffers for specifying the coordinates
1237        self.startl = numpy.array([nslice, start], numpy.uint64)
1238        self.stopl = numpy.array([nslice + 1, start + buffer.size],
1239                                 numpy.uint64)
1240        self.stepl = numpy.ones(shape=2, dtype=numpy.uint64)
1241        where._g_read_slice(self.startl, self.stopl, self.stepl, buffer)
1242
1243    def write_slice(self, where, nslice, buffer, start=0):
1244        """Write a `slice` to the `where` dataset with the `buffer` data."""
1245
1246        self.startl = numpy.array([nslice, start], numpy.uint64)
1247        self.stopl = numpy.array([nslice + 1, start + buffer.size],
1248                                 numpy.uint64)
1249        self.stepl = numpy.ones(shape=2, dtype=numpy.uint64)
1250        countl = self.stopl - self.startl   # (1, self.slicesize)
1251        where._g_write_slice(self.startl, self.stepl, countl, buffer)
1252
1253    # Read version for LastRow
1254    def read_slice_lr(self, where, buffer, start=0):
1255        """Read a slice from the `where` dataset and put it in `buffer`."""
1256
1257        startl = numpy.array([start], dtype=numpy.uint64)
1258        stopl = numpy.array([start + buffer.size], dtype=numpy.uint64)
1259        stepl = numpy.array([1], dtype=numpy.uint64)
1260        where._g_read_slice(startl, stopl, stepl, buffer)
1261
1262    # Write version for LastRow
    def write_sliceLR(self, where, buffer, start=0):
        """Write a slice to the `where` dataset with the `buffer` data."""

        startl = numpy.array([start], dtype=numpy.uint64)
        # This must be an element *count* (as in write_slice() above),
        # not a stop coordinate.
        countl = numpy.array([buffer.size], dtype=numpy.uint64)
        stepl = numpy.array([1], dtype=numpy.uint64)
        where._g_write_slice(startl, stepl, countl, buffer)
1270
1272    def reorder_slice(self, nslice, sorted, indices, ssorted, sindices,
1273                      tmp_sorted, tmp_indices):
1274        """Copy & reorder the slice in source to final destination."""
1275
1276        ss = self.slicesize
1277        # Load the second part in buffers
1278        self.read_slice(tmp_sorted, nslice, ssorted[ss:])
1279        self.read_slice(tmp_indices, nslice, sindices[ss:])
1280        indexesextension.keysort(ssorted, sindices)
1281        # Write the first part of the buffers to the regular leaves
1282        self.write_slice(sorted, nslice - 1, ssorted[:ss])
1283        self.write_slice(indices, nslice - 1, sindices[:ss])
1284        # Update caches
1285        self.update_caches(nslice - 1, ssorted[:ss])
1286        # Shift the slice in the end to the beginning
1287        ssorted[:ss] = ssorted[ss:]
1288        sindices[:ss] = sindices[ss:]
1289
1290    def update_caches(self, nslice, ssorted):
1291        """Update the caches for faster lookups."""
1292
1293        cs = self.chunksize
1294        ncs = self.nchunkslice
1295        tmp = self.tmp
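        # Toy illustration (cs = 4 over 16 sorted values): ssorted[cs::cs]
        # gives the inner chunk boundaries (values at positions 4, 8, 12),
        # ssorted[0::cs] the chunk starts, ssorted[cs-1::cs] the chunk
        # ends, and ssorted[cs//2::cs] the chunk medians.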
1296        # update first & second cache bounds (ranges & bounds)
1297        tmp.ranges[nslice] = ssorted[[0, -1]]
1298        tmp.bounds[nslice] = ssorted[cs::cs]
1299        # update start & stop bounds
1300        tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs]
1301        tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs]
1302        # update median bounds
1303        smedian = ssorted[cs // 2::cs]
1304        tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian
1305        tmp.mranges[nslice] = smedian[ncs // 2]
1306
1307    def reorder_slices(self, tmp):
1308        """Reorder completely the index at slice level.
1309
1310        This method has to maintain the locality of elements in the
1311        ambit of ``blocks``, i.e. an element of a ``block`` cannot be
1312        sent to another ``block`` during this reordering.  This is
1313        *critical* for ``light`` indexes to be able to use this.
1314
1315        This version of reorder_slices is optimized in that *two*
1316        complete slices are taken at a time (including the last row
1317        slice) so as to sort them.  Then, each new slice that is read is
1318        put at the end of this two-slice buffer, while the previous one
1319        is moved to the beginning of the buffer.  This is in order to
1320        better reduce the entropy of the regular part (i.e. all except
1321        the last row) of the index.
1322
1323        A secondary effect of this is that it takes at least *twice* of
1324        memory than a previous version of reorder_slices() that only
1325        reorders on a slice-by-slice basis.  However, as this is more
1326        efficient than the old version, one can configure the slicesize
1327        to be smaller, so the memory consumption is barely similar.
1328
1329        """
1330
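        # Illustrative sketch of the two-slice sliding window (assuming
        # consecutive slices S0, S1, S2, each of size ss):
        #
        #   read S0 -> buffer = [ S0 | -- ]
        #   read S1 -> buffer = [ S0 | S1 ]; keysort; write lower half
        #              as final slice 0; shift upper half down
        #   read S2 -> buffer = [ S1'| S2 ]; keysort; write lower half
        #              as final slice 1; ...
        #
        # so every final slice results from sorting two consecutive
        # pre-sorted slices together.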
        # The `tmp` argument is a flag selecting whether the input
        # slices are read from the secondary (sorted2/indices2) arrays;
        # capture it before the name is reused for the temporary
        # storage group below (the original code shadowed it, making
        # the `else` branch unreachable).
        use_tmp = tmp
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        if use_tmp:
            tmp_sorted = tmp.sorted2
            tmp_indices = tmp.indices2
        else:
            tmp_sorted = tmp.sorted
            tmp_indices = tmp.indices
        cs = self.chunksize
        ss = self.slicesize
        nsb = self.blocksize // self.slicesize
        nslices = self.nslices
        nblocks = self.nblocks
        nelementsLR = self.nelementsILR
        # Create the buffer for reordering 2 slices at a time
        ssorted = numpy.empty(shape=ss * 2, dtype=self.dtype)
        sindices = numpy.empty(shape=ss * 2,
                               dtype=numpy.dtype('u%d' % self.indsize))

        if self.indsize == 8:
            # Bootstrap the process for reordering
            # Read the first slice in buffers
            self.read_slice(tmp_sorted, 0, ssorted[:ss])
            self.read_slice(tmp_indices, 0, sindices[:ss])

            nslice = 0   # in case the loop below executes nothing
            # Loop over the remaining slices in the block
            for nslice in range(1, sorted.nrows):
                self.reorder_slice(nslice, sorted, indices,
                                   ssorted, sindices,
                                   tmp_sorted, tmp_indices)

            # End the process (enrolling the last row if necessary)
            if nelementsLR > 0:
                sortedLR = self.tmp.sortedLR
                indicesLR = self.tmp.indicesLR
                # Shrink the ssorted and sindices arrays to the minimum
                ssorted2 = ssorted[:ss + nelementsLR]
                sortedlr = ssorted2[ss:]
                sindices2 = sindices[:ss + nelementsLR]
                indiceslr = sindices2[ss:]
                # Read the last row info in the second part of the buffer
                self.read_slice_lr(sortedLR, sortedlr)
                self.read_slice_lr(indicesLR, indiceslr)
                indexesextension.keysort(ssorted2, sindices2)
                # Write the second part of the buffers to the lastrow indices
                self.write_sliceLR(sortedLR, sortedlr)
                self.write_sliceLR(indicesLR, indiceslr)
                # Update the caches for last row
                bebounds = numpy.concatenate((sortedlr[::cs], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds
            # Write the first part of the buffers to the regular leaves
            self.write_slice(sorted, nslice, ssorted[:ss])
            self.write_slice(indices, nslice, sindices[:ss])
            # Update caches for this slice
            self.update_caches(nslice, ssorted[:ss])
        else:
            # Iterate over each block.  No data should cross block
            # boundaries to avoid addressing problems with short indices.
            for nb in range(nblocks):
                # Bootstrap the process for reordering
                # Read the first slice in buffers
                nrow = nb * nsb
                self.read_slice(tmp_sorted, nrow, ssorted[:ss])
                self.read_slice(tmp_indices, nrow, sindices[:ss])

                # Loop over the remaining slices in the block
                lrb = nrow + nsb
                if lrb > nslices:
                    lrb = nslices
                nslice = nrow   # in case the loop below executes nothing
                for nslice in range(nrow + 1, lrb):
                    self.reorder_slice(nslice, sorted, indices,
                                       ssorted, sindices,
                                       tmp_sorted, tmp_indices)

                # Write the first part of the buffers to the regular leaves
                self.write_slice(sorted, nslice, ssorted[:ss])
                self.write_slice(indices, nslice, sindices[:ss])
                # Update caches for this slice
                self.update_caches(nslice, ssorted[:ss])

    def swap_slices(self, mode="median"):
        """Swap slices in a superblock."""

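        # The goal (illustrative): within each superblock, rewrite the
        # slices in ascending order of a per-slice key -- the start
        # value, stop value or median value, depending on `mode` -- so
        # that slices covering similar value ranges end up next to each
        # other.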
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        ncs = self.nchunkslice
        nss = self.superblocksize // self.slicesize
        nss2 = nss
        for sblock in range(self.nsuperblocks):
            # Protection for the last superblock having fewer slices than nss
            remainingslices = self.nslices - sblock * nss
            if remainingslices < nss:
                nss2 = remainingslices
            if nss2 <= 1:
                break
            if mode == "start":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0]
            elif mode == "stop":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1]
            elif mode == "median":
                ranges = tmp.mranges[sblock * nss:sblock * nss + nss2]
            sranges_idx = ranges.argsort(kind=defsort)
            # Don't swap the superblock at all if there is no need to
            ndiff = (sranges_idx != numpy.arange(nss2)).sum() / 2
            if ndiff * 50 < nss2:
                # The number of slices to rearrange is less than 2%,
                # so skip the reordering of this superblock
                # (too expensive for such a small improvement)
                if self.verbose:
                    print("skipping reordering of superblock ->", sblock)
                continue
            # Base offset of this superblock, in slices (use nss here:
            # all the superblocks before this one are full-size)
            ns = sblock * nss
            # Swap sorted and indices slices following the new order
            for i in range(nss2):
                idx = sranges_idx[i]
                # Swap sorted & indices slices
                oi = ns + i
                oidx = ns + idx
                tmp_sorted[oi] = sorted[oidx]
                tmp_indices[oi] = indices[oidx]
                # Swap start, stop & median ranges
                tmp.ranges2[oi] = tmp.ranges[oidx]
                tmp.mranges2[oi] = tmp.mranges[oidx]
                # Swap chunk bounds
                tmp.bounds2[oi] = tmp.bounds[oidx]
                # Swap start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                xj = oidx * ncs
                xjn = (oidx + 1) * ncs
                tmp.abounds2[j:jn] = tmp.abounds[xj:xjn]
                tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn]
                tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn]
            # tmp -> originals
            for i in range(nss2):
                # Copy sorted & indices slices
                oi = ns + i
                sorted[oi] = tmp_sorted[oi]
                indices[oi] = tmp_indices[oi]
                # Copy start, stop & median ranges
                tmp.ranges[oi] = tmp.ranges2[oi]
                tmp.mranges[oi] = tmp.mranges2[oi]
                # Copy chunk bounds
                tmp.bounds[oi] = tmp.bounds2[oi]
                # Copy start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                tmp.abounds[j:jn] = tmp.abounds2[j:jn]
                tmp.zbounds[j:jn] = tmp.zbounds2[j:jn]
                tmp.mbounds[j:jn] = tmp.mbounds2[j:jn]

    def search_item_lt(self, where, item, nslice, limits, start=0):
        """Search a single item in a specific sorted slice."""

        # This method only works under the assumption that `item`
        # *is to be found* within slice `nslice`.
        assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1])
        cs = self.chunksize
        ss = self.slicesize
        nelementsLR = self.nelementsILR
        bstart = start // cs

        # Find the chunk
        if nslice < self.nslices:
            nchunk = bisect_left(where.bounds[nslice], item, bstart)
        else:
            # We need to subtract 1 chunk here because bebounds
            # has a leading value
            nchunk = bisect_left(self.bebounds, item, bstart) - 1
        assert nchunk >= 0

        # Find the element in the chunk
        pos = nchunk * cs
        if nslice < self.nslices:
            pos += bisect_left(where.sorted[nslice, pos:pos + cs], item)
            assert pos <= ss
        else:
            end = pos + cs
            if end > nelementsLR:
                end = nelementsLR
            pos += bisect_left(self.sortedLR[pos:end], item)
            assert pos <= nelementsLR
        assert pos > 0
        return pos

    def compute_overlaps_finegrain(self, where, message, verbose):
        """Compute statistics about overlapping of slices in the index.

        It returns the following info:

        noverlaps : int
            The total number of elements that overlap in the index.
        multiplicity : array of int
            The number of times that a concrete slice overlaps with any other.
        toverlap : float
            An overlap index: the sum of the overlapping value ranges
            between slices, divided by the entire range of values.  This
            index is only computed for numerical types.

        """

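        # Worked example (illustrative): for two slices with value
        # ranges (0, 10) and (5, 15), the elements of the second slice
        # that are smaller than 10 are counted as overlapping elements,
        # and toverlap = (10 - 5) / (15 - 0) ~ 0.33.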
        ss = self.slicesize
        ranges = where.ranges[:]
        sorted = where.sorted
        sortedLR = where.sortedLR
        nslices = self.nslices
        nelementsLR = self.nelementsILR
        if nelementsLR > 0:
            # Add the ranges corresponding to the last row
            rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]])
            ranges = numpy.concatenate((ranges, [rangeslr]))
            nslices += 1
        soverlap = 0.
        toverlap = -1.
        multiplicity = numpy.zeros(shape=nslices, dtype="int_")
        overlaps = multiplicity.copy()
        starts = multiplicity.copy()
        for i in range(nslices):
            prev_end = ranges[i, 1]
            for j in range(i + 1, nslices):
                stj = starts[j]
                assert stj <= ss
                if stj == ss:
                    # This slice has already been counted
                    continue
                if j < self.nslices:
                    next_beg = sorted[j, stj]
                else:
                    next_beg = sortedLR[stj]
                next_end = ranges[j, 1]
                if prev_end > next_end:
                    # Complete overlapping case
                    multiplicity[j - i] += 1
                    if j < self.nslices:
                        overlaps[i] += ss - stj
                        starts[j] = ss   # a sentinel
                    else:
                        overlaps[i] += nelementsLR - stj
                        starts[j] = nelementsLR   # a sentinel
                elif prev_end > next_beg:
                    multiplicity[j - i] += 1
                    idx = self.search_item_lt(
                        where, prev_end, j, ranges[j], stj)
                    nelem = idx - stj
                    overlaps[i] += nelem
                    starts[j] = idx
                    if self.type != "string":
                        # Convert ranges into floats in order to allow
                        # doing operations with them without overflows
                        soverlap += float(ranges[i, 1]) - float(ranges[j, 0])

        # Return the overlap as the ratio between overlaps and entire range
        if self.type != "string":
            erange = float(ranges[-1, 1]) - float(ranges[0, 0])
            # Check that there is an effective range of values.
            # Beware, erange can be negative in situations where the
            # values are overflowing.  This can happen especially with
            # big signed integer values (on overflow, the end value
            # becomes negative!).
            # Also, there is no way to compute overlap ratios for
            # non-numerical types.  So, be careful and always check
            # that toverlap has a positive value (it must have been
            # initialized to -1. before) before using it.
            # F. Alted 2007-01-19
            if erange > 0:
                toverlap = soverlap / erange
        if verbose and message != "init":
            print("toverlap (%s):" % message, toverlap)
            print("multiplicity:\n", multiplicity, multiplicity.sum())
            print("overlaps:\n", overlaps, overlaps.sum())
        noverlaps = overlaps.sum()
        # For full indexes, set the 'is_csi' flag
        if self.indsize == 8 and self._v_file._iswritable():
            self._v_attrs.is_csi = (noverlaps == 0)
        # Save the number of overlaps for future reference
        self.noverlaps = noverlaps
        return (noverlaps, multiplicity, toverlap)

    def compute_overlaps(self, where, message, verbose):
        """Compute statistics about overlapping of slices in the index.

        It returns the following info:

        noverlaps : int
            The total number of slices that overlap in the index.
        multiplicity : array of int
            The number of times that a concrete slice overlaps with any other.
        toverlap : float
            An overlap index: the sum of the overlapping value ranges
            between slices, divided by the entire range of values.  This
            index is only computed for numerical types.

        """

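        # Coarse-grained counterpart of compute_overlaps_finegrain():
        # here each pair of overlapping slices counts as a single
        # overlap, no matter how many elements are actually involved.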
        ranges = where.ranges[:]
        nslices = self.nslices
        if self.nelementsILR > 0:
            # Add the ranges corresponding to the last row
            rangeslr = numpy.array([self.bebounds[0], self.bebounds[-1]])
            ranges = numpy.concatenate((ranges, [rangeslr]))
            nslices += 1
        noverlaps = 0
        soverlap = 0.
        toverlap = -1.
        multiplicity = numpy.zeros(shape=nslices, dtype="int_")
        for i in range(nslices):
            for j in range(i + 1, nslices):
                if ranges[i, 1] > ranges[j, 0]:
                    noverlaps += 1
                    multiplicity[j - i] += 1
                    if self.type != "string":
                        # Convert ranges into floats in order to allow
                        # doing operations with them without overflows
                        soverlap += float(ranges[i, 1]) - float(ranges[j, 0])

        # Return the overlap as the ratio between overlaps and entire range
        if self.type != "string":
            erange = float(ranges[-1, 1]) - float(ranges[0, 0])
            # Check that there is an effective range of values.
            # Beware, erange can be negative in situations where the
            # values are overflowing.  This can happen especially with
            # big signed integer values (on overflow, the end value
            # becomes negative!).
            # Also, there is no way to compute overlap ratios for
            # non-numerical types.  So, be careful and always check
            # that toverlap has a positive value (it must have been
            # initialized to -1. before) before using it.
            # F. Alted 2007-01-19
            if erange > 0:
                toverlap = soverlap / erange
        if verbose:
            print("overlaps (%s):" % message, noverlaps, toverlap)
            print(multiplicity)
        # For full indexes, set the 'is_csi' flag
        if self.indsize == 8 and self._v_file._iswritable():
            self._v_attrs.is_csi = (noverlaps == 0)
        # Save the number of overlaps for future reference
        self.noverlaps = noverlaps
        return (noverlaps, multiplicity, toverlap)

    def read_sorted_indices(self, what, start, stop, step):
        """Return the sorted or indices values in the specified range."""
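        # Illustrative example: with slicesize ss = 5, start = 7 and
        # stop = 13, this reads positions 2..4 of slice 1 and positions
        # 0..2 of slice 2 into the output buffer.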
        (start, stop, step) = self._process_range(start, stop, step)
        if start >= stop:
            return numpy.empty(0, self.dtype)
        # Correction for negative values of step (reverse indices)
        if step < 0:
            tmp = start
            start = self.nelements - stop
            stop = self.nelements - tmp
        if what == "sorted":
            values = self.sorted
            valuesLR = self.sortedLR
            buffer_ = numpy.empty(stop - start, dtype=self.dtype)
        else:
            values = self.indices
            valuesLR = self.indicesLR
            buffer_ = numpy.empty(stop - start, dtype="u%d" % self.indsize)
        ss = self.slicesize
        nrow_start = start // ss
        istart = start % ss
        nrow_stop = stop // ss
        tlen = stop - start
        bstart = 0
        ilen = 0
        for nrow in range(nrow_start, nrow_stop + 1):
            blen = ss - istart
            if ilen + blen > tlen:
                blen = tlen - ilen
            if blen <= 0:
                break
            if nrow < self.nslices:
                self.read_slice(
                    values, nrow, buffer_[bstart:bstart + blen], istart)
            else:
                self.read_slice_lr(
                    valuesLR, buffer_[bstart:bstart + blen], istart)
            istart = 0
            bstart += blen
            ilen += blen
        return buffer_[::step]

    def read_sorted(self, start=None, stop=None, step=None):
        """Return the sorted values of the index in the specified range.

        The meaning of the start, stop and step arguments is the same as in
        :meth:`Table.read_sorted`.

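        Example of use (illustrative; assumes a table with an indexed
        column ``var1``)::

            svalues = table.cols.var1.index.read_sorted(0, 10)
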
        """

        return self.read_sorted_indices('sorted', start, stop, step)

    def read_indices(self, start=None, stop=None, step=None):
        """Return the indices values of the index in the specified range.

        The meaning of the start, stop and step arguments is the same as in
        :meth:`Table.read_sorted`.

        """

        return self.read_sorted_indices('indices', start, stop, step)

    def _process_range(self, start, stop, step):
        """Get a range specific for index usage."""

        if start is not None and stop is None:
            # Special case for the behaviour of PyTables iterators
            stop = idx2long(start + 1)
        if start is None:
            start = 0
        else:
            start = idx2long(start)
        if stop is None:
            stop = idx2long(self.nelements)
        else:
            stop = idx2long(stop)
        if step is None:
            step = 1
        else:
            step = idx2long(step)
        return (start, stop, step)

    def __getitem__(self, key):
        """Return the indices values of the index in the specified range.

        If the key argument is an integer, the corresponding index is
        returned.  If key is a slice, the range of indices determined by
        it is returned.  A negative value for the step in the slice is
        supported, meaning that the results will be returned in reverse
        order.

        This method is equivalent to :meth:`Index.read_indices`.

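        Example of use (illustrative; assumes a table with an indexed
        column ``var1``)::

            index = table.cols.var1.index
            index[0]     # first index
            index[-1]    # last index
            index[2:10]  # a range of indices
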
        """

        if is_idx(key):
            key = operator.index(key)

            if key < 0:
                # To support negative values
                key += self.nelements
            return self.read_indices(key, key + 1, 1)[0]
        elif isinstance(key, slice):
            return self.read_indices(key.start, key.stop, key.step)

    def __len__(self):
        return self.nelements

    def restorecache(self):
        "Clean the limits cache and resize starts and lengths arrays"

        params = self._v_file.params
        # The sorted IndexArray is absolutely required to be in memory
        # at the same time as the Index instance, so create a strong
        # reference to it.  We are not introducing leaks because the
        # strong reference will disappear when this Index instance is
        # closed.
        self._sorted = self.sorted
        self._sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'],
                                               params['BOUNDS_MAX_SIZE'],
                                               'non-opt types bounds')
        """A cache for the bounds (2nd hash) data. Only used for
        non-optimized types searches."""
        self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'],
                                          params['LIMBOUNDS_MAX_SIZE'],
                                          'bounding limits')
        """A cache for bounding limits."""
        self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'],
                                         params['SORTEDLR_MAX_SIZE'],
                                         'last row chunks')
        """A cache for the last row chunks. Only used for searches in
        the last row, and mainly useful for small indexes."""
        self.starts = numpy.empty(shape=self.nrows, dtype=numpy.int32)
        self.lengths = numpy.empty(shape=self.nrows, dtype=numpy.int32)
        self.sorted._init_sorted_slice(self)
        self.dirtycache = False

    def search(self, item):
        """Do a binary search in this index for an item."""

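        # `item` is a (low, high) tuple describing a *closed* search
        # range, as produced by get_lookup_range(); e.g. (4, 6) selects
        # every value v such that 4 <= v <= 6.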
        if profile:
            tref = time()
            show_stats("Entering search", tref)

        if self.dirtycache:
            self.restorecache()

        # An empty item or a left limit larger than the right one means
        # that the set of matching records is empty, so we avoid further
        # computation (including looking up the limits cache).
        if not item or item[0] > item[1]:
            self.starts[:] = 0
            self.lengths[:] = 0
            return 0

        tlen = 0
        # Check whether the item tuple is in the limits cache or not
        nslot = self.limboundscache.getslot(item)
        if nslot >= 0:
            startlengths = self.limboundscache.getitem(nslot)
            # Reset the lengths array (not necessary for starts)
            self.lengths[:] = 0
            # Now, set the interesting rows
            for nrow in range(len(startlengths)):
                nrow2, start, length = startlengths[nrow]
                self.starts[nrow2] = start
                self.lengths[nrow2] = length
                tlen = tlen + length
            return tlen
        # The item is not in cache. Do the real lookup.
        sorted = self.sorted
        if self.nslices > 0:
            if self.type in self.opt_search_types:
                # The following are optimizations.  However, they hide
                # the CPU function consumption from Python profiles.
                # You may want to deactivate them while profiling.
                if self.type == "int32":
                    tlen = sorted._search_bin_na_i(*item)
                elif self.type == "int64":
                    tlen = sorted._search_bin_na_ll(*item)
                elif self.type == "float16":
                    tlen = sorted._search_bin_na_e(*item)
                elif self.type == "float32":
                    tlen = sorted._search_bin_na_f(*item)
                elif self.type == "float64":
                    tlen = sorted._search_bin_na_d(*item)
                elif self.type in ("float96", "float128"):
                    tlen = sorted._search_bin_na_g(*item)
                elif self.type == "uint32":
                    tlen = sorted._search_bin_na_ui(*item)
                elif self.type == "uint64":
                    tlen = sorted._search_bin_na_ull(*item)
                elif self.type == "int8":
                    tlen = sorted._search_bin_na_b(*item)
                elif self.type == "int16":
                    tlen = sorted._search_bin_na_s(*item)
                elif self.type == "uint8":
                    tlen = sorted._search_bin_na_ub(*item)
                elif self.type == "uint16":
                    tlen = sorted._search_bin_na_us(*item)
                else:
                    assert False, "This can't happen!"
            else:
                tlen = self.search_scalar(item, sorted)
        # Get possible remaining values in the last row
        if self.nelementsSLR > 0:
            # Look for more indices in the last row
            (start, stop) = self.search_last_row(item)
            self.starts[-1] = start
            self.lengths[-1] = stop - start
            tlen += stop - start

        if self.limboundscache.couldenablecache():
            # Get a startlengths tuple and save it in cache.
            # This is quite slow, but it is a good way to compress
            # the bounds info.  Moreover, .couldenablecache() does a
            # good job of avoiding this computation when it is not
            # actually necessary.
            startlengths = []
            for nrow, length in enumerate(self.lengths):
                if length > 0:
                    startlengths.append((nrow, self.starts[nrow], length))
            # Compute the size of the recarray (approximately)
            # The +1 at the end is important to avoid 0 lengths
            # (remember, the object headers take some space)
            size = len(startlengths) * 8 * 2 + 1
            # Put this startlengths list in cache
            self.limboundscache.setitem(item, startlengths, size)

        if profile:
            show_stats("Exiting search", tref)
        return tlen

    # This is a scalar version of search.  It works with strings as well.
    def search_scalar(self, item, sorted):
        """Do a binary search in this index for an item."""

        tlen = 0
        # Do the lookup for values fulfilling the conditions
        for i in range(self.nslices):
            (start, stop) = sorted._search_bin(i, item)
            self.starts[i] = start
            self.lengths[i] = stop - start
            tlen += stop - start
        return tlen

    def search_last_row(self, item):
        # Variable initialization
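        # `bebounds` holds the first value of each chunk in the last row
        # plus the very last value (see the last-row update in
        # reorder_slices()), so b0/b1 delimit the whole last row and
        # `bounds` contains the inner chunk boundaries.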
        item1, item2 = item
        bebounds = self.bebounds
        b0, b1 = bebounds[0], bebounds[-1]
        bounds = bebounds[1:-1]
        itemsize = self.dtype.itemsize
        sortedLRcache = self.sortedLRcache
        hi = self.nelementsSLR               # maximum number of elements
        rchunksize = self.chunksize // self.reduction

        nchunk = -1
        # Lookup for item1
        if nan_aware_gt(item1, b0):
            if nan_aware_le(item1, b1):
                # Search the appropriate chunk in bounds cache
                nchunk = bisect_left(bounds, item1)
                # Lookup for this chunk in cache
                nslot = sortedLRcache.getslot(nchunk)
                if nslot >= 0:
                    chunk = sortedLRcache.getitem(nslot)
                else:
                    begin = rchunksize * nchunk
                    end = rchunksize * (nchunk + 1)
                    if end > hi:
                        end = hi
                    # Read the chunk from disk
                    chunk = self.sortedLR._read_sorted_slice(
                        self.sorted, begin, end)
                    # Put it in cache.  It's important to *copy*
                    # the buffer, as it is reused in future reads!
                    sortedLRcache.setitem(nchunk, chunk.copy(),
                                          (end - begin) * itemsize)
                start = bisect_left(chunk, item1)
                start += rchunksize * nchunk
            else:
                start = hi
        else:
            start = 0
        # Lookup for item2
        if nan_aware_ge(item2, b0):
            if nan_aware_lt(item2, b1):
                # Search the appropriate chunk in bounds cache
                nchunk2 = bisect_right(bounds, item2)
                if nchunk2 != nchunk:
                    # Lookup for this chunk in cache
                    nslot = sortedLRcache.getslot(nchunk2)
                    if nslot >= 0:
                        chunk = sortedLRcache.getitem(nslot)
                    else:
                        begin = rchunksize * nchunk2
                        end = rchunksize * (nchunk2 + 1)
                        if end > hi:
                            end = hi
                        # Read the chunk from disk
                        chunk = self.sortedLR._read_sorted_slice(
                            self.sorted, begin, end)
                        # Put it in cache.  It's important to *copy*
                        # the buffer, as it is reused in future reads!
                        # See bug #60 in xot.carabos.com
                        sortedLRcache.setitem(nchunk2, chunk.copy(),
                                              (end - begin) * itemsize)
                stop = bisect_right(chunk, item2)
                stop += rchunksize * nchunk2
            else:
                stop = hi
        else:
            stop = 0
        return (start, stop)

    def get_chunkmap(self):
        """Compute a map with the interesting chunks in the index."""

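        # The returned boolean array has one entry per bucket of
        # `lbucket` consecutive table rows; an entry is True when at
        # least one row in that bucket matches the last search (as
        # recorded in self.starts and self.lengths).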
        if profile:
            tref = time()
            show_stats("Entering get_chunkmap", tref)
        ss = self.slicesize
        nsb = self.nslicesblock
        nslices = self.nslices
        lbucket = self.lbucket
        indsize = self.indsize
        bucketsinblock = float(self.blocksize) / lbucket
        nchunks = int(math.ceil(float(self.nelements) / lbucket))
        chunkmap = numpy.zeros(shape=nchunks, dtype="bool")
        reduction = self.reduction
        starts = (self.starts - 1) * reduction + 1
        stops = (self.starts + self.lengths) * reduction
        starts[starts < 0] = 0    # All negative values set to zero
        indices = self.indices
        for nslice in range(self.nrows):
            start = starts[nslice]
            stop = stops[nslice]
            if stop > start:
                idx = numpy.empty(shape=stop - start, dtype='u%d' % indsize)
                if nslice < nslices:
                    indices._read_index_slice(nslice, start, stop, idx)
                else:
                    self.indicesLR._read_index_slice(start, stop, idx)
                if indsize == 8:
                    idx //= lbucket
                elif indsize == 2:
                    # The chunkmap size can never be larger than 'int_'
                    idx = idx.astype("int_")
                    offset = int((nslice // nsb) * bucketsinblock)
                    idx += offset
                elif indsize == 1:
                    # The chunkmap size can never be larger than 'int_'
                    idx = idx.astype("int_")
                    offset = (nslice * ss) // lbucket
                    idx += offset
                chunkmap[idx] = True
        # The case lbucket < nrowsinchunk should only happen in tests
        nrowsinchunk = self.nrowsinchunk
        if lbucket != nrowsinchunk:
            # Map the 'coarse grain' chunkmap into the 'true' chunkmap
            nelements = self.nelements
            tnchunks = int(math.ceil(float(nelements) / nrowsinchunk))
            tchunkmap = numpy.zeros(shape=tnchunks, dtype="bool")
            ratio = float(lbucket) / nrowsinchunk
            idx = chunkmap.nonzero()[0]
            starts = (idx * ratio).astype('int_')
            stops = numpy.ceil((idx + 1) * ratio).astype('int_')
            for i in range(len(idx)):
                tchunkmap[starts[i]:stops[i]] = True
            chunkmap = tchunkmap
        if profile:
            show_stats("Exiting get_chunkmap", tref)
        return chunkmap

    def get_lookup_range(self, ops, limits):
        assert len(ops) in [1, 2]
        assert len(limits) in [1, 2]
        assert len(ops) == len(limits)

        column = self.column
        coldtype = column.dtype.base
        itemsize = coldtype.itemsize

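        # Translate the query operators into a *closed* [low, high]
        # range of values.  Illustrative mapping, assuming an integer
        # column: ops=('gt', 'lt') with limits=(3, 7) yields (4, 6),
        # since nextafter() shifts a limit to the adjacent representable
        # value in the given direction.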
        if len(limits) == 1:
            assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt']
            limit = limits[0]
            op = ops[0]
            if op == 'lt':
                range_ = (inftype(coldtype, itemsize, sign=-1),
                          nextafter(limit, -1, coldtype, itemsize))
            elif op == 'le':
                range_ = (inftype(coldtype, itemsize, sign=-1),
                          limit)
            elif op == 'gt':
                range_ = (nextafter(limit, +1, coldtype, itemsize),
                          inftype(coldtype, itemsize, sign=+1))
            elif op == 'ge':
                range_ = (limit,
                          inftype(coldtype, itemsize, sign=+1))
            elif op == 'eq':
                range_ = (limit, limit)

        elif len(limits) == 2:
            assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le')

            lower, upper = limits
            if lower > upper:
                # ``a <[=] x <[=] b`` is always false if ``a > b``.
                return ()

            if ops == ('gt', 'lt'):  # lower < col < upper
                range_ = (nextafter(lower, +1, coldtype, itemsize),
                          nextafter(upper, -1, coldtype, itemsize))
            elif ops == ('ge', 'lt'):  # lower <= col < upper
                range_ = (lower, nextafter(upper, -1, coldtype, itemsize))
            elif ops == ('gt', 'le'):  # lower < col <= upper
                range_ = (nextafter(lower, +1, coldtype, itemsize), upper)
            elif ops == ('ge', 'le'):  # lower <= col <= upper
                range_ = (lower, upper)

        return range_

    def _f_remove(self, recursive=False):
        """Remove this Index object."""

        # Index removal is always recursive,
        # no matter what `recursive` says.
        super(Index, self)._f_remove(True)

    def __str__(self):
        """This provides a more compact representation than __repr__"""

        # The filters
        filters = ""
        if self.filters.complevel:
            if self.filters.shuffle:
                filters += ", shuffle"
            if self.filters.bitshuffle:
                filters += ", bitshuffle"
            filters += ", %s(%s)" % (self.filters.complib,
                                     self.filters.complevel)
        return "Index(%s, %s%s).is_csi=%s" % \
               (self.optlevel, self.kind, filters, self.is_csi)

    def __repr__(self):
        """This provides more metainfo than standard __repr__"""

        cpathname = self.table._v_pathname + ".cols." + self.column.pathname
        retstr = """%s (Index for column %s)
  optlevel := %s
  kind := %s
  filters := %s
  is_csi := %s
  nelements := %s
  chunksize := %s
  slicesize := %s
  blocksize := %s
  superblocksize := %s
  dirty := %s
  byteorder := %r""" % (self._v_pathname, cpathname,
                        self.optlevel, self.kind,
                        self.filters, self.is_csi,
                        self.nelements,
                        self.chunksize, self.slicesize,
                        self.blocksize, self.superblocksize,
                        self.dirty, self.byteorder)
        retstr += "\n  sorted := %s" % self.sorted
        retstr += "\n  indices := %s" % self.indices
        retstr += "\n  ranges := %s" % self.ranges
        retstr += "\n  bounds := %s" % self.bounds
        retstr += "\n  sortedLR := %s" % self.sortedLR
        retstr += "\n  indicesLR := %s" % self.indicesLR
        return retstr


class IndexesDescG(NotLoggedMixin, Group):
    _c_classid = 'DINDEX'

    def _g_width_warning(self):
        warnings.warn(
            "the number of indexed columns on a single description group "
            "is exceeding the recommended maximum (%d); "
            "be ready to see PyTables asking for *lots* of memory "
            "and possibly slow I/O" % self._v_max_group_width,
            PerformanceWarning)


class IndexesTableG(NotLoggedMixin, Group):
    _c_classid = 'TINDEX'

    @property
    def auto(self):
        if 'AUTO_INDEX' not in self._v_attrs:
            return default_auto_index
        return self._v_attrs.AUTO_INDEX

    @auto.setter
    def auto(self, auto):
        self._v_attrs.AUTO_INDEX = bool(auto)

    @auto.deleter
    def auto(self):
        del self._v_attrs.AUTO_INDEX

    def _g_width_warning(self):
        warnings.warn(
            "the number of indexed columns on a single table "
            "is exceeding the recommended maximum (%d); "
            "be ready to see PyTables asking for *lots* of memory "
            "and possibly slow I/O" % self._v_max_group_width,
            PerformanceWarning)

    def _g_check_name(self, name):
        if not name.startswith('_i_'):
            raise ValueError(
                "names of index groups must start with ``_i_``: %s" % name)

    @property
    def table(self):
        "Accessor for the `Table` object of this `IndexesTableG` container."
        names = self._v_pathname.split("/")
        tablename = names.pop()[3:]   # "_i_" is at the beginning
        parentpathname = "/".join(names)
        tablepathname = join_path(parentpathname, tablename)
        table = self._v_file._get_node(tablepathname)
        return table


class OldIndex(NotLoggedMixin, Group):
    """This is meant to hide indexes of PyTables 1.x files."""

    _c_classid = 'CINDEX'


## Local Variables:
## mode: python
## py-indent-offset: 4
## tab-width: 4
## fill-column: 72
## End: