numeric.py - OpenGrok cross reference for /dports/textproc/py-whoosh/Whoosh-2.7.4/src/whoosh/util/numeric.py

# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

import math, struct
from array import array
from bisect import bisect_left
from struct import pack, unpack

from whoosh.compat import b, long_type
from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort
from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
from whoosh.system import pack_float, unpack_float, pack_double, unpack_double


NaN = struct.unpack("<d", b('\xff\xff\xff\xff\xff\xff\xff\xff'))[0]

typecode_max = {"b": 127, "B": 255, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1,
                "i": 2 ** 31 - 1, "I": 2 ** 32 - 1,
                "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1}
typecode_min = {"b": 0 - 128, "B": 0, "h": 0 - 2 ** 15, "H": 0,
                "i": 0 - 2 ** 31, "I": 0,
                "q": 0 - 2 ** 63, "Q": 0}
typecode_pack = {"B": pack_byte, "H": pack_ushort, "i": pack_int,
                 "I": pack_uint, "q": pack_long, "Q": pack_ulong,
                 "f": pack_float, "d": pack_double}
typecode_unpack = {"B": unpack_byte, "H": unpack_ushort, "i": unpack_int,
                   "I": unpack_uint, "q": unpack_long, "Q": unpack_ulong,
                   "f": unpack_float, "d": unpack_double}


# Functions related to binary representations

def bits_required(maxnum):
    """Returns the number of bits required to represent the given (unsigned)
    integer.
    """

    return max(1, math.ceil(math.log(maxnum, 2)))


def typecode_required(maxnum):
    if maxnum < 256:
        return "B"
    elif maxnum < 2 ** 16:
        return "H"
    elif maxnum < 2 ** 31 - 1:
        return "i"
    elif maxnum < 2 ** 32:
        return "I"
    elif maxnum < 2 ** 63 - 1:
        return "q"
    else:
        return "Q"


def max_value(bitcount):
    """Returns the maximum (unsigned) integer representable in the given number
    of bits.
    """

    return ~(~0 << bitcount)


def bytes_for_bits(bitcount):
    r = int(math.ceil((bitcount + 1) / 8.0))
    return r


# Functions for converting numbers to and from sortable representations

_istruct = struct.Struct(">i")
_qstruct = struct.Struct(">q")
_dstruct = struct.Struct(">d")
_ipack, _iunpack = _istruct.pack, _istruct.unpack
_qpack, _qunpack = _qstruct.pack, _qstruct.unpack
_dpack, _dunpack = _dstruct.pack, _dstruct.unpack


def to_sortable(numtype, intsize, signed, x):
    if numtype is int or numtype is long_type:
        if signed:
            x += (1 << intsize - 1)
        return x
    else:
        return float_to_sortable_long(x, signed)


def from_sortable(numtype, intsize, signed, x):
    if numtype is int or numtype is long_type:
        if signed:
            x -= (1 << intsize - 1)
        return x
    else:
        return sortable_long_to_float(x, signed)


def float_to_sortable_long(x, signed):
    x = _qunpack(_dpack(x))[0]
    if x < 0:
        x ^= 0x7fffffffffffffff
    if signed:
        x += 1 << 63
    assert x >= 0
    return x


def sortable_long_to_float(x, signed):
    if signed:
        x -= 1 << 63
    if x < 0:
        x ^= 0x7fffffffffffffff
    x = _dunpack(_qpack(x))[0]
    return x


# Functions for generating tiered ranges

def split_ranges(intsize, step, start, end):
    """Splits a range of numbers (from ``start`` to ``end``, inclusive)
    into a sequence of trie ranges of the form ``(start, end, shift)``. The
    consumer of these tuples is expected to shift the ``start`` and ``end``
    right by ``shift``.

    This is used for generating term ranges for a numeric field. The queries
    for the edges of the range are generated at high precision and large blocks
    in the middle are generated at low precision.
    """

    shift = 0
    while True:
        diff = 1 << (shift + step)
        mask = ((1 << step) - 1) << shift
        setbits = lambda x: x | ((1 << shift) - 1)

        haslower = (start & mask) != 0
        hasupper = (end & mask) != mask

        not_mask = ~mask & ((1 << intsize + 1) - 1)
        nextstart = (start + diff if haslower else start) & not_mask
        nextend = (end - diff if hasupper else end) & not_mask

        if shift + step >= intsize or nextstart > nextend:
            yield (start, setbits(end), shift)
            break

        if haslower:
            yield (start, setbits(start | mask), shift)
        if hasupper:
            yield (end & not_mask, setbits(end), shift)

        start = nextstart
        end = nextend
        shift += step


def tiered_ranges(numtype, intsize, signed, start, end, shift_step,
                  startexcl, endexcl):
    assert numtype in (int, float)
    assert intsize in (8, 16, 32, 64)

    # Convert start and end values to sortable ints
    if start is None:
        start = 0
    else:
        start = to_sortable(numtype, intsize, signed, start)
        if startexcl:
            start += 1

    if end is None:
        end = 2 ** intsize - 1
    else:
        end = to_sortable(numtype, intsize, signed, end)
        if endexcl:
            end -= 1

    if not shift_step:
        return ((start, end, 0),)

    # Yield (rstart, rend, shift) ranges for the different resolutions
    return split_ranges(intsize, shift_step, start, end)


# Float-to-byte encoding/decoding

def float_to_byte(value, mantissabits=5, zeroexp=2):
    """Encodes a floating point number in a single byte.
    """

    # Assume int size == float size

    fzero = (63 - zeroexp) << mantissabits
    bits = unpack("i", pack("f", value))[0]
    smallfloat = bits >> (24 - mantissabits)
    if smallfloat < fzero:
        # Map negative numbers and 0 to 0
        # Map underflow to next smallest non-zero number
        if bits <= 0:
            result = chr(0)
        else:
            result = chr(1)
    elif smallfloat >= fzero + 0x100:
        # Map overflow to largest number
        result = chr(255)
    else:
        result = chr(smallfloat - fzero)
    return b(result)


def byte_to_float(b, mantissabits=5, zeroexp=2):
    """Decodes a floating point number stored in a single byte.
    """
    if type(b) is not int:
        b = ord(b)
    if b == 0:
        return 0.0

    bits = (b & 0xff) << (24 - mantissabits)
    bits += (63 - zeroexp) << 24
    return unpack("f", pack("i", bits))[0]


# Length-to-byte approximation functions

# Old implementation:

#def length_to_byte(length):
#    """Returns a logarithmic approximation of the given number, in the range
#    0-255. The approximation has high precision at the low end (e.g.
#    1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
#    equal to or greater than 108116 all approximate to 255.
#
#    This is useful for storing field lengths, where the general case is small
#    documents and very large documents are more rare.
#    """
#
#    # This encoding formula works up to 108116 -> 255, so if the length is
#    # equal to or greater than that limit, just return 255.
#    if length >= 108116:
#        return 255
#
#    # The parameters of this formula where chosen heuristically so that low
#    # numbers would approximate closely, and the byte range 0-255 would cover
#    # a decent range of document lengths (i.e. 1 to ~100000).
#    return int(round(log((length / 27.0) + 1, 1.033)))
#def _byte_to_length(n):
#    return int(round((pow(1.033, n) - 1) * 27))
#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
#byte_to_length = _b2l_cache.__getitem__

# New implementation

# Instead of computing the actual formula to get the byte for any given length,
# precompute the length associated with each byte, and use bisect to find the
# nearest value. This gives quite a large speed-up.
#
# Note that this does not give all the same answers as the old, "real"
# implementation since this implementation always "rounds down" (thanks to the
# bisect_left) while the old implementation would "round up" or "round down"
# depending on the input. Since this is a fairly gross approximation anyway,
# I don't think it matters much.

# Values generated using the formula from the "old" implementation above
_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
16, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
54, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
1103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
1696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
2600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
3980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
6084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
9293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
13294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
19011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
27183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
38863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
55556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
79414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])


def length_to_byte(length):
    if length is None:
        return 0
    if length >= 106374:
        return 255
    else:
        return bisect_left(_length_byte_cache, length)

byte_to_length = _length_byte_cache.__getitem__