1# Copyright 2010 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6#    1. Redistributions of source code must retain the above copyright notice,
7#       this list of conditions and the following disclaimer.
8#
9#    2. Redistributions in binary form must reproduce the above copyright
10#       notice, this list of conditions and the following disclaimer in the
11#       documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28import math, struct
29from array import array
30from bisect import bisect_left
31from struct import pack, unpack
32
33from whoosh.compat import b, long_type
34from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort
35from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint
36from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong
37from whoosh.system import pack_float, unpack_float, pack_double, unpack_double
38
39
40NaN = struct.unpack("<d", b('\xff\xff\xff\xff\xff\xff\xff\xff'))[0]
41
42typecode_max = {"b": 127, "B": 255, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1,
43                "i": 2 ** 31 - 1, "I": 2 ** 32 - 1,
44                "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1}
45typecode_min = {"b": 0 - 128, "B": 0, "h": 0 - 2 ** 15, "H": 0,
46                "i": 0 - 2 ** 31, "I": 0,
47                "q": 0 - 2 ** 63, "Q": 0}
48typecode_pack = {"B": pack_byte, "H": pack_ushort, "i": pack_int,
49                 "I": pack_uint, "q": pack_long, "Q": pack_ulong,
50                 "f": pack_float, "d": pack_double}
51typecode_unpack = {"B": unpack_byte, "H": unpack_ushort, "i": unpack_int,
52                   "I": unpack_uint, "q": unpack_long, "Q": unpack_ulong,
53                   "f": unpack_float, "d": unpack_double}
54
55
56# Functions related to binary representations
57
58def bits_required(maxnum):
59    """Returns the number of bits required to represent the given (unsigned)
60    integer.
61    """
62
63    return max(1, math.ceil(math.log(maxnum, 2)))
64
65
66def typecode_required(maxnum):
67    if maxnum < 256:
68        return "B"
69    elif maxnum < 2 ** 16:
70        return "H"
71    elif maxnum < 2 ** 31 - 1:
72        return "i"
73    elif maxnum < 2 ** 32:
74        return "I"
75    elif maxnum < 2 ** 63 - 1:
76        return "q"
77    else:
78        return "Q"
79
80
81def max_value(bitcount):
82    """Returns the maximum (unsigned) integer representable in the given number
83    of bits.
84    """
85
86    return ~(~0 << bitcount)
87
88
89def bytes_for_bits(bitcount):
90    r = int(math.ceil((bitcount + 1) / 8.0))
91    return r
92
93
94# Functions for converting numbers to and from sortable representations
95
96_istruct = struct.Struct(">i")
97_qstruct = struct.Struct(">q")
98_dstruct = struct.Struct(">d")
99_ipack, _iunpack = _istruct.pack, _istruct.unpack
100_qpack, _qunpack = _qstruct.pack, _qstruct.unpack
101_dpack, _dunpack = _dstruct.pack, _dstruct.unpack
102
103
104def to_sortable(numtype, intsize, signed, x):
105    if numtype is int or numtype is long_type:
106        if signed:
107            x += (1 << intsize - 1)
108        return x
109    else:
110        return float_to_sortable_long(x, signed)
111
112
113def from_sortable(numtype, intsize, signed, x):
114    if numtype is int or numtype is long_type:
115        if signed:
116            x -= (1 << intsize - 1)
117        return x
118    else:
119        return sortable_long_to_float(x, signed)
120
121
122def float_to_sortable_long(x, signed):
123    x = _qunpack(_dpack(x))[0]
124    if x < 0:
125        x ^= 0x7fffffffffffffff
126    if signed:
127        x += 1 << 63
128    assert x >= 0
129    return x
130
131
132def sortable_long_to_float(x, signed):
133    if signed:
134        x -= 1 << 63
135    if x < 0:
136        x ^= 0x7fffffffffffffff
137    x = _dunpack(_qpack(x))[0]
138    return x
139
140
141# Functions for generating tiered ranges
142
143def split_ranges(intsize, step, start, end):
144    """Splits a range of numbers (from ``start`` to ``end``, inclusive)
145    into a sequence of trie ranges of the form ``(start, end, shift)``. The
146    consumer of these tuples is expected to shift the ``start`` and ``end``
147    right by ``shift``.
148
149    This is used for generating term ranges for a numeric field. The queries
150    for the edges of the range are generated at high precision and large blocks
151    in the middle are generated at low precision.
152    """
153
154    shift = 0
155    while True:
156        diff = 1 << (shift + step)
157        mask = ((1 << step) - 1) << shift
158        setbits = lambda x: x | ((1 << shift) - 1)
159
160        haslower = (start & mask) != 0
161        hasupper = (end & mask) != mask
162
163        not_mask = ~mask & ((1 << intsize + 1) - 1)
164        nextstart = (start + diff if haslower else start) & not_mask
165        nextend = (end - diff if hasupper else end) & not_mask
166
167        if shift + step >= intsize or nextstart > nextend:
168            yield (start, setbits(end), shift)
169            break
170
171        if haslower:
172            yield (start, setbits(start | mask), shift)
173        if hasupper:
174            yield (end & not_mask, setbits(end), shift)
175
176        start = nextstart
177        end = nextend
178        shift += step
179
180
181def tiered_ranges(numtype, intsize, signed, start, end, shift_step,
182                  startexcl, endexcl):
183    assert numtype in (int, float)
184    assert intsize in (8, 16, 32, 64)
185
186    # Convert start and end values to sortable ints
187    if start is None:
188        start = 0
189    else:
190        start = to_sortable(numtype, intsize, signed, start)
191        if startexcl:
192            start += 1
193
194    if end is None:
195        end = 2 ** intsize - 1
196    else:
197        end = to_sortable(numtype, intsize, signed, end)
198        if endexcl:
199            end -= 1
200
201    if not shift_step:
202        return ((start, end, 0),)
203
204    # Yield (rstart, rend, shift) ranges for the different resolutions
205    return split_ranges(intsize, shift_step, start, end)
206
207
208# Float-to-byte encoding/decoding
209
210def float_to_byte(value, mantissabits=5, zeroexp=2):
211    """Encodes a floating point number in a single byte.
212    """
213
214    # Assume int size == float size
215
216    fzero = (63 - zeroexp) << mantissabits
217    bits = unpack("i", pack("f", value))[0]
218    smallfloat = bits >> (24 - mantissabits)
219    if smallfloat < fzero:
220        # Map negative numbers and 0 to 0
221        # Map underflow to next smallest non-zero number
222        if bits <= 0:
223            result = chr(0)
224        else:
225            result = chr(1)
226    elif smallfloat >= fzero + 0x100:
227        # Map overflow to largest number
228        result = chr(255)
229    else:
230        result = chr(smallfloat - fzero)
231    return b(result)
232
233
234def byte_to_float(b, mantissabits=5, zeroexp=2):
235    """Decodes a floating point number stored in a single byte.
236    """
237    if type(b) is not int:
238        b = ord(b)
239    if b == 0:
240        return 0.0
241
242    bits = (b & 0xff) << (24 - mantissabits)
243    bits += (63 - zeroexp) << 24
244    return unpack("f", pack("i", bits))[0]
245
246
247# Length-to-byte approximation functions
248
249# Old implementation:
250
251#def length_to_byte(length):
252#    """Returns a logarithmic approximation of the given number, in the range
253#    0-255. The approximation has high precision at the low end (e.g.
254#    1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers
255#    equal to or greater than 108116 all approximate to 255.
256#
257#    This is useful for storing field lengths, where the general case is small
258#    documents and very large documents are more rare.
259#    """
260#
261#    # This encoding formula works up to 108116 -> 255, so if the length is
262#    # equal to or greater than that limit, just return 255.
263#    if length >= 108116:
264#        return 255
265#
266#    # The parameters of this formula where chosen heuristically so that low
267#    # numbers would approximate closely, and the byte range 0-255 would cover
268#    # a decent range of document lengths (i.e. 1 to ~100000).
269#    return int(round(log((length / 27.0) + 1, 1.033)))
270#def _byte_to_length(n):
271#    return int(round((pow(1.033, n) - 1) * 27))
272#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256)))
273#byte_to_length = _b2l_cache.__getitem__
274
275# New implementation
276
277# Instead of computing the actual formula to get the byte for any given length,
278# precompute the length associated with each byte, and use bisect to find the
279# nearest value. This gives quite a large speed-up.
280#
281# Note that this does not give all the same answers as the old, "real"
282# implementation since this implementation always "rounds down" (thanks to the
283# bisect_left) while the old implementation would "round up" or "round down"
284# depending on the input. Since this is a fairly gross approximation anyway,
285# I don't think it matters much.
286
287# Values generated using the formula from the "old" implementation above
288_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14,
28916, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52,
29054, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114,
291119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211,
292219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373,
293386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645,
294667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066,
2951103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641,
2961696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516,
2972600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852,
2983980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889,
2996084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995,
3009293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868,
30113294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403,
30219011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314,
30327183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620,
30438863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780,
30555556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876,
30679414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374])
307
308
309def length_to_byte(length):
310    if length is None:
311        return 0
312    if length >= 106374:
313        return 255
314    else:
315        return bisect_left(_length_byte_cache, length)
316
317byte_to_length = _length_byte_cache.__getitem__
318