1# Copyright 2010 Matt Chaput. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are met: 5# 6# 1. Redistributions of source code must retain the above copyright notice, 7# this list of conditions and the following disclaimer. 8# 9# 2. Redistributions in binary form must reproduce the above copyright 10# notice, this list of conditions and the following disclaimer in the 11# documentation and/or other materials provided with the distribution. 12# 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23# 24# The views and conclusions contained in the software and documentation are 25# those of the authors and should not be interpreted as representing official 26# policies, either expressed or implied, of Matt Chaput. 27 28import math, struct 29from array import array 30from bisect import bisect_left 31from struct import pack, unpack 32 33from whoosh.compat import b, long_type 34from whoosh.system import pack_byte, unpack_byte, pack_ushort, unpack_ushort 35from whoosh.system import pack_int, unpack_int, pack_uint, unpack_uint 36from whoosh.system import pack_long, unpack_long, pack_ulong, unpack_ulong 37from whoosh.system import pack_float, unpack_float, pack_double, unpack_double 38 39 40NaN = struct.unpack("<d", b('\xff\xff\xff\xff\xff\xff\xff\xff'))[0] 41 42typecode_max = {"b": 127, "B": 255, "h": 2 ** 15 - 1, "H": 2 ** 16 - 1, 43 "i": 2 ** 31 - 1, "I": 2 ** 32 - 1, 44 "q": 2 ** 63 - 1, "Q": 2 ** 64 - 1} 45typecode_min = {"b": 0 - 128, "B": 0, "h": 0 - 2 ** 15, "H": 0, 46 "i": 0 - 2 ** 31, "I": 0, 47 "q": 0 - 2 ** 63, "Q": 0} 48typecode_pack = {"B": pack_byte, "H": pack_ushort, "i": pack_int, 49 "I": pack_uint, "q": pack_long, "Q": pack_ulong, 50 "f": pack_float, "d": pack_double} 51typecode_unpack = {"B": unpack_byte, "H": unpack_ushort, "i": unpack_int, 52 "I": unpack_uint, "q": unpack_long, "Q": unpack_ulong, 53 "f": unpack_float, "d": unpack_double} 54 55 56# Functions related to binary representations 57 58def bits_required(maxnum): 59 """Returns the number of bits required to represent the given (unsigned) 60 integer. 61 """ 62 63 return max(1, math.ceil(math.log(maxnum, 2))) 64 65 66def typecode_required(maxnum): 67 if maxnum < 256: 68 return "B" 69 elif maxnum < 2 ** 16: 70 return "H" 71 elif maxnum < 2 ** 31 - 1: 72 return "i" 73 elif maxnum < 2 ** 32: 74 return "I" 75 elif maxnum < 2 ** 63 - 1: 76 return "q" 77 else: 78 return "Q" 79 80 81def max_value(bitcount): 82 """Returns the maximum (unsigned) integer representable in the given number 83 of bits. 84 """ 85 86 return ~(~0 << bitcount) 87 88 89def bytes_for_bits(bitcount): 90 r = int(math.ceil((bitcount + 1) / 8.0)) 91 return r 92 93 94# Functions for converting numbers to and from sortable representations 95 96_istruct = struct.Struct(">i") 97_qstruct = struct.Struct(">q") 98_dstruct = struct.Struct(">d") 99_ipack, _iunpack = _istruct.pack, _istruct.unpack 100_qpack, _qunpack = _qstruct.pack, _qstruct.unpack 101_dpack, _dunpack = _dstruct.pack, _dstruct.unpack 102 103 104def to_sortable(numtype, intsize, signed, x): 105 if numtype is int or numtype is long_type: 106 if signed: 107 x += (1 << intsize - 1) 108 return x 109 else: 110 return float_to_sortable_long(x, signed) 111 112 113def from_sortable(numtype, intsize, signed, x): 114 if numtype is int or numtype is long_type: 115 if signed: 116 x -= (1 << intsize - 1) 117 return x 118 else: 119 return sortable_long_to_float(x, signed) 120 121 122def float_to_sortable_long(x, signed): 123 x = _qunpack(_dpack(x))[0] 124 if x < 0: 125 x ^= 0x7fffffffffffffff 126 if signed: 127 x += 1 << 63 128 assert x >= 0 129 return x 130 131 132def sortable_long_to_float(x, signed): 133 if signed: 134 x -= 1 << 63 135 if x < 0: 136 x ^= 0x7fffffffffffffff 137 x = _dunpack(_qpack(x))[0] 138 return x 139 140 141# Functions for generating tiered ranges 142 143def split_ranges(intsize, step, start, end): 144 """Splits a range of numbers (from ``start`` to ``end``, inclusive) 145 into a sequence of trie ranges of the form ``(start, end, shift)``. The 146 consumer of these tuples is expected to shift the ``start`` and ``end`` 147 right by ``shift``. 148 149 This is used for generating term ranges for a numeric field. The queries 150 for the edges of the range are generated at high precision and large blocks 151 in the middle are generated at low precision. 152 """ 153 154 shift = 0 155 while True: 156 diff = 1 << (shift + step) 157 mask = ((1 << step) - 1) << shift 158 setbits = lambda x: x | ((1 << shift) - 1) 159 160 haslower = (start & mask) != 0 161 hasupper = (end & mask) != mask 162 163 not_mask = ~mask & ((1 << intsize + 1) - 1) 164 nextstart = (start + diff if haslower else start) & not_mask 165 nextend = (end - diff if hasupper else end) & not_mask 166 167 if shift + step >= intsize or nextstart > nextend: 168 yield (start, setbits(end), shift) 169 break 170 171 if haslower: 172 yield (start, setbits(start | mask), shift) 173 if hasupper: 174 yield (end & not_mask, setbits(end), shift) 175 176 start = nextstart 177 end = nextend 178 shift += step 179 180 181def tiered_ranges(numtype, intsize, signed, start, end, shift_step, 182 startexcl, endexcl): 183 assert numtype in (int, float) 184 assert intsize in (8, 16, 32, 64) 185 186 # Convert start and end values to sortable ints 187 if start is None: 188 start = 0 189 else: 190 start = to_sortable(numtype, intsize, signed, start) 191 if startexcl: 192 start += 1 193 194 if end is None: 195 end = 2 ** intsize - 1 196 else: 197 end = to_sortable(numtype, intsize, signed, end) 198 if endexcl: 199 end -= 1 200 201 if not shift_step: 202 return ((start, end, 0),) 203 204 # Yield (rstart, rend, shift) ranges for the different resolutions 205 return split_ranges(intsize, shift_step, start, end) 206 207 208# Float-to-byte encoding/decoding 209 210def float_to_byte(value, mantissabits=5, zeroexp=2): 211 """Encodes a floating point number in a single byte. 212 """ 213 214 # Assume int size == float size 215 216 fzero = (63 - zeroexp) << mantissabits 217 bits = unpack("i", pack("f", value))[0] 218 smallfloat = bits >> (24 - mantissabits) 219 if smallfloat < fzero: 220 # Map negative numbers and 0 to 0 221 # Map underflow to next smallest non-zero number 222 if bits <= 0: 223 result = chr(0) 224 else: 225 result = chr(1) 226 elif smallfloat >= fzero + 0x100: 227 # Map overflow to largest number 228 result = chr(255) 229 else: 230 result = chr(smallfloat - fzero) 231 return b(result) 232 233 234def byte_to_float(b, mantissabits=5, zeroexp=2): 235 """Decodes a floating point number stored in a single byte. 236 """ 237 if type(b) is not int: 238 b = ord(b) 239 if b == 0: 240 return 0.0 241 242 bits = (b & 0xff) << (24 - mantissabits) 243 bits += (63 - zeroexp) << 24 244 return unpack("f", pack("i", bits))[0] 245 246 247# Length-to-byte approximation functions 248 249# Old implementation: 250 251#def length_to_byte(length): 252# """Returns a logarithmic approximation of the given number, in the range 253# 0-255. The approximation has high precision at the low end (e.g. 254# 1 -> 0, 2 -> 1, 3 -> 2 ...) and low precision at the high end. Numbers 255# equal to or greater than 108116 all approximate to 255. 256# 257# This is useful for storing field lengths, where the general case is small 258# documents and very large documents are more rare. 259# """ 260# 261# # This encoding formula works up to 108116 -> 255, so if the length is 262# # equal to or greater than that limit, just return 255. 263# if length >= 108116: 264# return 255 265# 266# # The parameters of this formula where chosen heuristically so that low 267# # numbers would approximate closely, and the byte range 0-255 would cover 268# # a decent range of document lengths (i.e. 1 to ~100000). 269# return int(round(log((length / 27.0) + 1, 1.033))) 270#def _byte_to_length(n): 271# return int(round((pow(1.033, n) - 1) * 27)) 272#_b2l_cache = array("i", (_byte_to_length(i) for i in xrange(256))) 273#byte_to_length = _b2l_cache.__getitem__ 274 275# New implementation 276 277# Instead of computing the actual formula to get the byte for any given length, 278# precompute the length associated with each byte, and use bisect to find the 279# nearest value. This gives quite a large speed-up. 280# 281# Note that this does not give all the same answers as the old, "real" 282# implementation since this implementation always "rounds down" (thanks to the 283# bisect_left) while the old implementation would "round up" or "round down" 284# depending on the input. Since this is a fairly gross approximation anyway, 285# I don't think it matters much. 286 287# Values generated using the formula from the "old" implementation above 288_length_byte_cache = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 28916, 17, 18, 20, 21, 23, 25, 26, 28, 30, 32, 34, 36, 38, 40, 42, 45, 47, 49, 52, 29054, 57, 60, 63, 66, 69, 72, 75, 79, 82, 86, 89, 93, 97, 101, 106, 110, 114, 291119, 124, 129, 134, 139, 145, 150, 156, 162, 169, 175, 182, 189, 196, 203, 211, 292219, 227, 235, 244, 253, 262, 271, 281, 291, 302, 313, 324, 336, 348, 360, 373, 293386, 399, 414, 428, 443, 459, 475, 491, 508, 526, 544, 563, 583, 603, 623, 645, 294667, 690, 714, 738, 763, 789, 816, 844, 873, 903, 933, 965, 998, 1032, 1066, 2951103, 1140, 1178, 1218, 1259, 1302, 1345, 1391, 1438, 1486, 1536, 1587, 1641, 2961696, 1753, 1811, 1872, 1935, 1999, 2066, 2135, 2207, 2280, 2356, 2435, 2516, 2972600, 2687, 2777, 2869, 2965, 3063, 3165, 3271, 3380, 3492, 3608, 3728, 3852, 2983980, 4112, 4249, 4390, 4536, 4686, 4842, 5002, 5168, 5340, 5517, 5700, 5889, 2996084, 6286, 6494, 6709, 6932, 7161, 7398, 7643, 7897, 8158, 8428, 8707, 8995, 3009293, 9601, 9918, 10247, 10586, 10936, 11298, 11671, 12057, 12456, 12868, 30113294, 13733, 14187, 14656, 15141, 15641, 16159, 16693, 17244, 17814, 18403, 30219011, 19640, 20289, 20959, 21652, 22367, 23106, 23869, 24658, 25472, 26314, 30327183, 28081, 29009, 29967, 30957, 31979, 33035, 34126, 35254, 36418, 37620, 30438863, 40146, 41472, 42841, 44256, 45717, 47227, 48786, 50397, 52061, 53780, 30555556, 57390, 59285, 61242, 63264, 65352, 67510, 69739, 72041, 74419, 76876, 30679414, 82035, 84743, 87541, 90430, 93416, 96499, 99684, 102975, 106374]) 307 308 309def length_to_byte(length): 310 if length is None: 311 return 0 312 if length >= 106374: 313 return 255 314 else: 315 return bisect_left(_length_byte_cache, length) 316 317byte_to_length = _length_byte_cache.__getitem__ 318