1 /** @file
2  * @brief Serialise floating point values to string which sort the same way.
3  */
4 /* Copyright (C) 2007,2009,2015,2016 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19  */
20 
21 #include <config.h>
22 
23 #include <xapian/queryparser.h>
24 
25 // Disable these assertions when building the library as these functions are
26 // marked not to throw exceptions in the API headers.  In unittest we define
27 // UNITTEST_ASSERT_NOTHROW to set a variable to the exception message and
28 // return, then the harness checks if that variable has been set.
29 #ifndef XAPIAN_UNITTEST
30 # define UNITTEST_ASSERT_NOTHROW(COND,RET)
31 #endif
32 
33 #include <cfloat>
34 #include <cmath>
35 #include <cstring>
36 
37 #include <string>
38 
39 using namespace std;
40 
41 #if FLT_RADIX != 2
42 # error Code currently assumes FLT_RADIX == 2
43 #endif
44 
45 #ifdef _MSC_VER
46 // Disable warning about negating an unsigned type, which we do deliberately.
47 # pragma warning(disable:4146)
48 #endif
49 
50 size_t
sortable_serialise_(double value,char * buf)51 Xapian::sortable_serialise_(double value, char * buf) XAPIAN_NOEXCEPT
52 {
53     double mantissa;
54     int exponent;
55 
56     // Negative infinity.
57     if (value < -DBL_MAX) return 0;
58 
59     mantissa = frexp(value, &exponent);
60 
61     /* Deal with zero specially.
62      *
63      * IEEE representation of doubles uses 11 bits for the exponent, with a
64      * bias of 1023.  We bias this by subtracting 8, and non-IEEE
65      * representations may allow higher exponents, so allow exponents down to
66      * -2039 - if smaller exponents are possible anywhere, we underflow such
67      *  numbers to 0.
68      */
69     if (mantissa == 0.0 || exponent < -2039) {
70 	*buf = '\x80';
71 	return 1;
72     }
73 
74     bool negative = (mantissa < 0);
75     if (negative) mantissa = -mantissa;
76 
77     // Infinity, or extremely large non-IEEE representation.
78     if (value > DBL_MAX || exponent > 2055) {
79 	if (negative) {
80 	    // This can only happen with a non-IEEE representation, because
81 	    // we've already tested for value < -DBL_MAX
82 	    return 0;
83 	} else {
84 	    memset(buf, '\xff', 9);
85 	    return 9;
86 	}
87     }
88 
89     // Encoding:
90     //
91     // [ 7 | 6 | 5 | 4 3 2 1 0]
92     //   Sm  Se  Le
93     //
94     // Sm stores the sign of the mantissa: 1 = positive or zero, 0 = negative.
95     // Se stores the sign of the exponent: Sm for positive/zero, !Sm for neg.
96     // Le stores the length of the exponent: !Se for 3 bits, Se for 11 bits.
97     unsigned char next = (negative ? 0 : 0xe0);
98 
99     // Bias the exponent by 8 so that more small integers get short encodings.
100     exponent -= 8;
101     bool exponent_negative = (exponent < 0);
102     if (exponent_negative) {
103 	exponent = -exponent;
104 	next ^= 0x60;
105     }
106 
107     size_t len = 0;
108 
109     /* We store the exponent in 3 or 11 bits.  If the number is negative, we
110      * flip all the bits of the exponent, since larger negative numbers should
111      * sort first.
112      *
113      * If the exponent is negative, we flip the bits of the exponent, since
114      * larger negative exponents should sort first (unless the number is
115      * negative, in which case they should sort later).
116      */
117     UNITTEST_ASSERT_NOTHROW(exponent >= 0, 0);
118     if (exponent < 8) {
119 	next ^= 0x20;
120 	next |= static_cast<unsigned char>(exponent << 2);
121 	if (negative ^ exponent_negative) next ^= 0x1c;
122     } else {
123 	UNITTEST_ASSERT_NOTHROW((exponent >> 11) == 0, 0);
124 	// Put the top 5 bits of the exponent into the lower 5 bits of the
125 	// first byte:
126 	next |= static_cast<unsigned char>(exponent >> 6);
127 	if (negative ^ exponent_negative) next ^= 0x1f;
128 	buf[len++] = next;
129 	// And the lower 6 bits of the exponent go into the upper 6 bits
130 	// of the second byte:
131 	next = static_cast<unsigned char>(exponent << 2);
132 	if (negative ^ exponent_negative) next ^= 0xfc;
133     }
134 
135     // Convert the 52 (or 53) bits of the mantissa into two 32-bit words.
136     mantissa *= 1 << (negative ? 26 : 27);
137     unsigned word1 = static_cast<unsigned>(mantissa);
138     mantissa -= word1;
139     unsigned word2 = static_cast<unsigned>(mantissa * 4294967296.0); // 1<<32
140     // If the number is positive, the first bit will always be set because 0.5
141     // <= mantissa < 1, unless mantissa is zero, which we handle specially
142     // above).  If the number is negative, we negate the mantissa instead of
143     // flipping all the bits, so in the case of 0.5, the first bit isn't set
144     // so we need to store it explicitly.  But for the cost of one extra
145     // leading bit, we can save several trailing 0xff bytes in lots of common
146     // cases.
147     UNITTEST_ASSERT_NOTHROW(negative || (word1 & (1<<26)), 0);
148     if (negative) {
149 	// We negate the mantissa for negative numbers, so that the sort order
150 	// is reversed (since larger negative numbers should come first).
151 	word1 = -word1;
152 	if (word2 != 0) ++word1;
153 	word2 = -word2;
154     }
155 
156     word1 &= 0x03ffffff;
157     next |= static_cast<unsigned char>(word1 >> 24);
158     buf[len++] = next;
159     buf[len++] = char(word1 >> 16);
160     buf[len++] = char(word1 >> 8);
161     buf[len++] = char(word1);
162 
163     buf[len++] = char(word2 >> 24);
164     buf[len++] = char(word2 >> 16);
165     buf[len++] = char(word2 >> 8);
166     buf[len++] = char(word2);
167 
168     // Finally, we can chop off any trailing zero bytes.
169     while (len > 0 && buf[len - 1] == '\0') {
170 	--len;
171     }
172 
173     return len;
174 }
175 
176 /// Get a number from the character at a given position in a string, returning
177 /// 0 if the string isn't long enough.
178 static inline unsigned char
numfromstr(const std::string & str,std::string::size_type pos)179 numfromstr(const std::string & str, std::string::size_type pos)
180 {
181     return (pos < str.size()) ? static_cast<unsigned char>(str[pos]) : '\0';
182 }
183 
184 double
sortable_unserialise(const std::string & value)185 Xapian::sortable_unserialise(const std::string & value) XAPIAN_NOEXCEPT
186 {
187     // Zero.
188     if (value.size() == 1 && value[0] == '\x80') return 0.0;
189 
190     // Positive infinity.
191     if (value.size() == 9 &&
192 	memcmp(value.data(), "\xff\xff\xff\xff\xff\xff\xff\xff\xff", 9) == 0) {
193 #ifdef INFINITY
194 	// INFINITY is C99.  Oddly, it's of type "float" so sanity check in
195 	// case it doesn't cast to double as infinity (apparently some
196 	// implementations have this problem).
197 	if (double(INFINITY) > HUGE_VAL) return INFINITY;
198 #endif
199 	return HUGE_VAL;
200     }
201 
202     // Negative infinity.
203     if (value.empty()) {
204 #ifdef INFINITY
205 	if (double(INFINITY) > HUGE_VAL) return -INFINITY;
206 #endif
207 	return -HUGE_VAL;
208     }
209 
210     unsigned char first = numfromstr(value, 0);
211     size_t i = 0;
212 
213     first ^= static_cast<unsigned char>(first & 0xc0) >> 1;
214     bool negative = !(first & 0x80);
215     bool exponent_negative = (first & 0x40);
216     bool explen = !(first & 0x20);
217     int exponent = first & 0x1f;
218     if (!explen) {
219 	exponent >>= 2;
220 	if (negative ^ exponent_negative) exponent ^= 0x07;
221     } else {
222 	first = numfromstr(value, ++i);
223 	exponent <<= 6;
224 	exponent |= (first >> 2);
225 	if (negative ^ exponent_negative) exponent ^= 0x07ff;
226     }
227 
228     unsigned word1;
229     word1 = (unsigned(first & 0x03) << 24);
230     word1 |= numfromstr(value, ++i) << 16;
231     word1 |= numfromstr(value, ++i) << 8;
232     word1 |= numfromstr(value, ++i);
233 
234     unsigned word2 = 0;
235     if (i < value.size()) {
236 	word2 = numfromstr(value, ++i) << 24;
237 	word2 |= numfromstr(value, ++i) << 16;
238 	word2 |= numfromstr(value, ++i) << 8;
239 	word2 |= numfromstr(value, ++i);
240     }
241 
242     if (negative) {
243 	word1 = -word1;
244 	if (word2 != 0) ++word1;
245 	word2 = -word2;
246 	UNITTEST_ASSERT_NOTHROW((word1 & 0xf0000000) != 0, 0);
247 	word1 &= 0x03ffffff;
248     }
249     if (!negative) word1 |= 1<<26;
250 
251     double mantissa = 0;
252     if (word2) mantissa = word2 / 4294967296.0; // 1<<32
253     mantissa += word1;
254     mantissa /= 1 << (negative ? 26 : 27);
255 
256     if (exponent_negative) exponent = -exponent;
257     exponent += 8;
258 
259     if (negative) mantissa = -mantissa;
260 
261     // We use scalbn() since it's equivalent to ldexp() when FLT_RADIX == 2
262     // (which we currently assume), except that ldexp() will set errno if the
263     // result overflows or underflows, which isn't really desirable here.
264     return scalbn(mantissa, exponent);
265 }
266