1 /* flint_utils.h: Generic functions for flint
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2002 Ananova Ltd
5 * Copyright 2002,2003,2004,2006,2008,2009 Olly Betts
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
21 */
22
23 #ifndef OM_HGUARD_FLINT_UTILS_H
24 #define OM_HGUARD_FLINT_UTILS_H
25
26 #include "omassert.h"
27
28 #include <xapian/types.h>
29
30 #include <string>
31
32 using namespace std;
33
34 typedef unsigned char om_byte;
35 typedef unsigned int om_uint32;
36 typedef int om_int32;
37
38 /** FIXME: the pack and unpack int methods store in low-byte-first order
39 * - it might be easier to implement efficient specialisations with
40 * high-byte-first order.
41 */
42
43 /** Reads an unsigned integer from a string starting at a given position.
44 *
45 * @param src A pointer to a pointer to the data to read. The
46 * character pointer will be updated to point to the
47 * next character to read, or 0 if the method ran out of
48 * data. (It is only set to 0 in case of an error).
49 * @param src_end A pointer to the byte after the end of the data to
50 * read the integer from.
51 * @param resultptr A pointer to a place to store the result. If an
52 * error occurs, the value stored in this location is
53 * undefined. If this pointer is 0, the result is not
54 * stored, and the method simply skips over the result.
55 *
56 * @result True if an integer was successfully read. False if the read
57 * failed. Failure may either be due to the data running out (in
58 * which case *src will equal 0), or due to the value read
59 * overflowing the size of result (in which case *src will point
60 * to wherever the value ends, despite the overflow).
61 */
62 template<class T>
63 bool
F_unpack_uint(const char ** src,const char * src_end,T * resultptr)64 F_unpack_uint(const char ** src,
65 const char * src_end,
66 T * resultptr)
67 {
68 // Check unsigned
69 STATIC_ASSERT_UNSIGNED_TYPE(T);
70
71 // Check byte is what it's meant to be
72 STATIC_ASSERT(sizeof(om_byte) == 1);
73
74 unsigned int shift = 0;
75 T result = 0;
76
77 while (true) {
78 if ((*src) == src_end) {
79 *src = 0;
80 return false;
81 }
82
83 om_byte part = static_cast<om_byte>(**src);
84 (*src)++;
85
86 // if new byte might cause overflow, and it does
87 if (((shift > (sizeof(T) - 1) * 8 + 1) &&
88 ((part & 0x7f) << (shift % 8)) >= 0x100) ||
89 (shift >= sizeof(T) * 8)) {
90 // Overflowed - move to end of this integer
91 while (true) {
92 if ((part & 0x80) == 0) return false;
93 if ((*src) == src_end) {
94 *src = 0;
95 return false;
96 }
97 part = static_cast<om_byte>(**src);
98 (*src)++;
99 }
100 }
101
102 result += T(part & 0x7f) << shift;
103 shift += 7;
104
105 if ((part & 0x80) == 0) {
106 if (resultptr) *resultptr = result;
107 return true;
108 }
109 }
110 }
111
112
113 /** Generates a packed representation of an integer.
114 *
115 * @param value The integer to represent.
116 *
117 * @result A string containing the representation of the integer.
118 */
119 template<class T>
120 string
F_pack_uint(T value)121 F_pack_uint(T value)
122 {
123 // Check unsigned
124 STATIC_ASSERT_UNSIGNED_TYPE(T);
125
126 if (value == 0) return string(1, '\0');
127 string result;
128
129 while (value != 0) {
130 om_byte part = static_cast<om_byte>(value & 0x7f);
131 value = value >> 7;
132 if (value) part |= 0x80;
133 result.append(1u, char(part));
134 }
135
136 return result;
137 }
138
139 /** Generates a packed representation of a bool.
140 *
141 * This is a specialisation of the template above.
142 *
143 * @param value The bool to represent.
144 *
145 * @result A string containing the representation of the bool.
146 */
147 template<>
148 inline string
149 F_pack_uint<bool>(bool value)
150 {
151 return string(1, static_cast<char>(value));
152 }
153
154 /** Reads an unsigned integer from a string starting at a given position.
155 * This encoding requires that we know the encoded length from out-of-band
156 * information (so is suitable when only one integer is encoded, or for
157 * the last integer encoded).
158 *
159 * @param src A pointer to a pointer to the data to read.
160 * @param src_end A pointer to the byte after the end of the data to
161 * read the integer from.
162 * @param resultptr A pointer to a place to store the result. If an
163 * error occurs, the value stored in this location is
164 * undefined. If this pointer is 0, the result is not
165 * stored, and the method simply skips over the result.
166 *
167 * @result True if an integer was successfully read. False if the read
168 * failed. Failure can hapen if the value read overflows
169 * the size of result.
170 */
171 template<class T>
172 bool
F_unpack_uint_last(const char ** src,const char * src_end,T * resultptr)173 F_unpack_uint_last(const char ** src, const char * src_end, T * resultptr)
174 {
175 // Check unsigned
176 STATIC_ASSERT_UNSIGNED_TYPE(T);
177 // Check byte is what it's meant to be
178 STATIC_ASSERT(sizeof(om_byte) == 1);
179
180 if (src_end - *src > int(sizeof(T))) {
181 // Would overflow
182 *src = src_end;
183 return false;
184 }
185
186 T result = 0;
187 int shift = 0;
188 while (*src != src_end) {
189 result |= static_cast<T>(static_cast<om_byte>(**src)) << shift;
190 ++(*src);
191 shift += 8;
192 }
193 *resultptr = result;
194 return true;
195 }
196
197 /** Generates a packed representation of an integer.
198 * This encoding requires that we know the encoded length from out-of-band
199 * information (so is suitable when only one integer is encoded, or for
200 * the last integer encoded).
201 *
202 * @param value The integer to represent.
203 *
204 * @result A string containing the representation of the integer.
205 */
206 template<class T>
207 string
F_pack_uint_last(T value)208 F_pack_uint_last(T value)
209 {
210 // Check unsigned
211 STATIC_ASSERT_UNSIGNED_TYPE(T);
212
213 string result;
214 while (value) {
215 result += char(value);
216 value >>= 8;
217 }
218 return result;
219 }
220
221 /** Generate a packed representation of an integer, preserving sort order.
222 *
223 * This representation is less compact than the usual one, and has a limit
224 * of 256 bytes on the length of the integer. However, this is unlikely to
225 * ever be a problem.
226 *
227 * @param value The integer to represent.
228 *
229 * @result A string containing the representation of the integer.
230 */
231 template<class T>
232 string
F_pack_uint_preserving_sort(T value)233 F_pack_uint_preserving_sort(T value)
234 {
235 // Check unsigned
236 STATIC_ASSERT_UNSIGNED_TYPE(T);
237
238 string result;
239 while (value != 0) {
240 om_byte part = static_cast<om_byte>(value & 0xff);
241 value = value >> 8;
242 result.insert(string::size_type(0), 1u, char(part));
243 }
244 result.insert(string::size_type(0), 1u, char(result.size()));
245 return result;
246 }
247
248 /** Unpack a unsigned integer, store in sort preserving order.
249 *
250 * @param src A pointer to a pointer to the data to read. The
251 * character pointer will be updated to point to the
252 * next character to read, or 0 if the method ran out of
253 * data. (It is only set to 0 in case of an error).
254 * @param src_end A pointer to the byte after the end of the data to
255 * read the integer from.
256 * @param resultptr A pointer to a place to store the result. If an
257 * error occurs, the value stored in this location is
258 * undefined. If this pointer is 0, the result is not
259 * stored, and the method simply skips over the result.
260 *
261 * @result True if an integer was successfully read. False if the read
262 * failed. Failure may either be due to the data running out (in
263 * which case *src will equal 0), or due to the value read
264 * overflowing the size of result (in which case *src will point
265 * to wherever the value ends, despite the overflow).
266 */
267 template<class T>
268 bool
F_unpack_uint_preserving_sort(const char ** src,const char * src_end,T * resultptr)269 F_unpack_uint_preserving_sort(const char ** src,
270 const char * src_end,
271 T * resultptr)
272 {
273 if (*src == src_end) {
274 *src = 0;
275 return false;
276 }
277
278 unsigned int length = static_cast<om_byte>(**src);
279 (*src)++;
280
281 if (length > sizeof(T)) {
282 *src += length;
283 if (*src > src_end) {
284 *src = 0;
285 }
286 return false;
287 }
288
289 // Can't be overflow now.
290 T result = 0;
291 while (length > 0) {
292 result = result << 8;
293 result += static_cast<om_byte>(**src);
294 (*src)++;
295 length--;
296 }
297 *resultptr = result;
298
299 return true;
300 }
301
302 inline bool
F_unpack_string(const char ** src,const char * src_end,string & result)303 F_unpack_string(const char ** src,
304 const char * src_end,
305 string & result)
306 {
307 string::size_type length;
308 if (!F_unpack_uint(src, src_end, &length)) {
309 return false;
310 }
311
312 if (src_end - *src < 0 ||
313 string::size_type(src_end - *src) < length) {
314 src = 0;
315 return false;
316 }
317
318 result.assign(*src, length);
319 *src += length;
320 return true;
321 }
322
323 inline string
F_pack_string(const string & value)324 F_pack_string(const string & value)
325 {
326 return F_pack_uint(value.size()) + value;
327 }
328
329 /** Pack a string into a representation which preserves sort order.
330 * We do this by replacing zero bytes in the string with a zero byte
331 * followed by byte value 0xff, and then appending two zero bytes to
332 * the end.
333 */
334 inline string
F_pack_string_preserving_sort(string value)335 F_pack_string_preserving_sort(string value)
336 {
337 string::size_type i = 0, j;
338 while ((j = value.find('\0', i)) != string::npos) {
339 value.replace(j, 1, "\0\xff", 2);
340 i = j + 2;
341 }
342 value += '\0'; // FIXME temp...
343 return value + '\0'; // Note - next byte mustn't be '\xff'...
344 }
345
346 inline bool
F_unpack_string_preserving_sort(const char ** src,const char * src_end,string & result)347 F_unpack_string_preserving_sort(const char ** src,
348 const char * src_end,
349 string & result)
350 {
351 result.resize(0);
352 while (*src < src_end) {
353 const char *begin = *src;
354 while (**src) {
355 ++(*src);
356 if (*src == src_end) return false;
357 }
358 result += string(begin, *src - begin);
359 ++(*src);
360 if (*src == src_end) return false;
361 if (**src != '\xff') {
362 ++(*src); // FIXME temp
363 return true;
364 }
365 result += '\0';
366 ++(*src);
367 }
368 return false;
369 }
370
371 inline bool
F_unpack_bool(const char ** src,const char * src_end,bool * resultptr)372 F_unpack_bool(const char ** src,
373 const char * src_end,
374 bool * resultptr)
375 {
376 if (*src == src_end) {
377 *src = 0;
378 return false;
379 }
380 switch (*((*src)++)) {
381 case '0':
382 if (resultptr) *resultptr = false;
383 return true;
384 case '1':
385 if (resultptr) *resultptr = true;
386 return true;
387 }
388 *src = 0;
389 return false;
390 }
391
392 inline string
F_pack_bool(bool value)393 F_pack_bool(bool value)
394 {
395 return value ? "1" : "0";
396 }
397
398 /** Convert a document id to a key (suitable when the docid is the only
399 * component of the key).
400 */
401 inline string
flint_docid_to_key(Xapian::docid did)402 flint_docid_to_key(Xapian::docid did)
403 {
404 return F_pack_uint_preserving_sort(did);
405 }
406
407 #endif /* OM_HGUARD_FLINT_UTILS_H */
408