1 /* flint_utils.h: Generic functions for flint
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2002 Ananova Ltd
5  * Copyright 2002,2003,2004,2006,2008,2009 Olly Betts
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation; either version 2 of the
10  * License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
20  * USA
21  */
22 
23 #ifndef OM_HGUARD_FLINT_UTILS_H
24 #define OM_HGUARD_FLINT_UTILS_H
25 
26 #include "omassert.h"
27 
28 #include <xapian/types.h>
29 
30 #include <string>
31 
32 using namespace std;
33 
34 typedef unsigned char       om_byte;
35 typedef unsigned int        om_uint32;
36 typedef int                 om_int32;
37 
38 /** FIXME: the pack and unpack int methods store in low-byte-first order
39  *  - it might be easier to implement efficient specialisations with
40  *  high-byte-first order.
41  */
42 
43 /** Reads an unsigned integer from a string starting at a given position.
44  *
45  *  @param src       A pointer to a pointer to the data to read.  The
46  *                   character pointer will be updated to point to the
47  *                   next character to read, or 0 if the method ran out of
48  *                   data.  (It is only set to 0 in case of an error).
49  *  @param src_end   A pointer to the byte after the end of the data to
50  *                   read the integer from.
51  *  @param resultptr A pointer to a place to store the result.  If an
52  *                   error occurs, the value stored in this location is
53  *                   undefined.  If this pointer is 0, the result is not
54  *                   stored, and the method simply skips over the result.
55  *
56  *  @result True if an integer was successfully read.  False if the read
57  *          failed.  Failure may either be due to the data running out (in
58  *          which case *src will equal 0), or due to the value read
59  *          overflowing the size of result (in which case *src will point
60  *          to wherever the value ends, despite the overflow).
61  */
62 template<class T>
63 bool
F_unpack_uint(const char ** src,const char * src_end,T * resultptr)64 F_unpack_uint(const char ** src,
65 	    const char * src_end,
66 	    T * resultptr)
67 {
68     // Check unsigned
69     STATIC_ASSERT_UNSIGNED_TYPE(T);
70 
71     // Check byte is what it's meant to be
72     STATIC_ASSERT(sizeof(om_byte) == 1);
73 
74     unsigned int shift = 0;
75     T result = 0;
76 
77     while (true) {
78 	if ((*src) == src_end) {
79 	    *src = 0;
80 	    return false;
81 	}
82 
83 	om_byte part = static_cast<om_byte>(**src);
84 	(*src)++;
85 
86 	// if new byte might cause overflow, and it does
87 	if (((shift > (sizeof(T) - 1) * 8 + 1) &&
88 	     ((part & 0x7f) << (shift % 8)) >= 0x100) ||
89 	    (shift >= sizeof(T) * 8)) {
90 	    // Overflowed - move to end of this integer
91 	    while (true) {
92 		if ((part & 0x80) == 0) return false;
93 		if ((*src) == src_end) {
94 		    *src = 0;
95 		    return false;
96 		}
97 		part = static_cast<om_byte>(**src);
98 		(*src)++;
99 	    }
100 	}
101 
102 	result += T(part & 0x7f) << shift;
103 	shift += 7;
104 
105 	if ((part & 0x80) == 0) {
106 	    if (resultptr) *resultptr = result;
107 	    return true;
108 	}
109     }
110 }
111 
112 
113 /** Generates a packed representation of an integer.
114  *
115  *  @param value  The integer to represent.
116  *
117  *  @result       A string containing the representation of the integer.
118  */
119 template<class T>
120 string
F_pack_uint(T value)121 F_pack_uint(T value)
122 {
123     // Check unsigned
124     STATIC_ASSERT_UNSIGNED_TYPE(T);
125 
126     if (value == 0) return string(1, '\0');
127     string result;
128 
129     while (value != 0) {
130 	om_byte part = static_cast<om_byte>(value & 0x7f);
131 	value = value >> 7;
132 	if (value) part |= 0x80;
133 	result.append(1u, char(part));
134     }
135 
136     return result;
137 }
138 
139 /** Generates a packed representation of a bool.
140  *
141  *  This is a specialisation of the template above.
142  *
143  *  @param value  The bool to represent.
144  *
145  *  @result       A string containing the representation of the bool.
146  */
147 template<>
148 inline string
149 F_pack_uint<bool>(bool value)
150 {
151     return string(1, static_cast<char>(value));
152 }
153 
154 /** Reads an unsigned integer from a string starting at a given position.
155  *  This encoding requires that we know the encoded length from out-of-band
156  *  information (so is suitable when only one integer is encoded, or for
157  *  the last integer encoded).
158  *
159  *  @param src       A pointer to a pointer to the data to read.
160  *  @param src_end   A pointer to the byte after the end of the data to
161  *                   read the integer from.
162  *  @param resultptr A pointer to a place to store the result.  If an
163  *                   error occurs, the value stored in this location is
164  *                   undefined.  If this pointer is 0, the result is not
165  *                   stored, and the method simply skips over the result.
166  *
167  *  @result True if an integer was successfully read.  False if the read
168  *          failed.  Failure can hapen if the value read overflows
169  *          the size of result.
170  */
171 template<class T>
172 bool
F_unpack_uint_last(const char ** src,const char * src_end,T * resultptr)173 F_unpack_uint_last(const char ** src, const char * src_end, T * resultptr)
174 {
175     // Check unsigned
176     STATIC_ASSERT_UNSIGNED_TYPE(T);
177     // Check byte is what it's meant to be
178     STATIC_ASSERT(sizeof(om_byte) == 1);
179 
180     if (src_end - *src > int(sizeof(T))) {
181 	// Would overflow
182 	*src = src_end;
183 	return false;
184     }
185 
186     T result = 0;
187     int shift = 0;
188     while (*src != src_end) {
189 	result |= static_cast<T>(static_cast<om_byte>(**src)) << shift;
190 	++(*src);
191 	shift += 8;
192     }
193     *resultptr = result;
194     return true;
195 }
196 
197 /** Generates a packed representation of an integer.
198  *  This encoding requires that we know the encoded length from out-of-band
199  *  information (so is suitable when only one integer is encoded, or for
200  *  the last integer encoded).
201  *
202  *  @param value  The integer to represent.
203  *
204  *  @result       A string containing the representation of the integer.
205  */
206 template<class T>
207 string
F_pack_uint_last(T value)208 F_pack_uint_last(T value)
209 {
210     // Check unsigned
211     STATIC_ASSERT_UNSIGNED_TYPE(T);
212 
213     string result;
214     while (value) {
215         result += char(value);
216 	value >>= 8;
217     }
218     return result;
219 }
220 
221 /** Generate a packed representation of an integer, preserving sort order.
222  *
223  *  This representation is less compact than the usual one, and has a limit
224  *  of 256 bytes on the length of the integer.  However, this is unlikely to
225  *  ever be a problem.
226  *
227  *  @param value  The integer to represent.
228  *
229  *  @result       A string containing the representation of the integer.
230  */
231 template<class T>
232 string
F_pack_uint_preserving_sort(T value)233 F_pack_uint_preserving_sort(T value)
234 {
235     // Check unsigned
236     STATIC_ASSERT_UNSIGNED_TYPE(T);
237 
238     string result;
239     while (value != 0) {
240 	om_byte part = static_cast<om_byte>(value & 0xff);
241 	value = value >> 8;
242 	result.insert(string::size_type(0), 1u, char(part));
243     }
244     result.insert(string::size_type(0), 1u, char(result.size()));
245     return result;
246 }
247 
248 /** Unpack a unsigned integer, store in sort preserving order.
249  *
250  *  @param src       A pointer to a pointer to the data to read.  The
251  *                   character pointer will be updated to point to the
252  *                   next character to read, or 0 if the method ran out of
253  *                   data.  (It is only set to 0 in case of an error).
254  *  @param src_end   A pointer to the byte after the end of the data to
255  *                   read the integer from.
256  *  @param resultptr A pointer to a place to store the result.  If an
257  *                   error occurs, the value stored in this location is
258  *                   undefined.  If this pointer is 0, the result is not
259  *                   stored, and the method simply skips over the result.
260  *
261  *  @result True if an integer was successfully read.  False if the read
262  *          failed.  Failure may either be due to the data running out (in
263  *          which case *src will equal 0), or due to the value read
264  *          overflowing the size of result (in which case *src will point
265  *          to wherever the value ends, despite the overflow).
266  */
267 template<class T>
268 bool
F_unpack_uint_preserving_sort(const char ** src,const char * src_end,T * resultptr)269 F_unpack_uint_preserving_sort(const char ** src,
270 			    const char * src_end,
271 			    T * resultptr)
272 {
273     if (*src == src_end) {
274 	*src = 0;
275 	return false;
276     }
277 
278     unsigned int length = static_cast<om_byte>(**src);
279     (*src)++;
280 
281     if (length > sizeof(T)) {
282 	*src += length;
283 	if (*src > src_end) {
284 	    *src = 0;
285 	}
286 	return false;
287     }
288 
289     // Can't be overflow now.
290     T result = 0;
291     while (length > 0) {
292 	result = result << 8;
293 	result += static_cast<om_byte>(**src);
294 	(*src)++;
295 	length--;
296     }
297     *resultptr = result;
298 
299     return true;
300 }
301 
302 inline bool
F_unpack_string(const char ** src,const char * src_end,string & result)303 F_unpack_string(const char ** src,
304 	      const char * src_end,
305 	      string & result)
306 {
307     string::size_type length;
308     if (!F_unpack_uint(src, src_end, &length)) {
309     	return false;
310     }
311 
312     if (src_end - *src < 0 ||
313 	string::size_type(src_end - *src) < length) {
314 	src = 0;
315 	return false;
316     }
317 
318     result.assign(*src, length);
319     *src += length;
320     return true;
321 }
322 
323 inline string
F_pack_string(const string & value)324 F_pack_string(const string & value)
325 {
326     return F_pack_uint(value.size()) + value;
327 }
328 
329 /** Pack a string into a representation which preserves sort order.
330  *  We do this by replacing zero bytes in the string with a zero byte
331  *  followed by byte value 0xff, and then appending two zero bytes to
332  *  the end.
333  */
334 inline string
F_pack_string_preserving_sort(string value)335 F_pack_string_preserving_sort(string value)
336 {
337     string::size_type i = 0, j;
338     while ((j = value.find('\0', i)) != string::npos) {
339 	value.replace(j, 1, "\0\xff", 2);
340 	i = j + 2;
341     }
342     value += '\0'; // FIXME temp...
343     return value + '\0'; // Note - next byte mustn't be '\xff'...
344 }
345 
346 inline bool
F_unpack_string_preserving_sort(const char ** src,const char * src_end,string & result)347 F_unpack_string_preserving_sort(const char ** src,
348 			      const char * src_end,
349 			      string & result)
350 {
351     result.resize(0);
352     while (*src < src_end) {
353 	const char *begin = *src;
354 	while (**src) {
355 	    ++(*src);
356 	    if (*src == src_end) return false;
357 	}
358 	result += string(begin, *src - begin);
359 	++(*src);
360 	if (*src == src_end) return false;
361 	if (**src != '\xff') {
362 	    ++(*src); // FIXME temp
363 	    return true;
364 	}
365 	result += '\0';
366 	++(*src);
367     }
368     return false;
369 }
370 
371 inline bool
F_unpack_bool(const char ** src,const char * src_end,bool * resultptr)372 F_unpack_bool(const char ** src,
373 	    const char * src_end,
374 	    bool * resultptr)
375 {
376     if (*src == src_end) {
377 	*src = 0;
378 	return false;
379     }
380     switch (*((*src)++)) {
381 	case '0':
382 	    if (resultptr) *resultptr = false;
383 	    return true;
384 	case '1':
385 	    if (resultptr) *resultptr = true;
386 	    return true;
387     }
388     *src = 0;
389     return false;
390 }
391 
392 inline string
F_pack_bool(bool value)393 F_pack_bool(bool value)
394 {
395     return value ? "1" : "0";
396 }
397 
398 /** Convert a document id to a key (suitable when the docid is the only
399  *  component of the key).
400  */
401 inline string
flint_docid_to_key(Xapian::docid did)402 flint_docid_to_key(Xapian::docid did)
403 {
404     return F_pack_uint_preserving_sort(did);
405 }
406 
407 #endif /* OM_HGUARD_FLINT_UTILS_H */
408