1 //#include <stdio.h>
2 
3 // FIXME: make get functions const uint8_t *
4 
5 /*
6  * Copyright (c) 2019 Genome Research Ltd.
7  * Author(s): James Bonfield
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are met:
11  *
12  *    1. Redistributions of source code must retain the above copyright notice,
13  *       this list of conditions and the following disclaimer.
14  *
15  *    2. Redistributions in binary form must reproduce the above
16  *       copyright notice, this list of conditions and the following
17  *       disclaimer in the documentation and/or other materials provided
18  *       with the distribution.
19  *
20  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
21  *       Institute nor the names of its contributors may be used to endorse
22  *       or promote products derived from this software without specific
23  *       prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
26  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
28  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
29  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 #ifndef VARINT2_H
39 #define VARINT2_H
40 
41 #include <stdint.h>
42 
43 // General API scheme is var_{get,put}_{s,u}{32,64}
44 // s/u for signed/unsigned;  32/64 for integer size.
45 
46 // The ideas here are taken from the vbenc code in TurboPFor
47 // (https://github.com/powturbo/TurboPFor) with analysis at
48 // https://github.com/stoklund/varint.
49 
50 // Unlike the ITF8 and standard 7-bit at a time encodings, this
51 // tries to ensure a larger portion of small numbers still fit in 1 byte.
52 // This trades more space for long integers with less space for short ones,
53 // which seems like a good tradeoff given the typical distribution curves.
54 //
55 // Like ITF8 and LTF8, the first byte also indicates the total number of
56 // bytes we need to decode, but unlike those it uses the same format for
57 // both meaning changing data type doesn't change encoding.
58 //
59 // Size comparison examples.
60 //
61 //              Max value
62 // Bytes        ITF8/7bit               This
63 // 1                  127                176
64 // 2               16,383             16,560
65 // 3            2,097,151            540,848
66 // 4          268,435,455         16,777,215
67 // 5       34,359,738,368      4,294,967,296
68 // 6    4,398,046,511,104  1,099,511,627,776
69 // ...
70 //
71 // The format is as follows:
72 // 0-176                     1 byte:  0 + 8 bit
73 // 177-16560 (14 bit range)  2 bytes: 177 + 6bit, 0 + 8bit, for x-177
74 // 16561-540848 (19 bits)    3 bytes: 241 + 3bit, 0+8, 0+8, for x-16561
75 // 540849-16777215 (~24 bit) 4 bytes: 249, 0+8, 0+8, 0+8, for x
76 // 2^24 - 2^32-1             5 bytes: 250, 0+8 x4
77 // 2^32 - 2^40-1             6 bytes: 251, 0+8 x5
78 // 2^40 - 2^48-1             7 bytes: 252, 0+8 x6
79 // 2^48 - 2^56-1             8 bytes: 253, 0+8 x7
80 // 2^56 - 2^64-1             9 bytes: 254, 0+8 x8
81 //
82 // Hence first byte value 255 is not possible and permits future
83 // escape code.
84 
85 
86 // FIXME: consider returning the value and having nbytes passed in by
87 // reference instead of vice-versa.
88 //
89 // ie uint64_t var_get_u64(uint8_t *cp, int *nbytes)
90 // vs int      var_get_u64(uint8_t *cp, uint64_t *val)
91 //
92 // The return value can then be assigned to 32-bit or 64-bit type
93 // without need of a new function name.  The cost is we can't then
94 // do "cp += var_get_u32(cp, endp, &u_freq_sz);".  Maybe we can't do
95 // overflow detection with former? (Want 32-bit but got, say, 40 bit)
96 
97 
98 // static inline char *var_dump(const uint8_t *cp, int n) {
99 //     static char buf[1000];
100 //     int i, o = 0;
101 //     for (i = 0; i < n; i++)
102 // 	o += sprintf(&buf[o], " %d", cp[i]);
103 //     return buf;
104 // }
105 
var_put_u64(uint8_t * cp,const uint8_t * endp,uint64_t x)106 static inline int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t x) {
107     uint8_t *op = cp;
108 
109     if (x < 177) {
110 	if (endp && endp - cp < 1) return 0;
111 	// 0 to 176 in single byte as-is
112 	*cp++ = x;
113     } else if (x < 16561) {
114 	if (endp && endp - cp < 2) return 0;
115 	*cp++ = ((x-177)>>8)+177;
116 	*cp++ = x-177;
117     } else if (x < 540849) {
118 	if (endp && endp - cp < 3) return 0;
119 	*cp++ = ((x-16561)>>16)+241;
120 	*cp++ = (x-16561)>>8;
121 	*cp++ = x-16561;
122     } else if (x < (1<<24)) {
123 	if (endp && endp - cp < 4) return 0;
124 	*cp++ = 249;
125 	*cp++ = x>>16;
126 	*cp++ = x>>8;
127 	*cp++ = x;
128     } else if (x < (1LL<<32)) {
129 	if (endp && endp - cp < 5) return 0;
130 	*cp++ = 250;
131 	*cp++ = x>>24;
132 	*cp++ = x>>16;
133 	*cp++ = x>>8;
134 	*cp++ = x;
135     } else if (x < (1LL<<40)) {
136 	if (endp && endp - cp < 6) return 0;
137 	*cp++ = 251;
138 	*cp++ = x>>32;
139 	*cp++ = x>>24;
140 	*cp++ = x>>16;
141 	*cp++ = x>>8;
142 	*cp++ = x;
143     } else if (x < (1LL<<48)) {
144 	if (endp && endp - cp < 7) return 0;
145 	*cp++ = 252;
146 	*cp++ = x>>40;
147 	*cp++ = x>>32;
148 	*cp++ = x>>24;
149 	*cp++ = x>>16;
150 	*cp++ = x>>8;
151 	*cp++ = x;
152     } else if (x < (1LL<<56)) {
153 	if (endp && endp - cp < 8) return 0;
154 	*cp++ = 253;
155 	*cp++ = x>>48;
156 	*cp++ = x>>40;
157 	*cp++ = x>>32;
158 	*cp++ = x>>24;
159 	*cp++ = x>>16;
160 	*cp++ = x>>8;
161 	*cp++ = x;
162     } else {
163 	if (endp && endp - cp < 9) return 0;
164 	*cp++ = 254;
165 	*cp++ = x>>56;
166 	*cp++ = x>>48;
167 	*cp++ = x>>40;
168 	*cp++ = x>>32;
169 	*cp++ = x>>24;
170 	*cp++ = x>>16;
171 	*cp++ = x>>8;
172 	*cp++ = x;
173     }
174 
175 //    fprintf(stderr, "Put64 %d (%s)\n", x, var_dump(op, cp-op));
176 
177     return cp-op;
178 }
179 
var_put_u32(uint8_t * cp,const uint8_t * endp,uint32_t x)180 static inline int var_put_u32(uint8_t *cp, const uint8_t *endp, uint32_t x) {
181     uint8_t *op = cp;
182 
183     if (x < 177) {
184 	if (endp && endp - cp < 1) abort();//return 0;
185 	// 0 to 176 in single byte as-is
186 	*cp++ = x;
187     } else if (x < 16561) {
188 	if (endp && endp - cp < 2) abort();//return 0;
189 	*cp++ = ((x-177)>>8)+177;
190 	*cp++ = x-177;
191     } else if (x < 540849) {
192 	if (endp && endp - cp < 3) abort();//return 0;
193 	*cp++ = ((x-16561)>>16)+241;
194 	*cp++ = (x-16561)>>8;
195 	*cp++ = x-16561;
196     } else if (x < (1<<24)) {
197 	if (endp && endp - cp < 4) abort();//return 0;
198 	*cp++ = 249;
199 	*cp++ = x>>16;
200 	*cp++ = x>>8;
201 	*cp++ = x;
202     } else {
203 	if (endp && endp - cp < 5) abort();//return 0;
204 	*cp++ = 250;
205 	*cp++ = x>>24;
206 	*cp++ = x>>16;
207 	*cp++ = x>>8;
208 	*cp++ = x;
209     }
210 
211 //    fprintf(stderr, "Put32 %d (%s)\n", x, var_dump(op, cp-op));
212 
213     return cp-op;
214 }
215 
var_get_u64(uint8_t * cp,const uint8_t * endp,uint64_t * i)216 static inline int var_get_u64(uint8_t *cp, const uint8_t *endp, uint64_t *i) {
217     uint8_t *op = cp;
218     uint64_t j = 0;
219 
220     if (endp && cp >= endp) {
221 	*i = 0;
222 	return 0;
223     }
224     if (*cp < 177) {
225 	j = *cp++;
226     } else if (*cp < 241) {
227 	j = ((cp[0] - 177)<<8) + cp[1] + 177;
228 	cp += 2;
229     } else if (*cp < 249) {
230 	j = ((cp[0] - 241)<<16) + (cp[1]<<8) + cp[2] + 16561;
231 	cp += 3;
232     } else {
233 	int n = *cp++ - 249 + 3;
234 	while (n--)
235 	    j = (j<<8) + *cp++;
236     }
237 
238 //    fprintf(stderr, "Get64 %ld (%s)\n", j, var_dump(op, cp-op));
239 
240     *i = j;
241     return cp-op;
242 }
243 
var_get_u32(uint8_t * cp,const uint8_t * endp,uint32_t * i)244 static inline int var_get_u32(uint8_t *cp, const uint8_t *endp, uint32_t *i) {
245     uint8_t *op = cp;
246     uint32_t j = 0;
247 
248     if (endp && cp >= endp) {
249 	*i = 0;
250 	return 0;
251     }
252     if (*cp < 177) {
253 	j = *cp++;
254     } else if (*cp < 241) {
255 	j = ((cp[0] - 177)<<8) + cp[1] + 177;
256 	cp += 2;
257     } else if (*cp < 249) {
258 	j = ((cp[0] - 241)<<16) + (cp[1]<<8) + cp[2] + 16561;
259 	cp += 3;
260     } else {
261 	int n = *cp++ - 249 + 3;
262 	while (n--)
263 	    j = (j<<8) + *cp++;
264     }
265 
266 //    fprintf(stderr, "Get32 %d (%s)\n", j, var_dump(op, cp-op));
267 
268     *i = j;
269     return cp-op;
270 }
271 
272 // Signed versions of the above using zig-zag integer encoding.
273 // This folds the sign bit into the bottom bit so we iterate
274 // 0, -1, +1, -2, +2, etc.
var_put_s32(uint8_t * cp,const uint8_t * endp,int32_t i)275 static inline int var_put_s32(uint8_t *cp, const uint8_t *endp, int32_t i) {
276     return var_put_u32(cp, endp, (i << 1) ^ (i >> 31));
277 }
var_put_s64(uint8_t * cp,const uint8_t * endp,int64_t i)278 static inline int var_put_s64(uint8_t *cp, const uint8_t *endp, int64_t i) {
279     return var_put_u64(cp, endp, (i << 1) ^ (i >> 63));
280 }
281 
var_get_s32(uint8_t * cp,const uint8_t * endp,int32_t * i)282 static inline int var_get_s32(uint8_t *cp, const uint8_t *endp, int32_t *i) {
283     int b = var_get_u32(cp, endp, (uint32_t *)i);
284     *i = (*i >> 1) ^ -(*i & 1);
285     return b;
286 }
var_get_s64(uint8_t * cp,const uint8_t * endp,int64_t * i)287 static inline int var_get_s64(uint8_t *cp, const uint8_t *endp, int64_t *i) {
288     int b = var_get_u64(cp, endp, (uint64_t *)i);
289     *i = (*i >> 1) ^ -(*i & 1);
290     return b;
291 }
292 
var_size_u64(uint64_t v)293 static inline int var_size_u64(uint64_t v) {
294     if (v < 177)
295 	return 1;
296     else if (v < 16561)
297 	return 2;
298     else if (v < 540849)
299 	return 3;
300 
301     int i = 0;
302     do {
303 	v >>= 8;
304 	i++;
305     } while (v);
306 
307 //    fprintf(stderr, "Size %ld (%d)\n", v, i+1);
308 
309     return i+1;
310 }
311 #define var_size_u32 var_size_u64
312 
var_size_s64(int64_t v)313 static inline int var_size_s64(int64_t v) {
314     return var_size_u64((v >> 63) ^ (v << 1));
315 }
316 #define var_size_s32 var_size_s64
317 
318 #endif /* VARINT2_H */
319