1 //#include <stdio.h>
2
3 // FIXME: make get functions const uint8_t *
4
5 /*
6 * Copyright (c) 2019 Genome Research Ltd.
7 * Author(s): James Bonfield
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright notice,
13 * this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer in the documentation and/or other materials provided
18 * with the distribution.
19 *
20 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
21 * Institute nor the names of its contributors may be used to endorse
22 * or promote products derived from this software without specific
23 * prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
26 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
28 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
29 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 #ifndef VARINT2_H
39 #define VARINT2_H
40
41 #include <stdint.h>
42
43 // General API scheme is var_{get,put}_{s,u}{32,64}
44 // s/u for signed/unsigned; 32/64 for integer size.
45
46 // The ideas here are taken from the vbenc code in TurboPFor
47 // (https://github.com/powturbo/TurboPFor) with analysis at
48 // https://github.com/stoklund/varint.
49
50 // Unlike the ITF8 and standard 7-bit at a time encodings, this
51 // tries to ensure a larger portion of small numbers still fit in 1 byte.
52 // This trades more space for long integers with less space for short ones,
53 // which seems like a good tradeoff given the typical distribution curves.
54 //
55 // Like ITF8 and LTF8, the first byte also indicates the total number of
56 // bytes we need to decode, but unlike those it uses the same format for
57 // both meaning changing data type doesn't change encoding.
58 //
59 // Size comparison examples.
60 //
61 // Max value
62 // Bytes ITF8/7bit This
63 // 1 127 176
64 // 2 16,383 16,560
65 // 3 2,097,151 540,848
66 // 4 268,435,455 16,777,215
67 // 5 34,359,738,368 4,294,967,296
68 // 6 4,398,046,511,104 1,099,511,627,776
69 // ...
70 //
71 // The format is as follows:
72 // 0-176 1 byte: 0 + 8 bit
73 // 177-16560 (14 bit range) 2 bytes: 177 + 6bit, 0 + 8bit, for x-177
74 // 16561-540848 (19 bits) 3 bytes: 241 + 3bit, 0+8, 0+8, for x-16561
75 // 540849-16777215 (~24 bit) 4 bytes: 249, 0+8, 0+8, 0+8, for x
76 // 2^24 - 2^32-1 5 bytes: 250, 0+8 x4
77 // 2^32 - 2^40-1 6 bytes: 251, 0+8 x5
78 // 2^40 - 2^48-1 7 bytes: 252, 0+8 x6
79 // 2^48 - 2^56-1 8 bytes: 253, 0+8 x7
80 // 2^56 - 2^64-1 9 bytes: 254, 0+8 x8
81 //
82 // Hence first byte value 255 is not possible and permits future
83 // escape code.
84
85
86 // FIXME: consider returning the value and having nbytes passed in by
87 // reference instead of vice-versa.
88 //
89 // ie uint64_t var_get_u64(uint8_t *cp, int *nbytes)
90 // vs int var_get_u64(uint8_t *cp, uint64_t *val)
91 //
92 // The return value can then be assigned to 32-bit or 64-bit type
93 // without need of a new function name. The cost is we can't then
94 // do "cp += var_get_u32(cp, endp, &u_freq_sz);". Maybe we can't do
95 // overflow detection with former? (Want 32-bit but got, say, 40 bit)
96
97
98 // static inline char *var_dump(const uint8_t *cp, int n) {
99 // static char buf[1000];
100 // int i, o = 0;
101 // for (i = 0; i < n; i++)
102 // o += sprintf(&buf[o], " %d", cp[i]);
103 // return buf;
104 // }
105
var_put_u64(uint8_t * cp,const uint8_t * endp,uint64_t x)106 static inline int var_put_u64(uint8_t *cp, const uint8_t *endp, uint64_t x) {
107 uint8_t *op = cp;
108
109 if (x < 177) {
110 if (endp && endp - cp < 1) return 0;
111 // 0 to 176 in single byte as-is
112 *cp++ = x;
113 } else if (x < 16561) {
114 if (endp && endp - cp < 2) return 0;
115 *cp++ = ((x-177)>>8)+177;
116 *cp++ = x-177;
117 } else if (x < 540849) {
118 if (endp && endp - cp < 3) return 0;
119 *cp++ = ((x-16561)>>16)+241;
120 *cp++ = (x-16561)>>8;
121 *cp++ = x-16561;
122 } else if (x < (1<<24)) {
123 if (endp && endp - cp < 4) return 0;
124 *cp++ = 249;
125 *cp++ = x>>16;
126 *cp++ = x>>8;
127 *cp++ = x;
128 } else if (x < (1LL<<32)) {
129 if (endp && endp - cp < 5) return 0;
130 *cp++ = 250;
131 *cp++ = x>>24;
132 *cp++ = x>>16;
133 *cp++ = x>>8;
134 *cp++ = x;
135 } else if (x < (1LL<<40)) {
136 if (endp && endp - cp < 6) return 0;
137 *cp++ = 251;
138 *cp++ = x>>32;
139 *cp++ = x>>24;
140 *cp++ = x>>16;
141 *cp++ = x>>8;
142 *cp++ = x;
143 } else if (x < (1LL<<48)) {
144 if (endp && endp - cp < 7) return 0;
145 *cp++ = 252;
146 *cp++ = x>>40;
147 *cp++ = x>>32;
148 *cp++ = x>>24;
149 *cp++ = x>>16;
150 *cp++ = x>>8;
151 *cp++ = x;
152 } else if (x < (1LL<<56)) {
153 if (endp && endp - cp < 8) return 0;
154 *cp++ = 253;
155 *cp++ = x>>48;
156 *cp++ = x>>40;
157 *cp++ = x>>32;
158 *cp++ = x>>24;
159 *cp++ = x>>16;
160 *cp++ = x>>8;
161 *cp++ = x;
162 } else {
163 if (endp && endp - cp < 9) return 0;
164 *cp++ = 254;
165 *cp++ = x>>56;
166 *cp++ = x>>48;
167 *cp++ = x>>40;
168 *cp++ = x>>32;
169 *cp++ = x>>24;
170 *cp++ = x>>16;
171 *cp++ = x>>8;
172 *cp++ = x;
173 }
174
175 // fprintf(stderr, "Put64 %d (%s)\n", x, var_dump(op, cp-op));
176
177 return cp-op;
178 }
179
var_put_u32(uint8_t * cp,const uint8_t * endp,uint32_t x)180 static inline int var_put_u32(uint8_t *cp, const uint8_t *endp, uint32_t x) {
181 uint8_t *op = cp;
182
183 if (x < 177) {
184 if (endp && endp - cp < 1) abort();//return 0;
185 // 0 to 176 in single byte as-is
186 *cp++ = x;
187 } else if (x < 16561) {
188 if (endp && endp - cp < 2) abort();//return 0;
189 *cp++ = ((x-177)>>8)+177;
190 *cp++ = x-177;
191 } else if (x < 540849) {
192 if (endp && endp - cp < 3) abort();//return 0;
193 *cp++ = ((x-16561)>>16)+241;
194 *cp++ = (x-16561)>>8;
195 *cp++ = x-16561;
196 } else if (x < (1<<24)) {
197 if (endp && endp - cp < 4) abort();//return 0;
198 *cp++ = 249;
199 *cp++ = x>>16;
200 *cp++ = x>>8;
201 *cp++ = x;
202 } else {
203 if (endp && endp - cp < 5) abort();//return 0;
204 *cp++ = 250;
205 *cp++ = x>>24;
206 *cp++ = x>>16;
207 *cp++ = x>>8;
208 *cp++ = x;
209 }
210
211 // fprintf(stderr, "Put32 %d (%s)\n", x, var_dump(op, cp-op));
212
213 return cp-op;
214 }
215
var_get_u64(uint8_t * cp,const uint8_t * endp,uint64_t * i)216 static inline int var_get_u64(uint8_t *cp, const uint8_t *endp, uint64_t *i) {
217 uint8_t *op = cp;
218 uint64_t j = 0;
219
220 if (endp && cp >= endp) {
221 *i = 0;
222 return 0;
223 }
224 if (*cp < 177) {
225 j = *cp++;
226 } else if (*cp < 241) {
227 j = ((cp[0] - 177)<<8) + cp[1] + 177;
228 cp += 2;
229 } else if (*cp < 249) {
230 j = ((cp[0] - 241)<<16) + (cp[1]<<8) + cp[2] + 16561;
231 cp += 3;
232 } else {
233 int n = *cp++ - 249 + 3;
234 while (n--)
235 j = (j<<8) + *cp++;
236 }
237
238 // fprintf(stderr, "Get64 %ld (%s)\n", j, var_dump(op, cp-op));
239
240 *i = j;
241 return cp-op;
242 }
243
var_get_u32(uint8_t * cp,const uint8_t * endp,uint32_t * i)244 static inline int var_get_u32(uint8_t *cp, const uint8_t *endp, uint32_t *i) {
245 uint8_t *op = cp;
246 uint32_t j = 0;
247
248 if (endp && cp >= endp) {
249 *i = 0;
250 return 0;
251 }
252 if (*cp < 177) {
253 j = *cp++;
254 } else if (*cp < 241) {
255 j = ((cp[0] - 177)<<8) + cp[1] + 177;
256 cp += 2;
257 } else if (*cp < 249) {
258 j = ((cp[0] - 241)<<16) + (cp[1]<<8) + cp[2] + 16561;
259 cp += 3;
260 } else {
261 int n = *cp++ - 249 + 3;
262 while (n--)
263 j = (j<<8) + *cp++;
264 }
265
266 // fprintf(stderr, "Get32 %d (%s)\n", j, var_dump(op, cp-op));
267
268 *i = j;
269 return cp-op;
270 }
271
272 // Signed versions of the above using zig-zag integer encoding.
273 // This folds the sign bit into the bottom bit so we iterate
274 // 0, -1, +1, -2, +2, etc.
var_put_s32(uint8_t * cp,const uint8_t * endp,int32_t i)275 static inline int var_put_s32(uint8_t *cp, const uint8_t *endp, int32_t i) {
276 return var_put_u32(cp, endp, (i << 1) ^ (i >> 31));
277 }
var_put_s64(uint8_t * cp,const uint8_t * endp,int64_t i)278 static inline int var_put_s64(uint8_t *cp, const uint8_t *endp, int64_t i) {
279 return var_put_u64(cp, endp, (i << 1) ^ (i >> 63));
280 }
281
var_get_s32(uint8_t * cp,const uint8_t * endp,int32_t * i)282 static inline int var_get_s32(uint8_t *cp, const uint8_t *endp, int32_t *i) {
283 int b = var_get_u32(cp, endp, (uint32_t *)i);
284 *i = (*i >> 1) ^ -(*i & 1);
285 return b;
286 }
var_get_s64(uint8_t * cp,const uint8_t * endp,int64_t * i)287 static inline int var_get_s64(uint8_t *cp, const uint8_t *endp, int64_t *i) {
288 int b = var_get_u64(cp, endp, (uint64_t *)i);
289 *i = (*i >> 1) ^ -(*i & 1);
290 return b;
291 }
292
var_size_u64(uint64_t v)293 static inline int var_size_u64(uint64_t v) {
294 if (v < 177)
295 return 1;
296 else if (v < 16561)
297 return 2;
298 else if (v < 540849)
299 return 3;
300
301 int i = 0;
302 do {
303 v >>= 8;
304 i++;
305 } while (v);
306
307 // fprintf(stderr, "Size %ld (%d)\n", v, i+1);
308
309 return i+1;
310 }
311 #define var_size_u32 var_size_u64
312
var_size_s64(int64_t v)313 static inline int var_size_s64(int64_t v) {
314 return var_size_u64((v >> 63) ^ (v << 1));
315 }
316 #define var_size_s32 var_size_s64
317
318 #endif /* VARINT2_H */
319