1 /* Variable-length binary prefix codes for integers
2 *
3 * Contents:
4 * 1. Exponential Golomb codes
5 * 2. Golomb-Rice codes
6 * 3. Elias delta code
7 * 4. Google varint codes
8 * 5. Unit tests
9 * 6. Test driver
10 * 7. Example
11 *
12 * See also:
13 * esl_huffman : Huffman coding
14 */
15 #include "esl_config.h"
16
17 #include <stdio.h>
18 #include <limits.h>
19
20 #include "easel.h"
21 #include "esl_varint.h"
22
23 /*****************************************************************
24 * 1. Exponential Golomb codes
25 *****************************************************************/
26
27 /* Function: esl_varint_expgol()
28 * Synopsis: Generalized exponential Golomb coding
29 * Incept: SRE, Wed 09 Jan 2019
30 *
31 * Purpose: Encode nonnegative integer <v> in exponential-Golomb-<k>
32 * code. $k=0$ is standard exponential Golomb code; $k>0$
33 * are the generalized Golomb codes. Optionally return codeword in
34 * low-order bits of <*opt_code> (i.e. right flush), and
35 * its length in <*opt_n>.
36 *
37 * Args: v : integer to encode; v >= 0
38 * k : parameter for which exp-Golomb code; k >= 0
39 * opt_code : optRETURN: encoding, right flushed (low order bits)
40 * opt_n : optRETURN: length of encoding in bits
41 *
42 * Returns: <eslOK> on success.
43 *
44 * Throws: <eslERANGE> if caller requested <*opt_code> to be returned,
45 * but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
46 * and <*opt_n> are set to 0.
47 */
48 int
esl_varint_expgol(int v,int k,uint64_t * opt_code,int * opt_n)49 esl_varint_expgol(int v, int k, uint64_t *opt_code, int *opt_n)
50 {
51 ESL_DASSERT1(( v >= 0 && k >= 0 ));
52
53 uint64_t m = (1 << k); // 2^k. For k=0 standard expgol: m=1,
54 uint64_t q = v / m; // \floor (v / 2^k) piece, to be encoded in exp-golomb-0 ... q=v, all bits here,
55 uint64_t r = v % m; // will become rightmost bits ... r=0, no bits stored
56 int n = 0;
57 uint64_t code = q+1;
58 int status;
59
60 // q stored in exp-golomb-0. We already set low order bits to q+1; now we just need to determine width of q+1.
61 q = q+1;
62 while (q) { q >>= 1; n++; } // Let a = # of bits needed to represent q+1
63 n = 2 * n - 1; // now a-1 leading bits are 0, followed by a bits of q+1, for an exp-golomb-0 code.
64
65 if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "exponential Golomb codeword length > 64");
66
67 // r stored in its binary representation, k bits.
68 code = (code << k) | r;
69 n += k;
70
71 if (opt_code) *opt_code = code;
72 if (opt_n) *opt_n = n;
73 return eslOK;
74
75 ERROR:
76 if (opt_code) *opt_code = 0;
77 if (opt_n) *opt_n = 0;
78 return status;
79 }
80
81
82 /* Function: esl_varint_expgol_decode()
83 * Synopsis: Decode an exponential Golomb codeword.
84 * Incept: SRE, Thu 10 Jan 2019 [Slaid Cleaves, Still Fighting the War]
85 *
86 * Purpose: Decode the leading (high order) prefix bits of <code>
87 * according to exponential-Golomb-<k> code. Optionally
88 * return the decoded integer value in <*opt_v>, and the
89 * length of the decoded codeword in <*opt_n>.
90 *
91 * Note that <code> is _left flush_ for decoding: the
92 * prefix codeword is decoded from the leftmost, most
93 * significant bit.
94 *
95 * If <code> consists of only 0's, and therefore has no
96 * complete codeword, return <eslEOD>. This can be used to
97 * detect normal end of a bit string.
98 *
99 * If <code> does contain a 1 bit but does not contain a
100 * complete codeword, assume it is corrupted somehow,
101 * and return <eslECORRUPT>.
102 *
103 * Args: code : bit string to decode, starting from MSB
104 * k : parameter for which exp-Golomb code; k >= 0
105 * opt_v: optRETURN: integer value
106 * opt_n: optRETURN: length of decoded prefix codeword, in bits
107 *
108 * Returns: <eslOK> on success.
109 *
110 * <eslEOD> if <code> is all 0 bits.
111 * <eslECORRUPT> if at least one 1 bit is set, yet no
112 * codeword is present. On errors, <*opt_v> and <*opt_n>
113 * are 0.
114 */
115 int
esl_varint_expgol_decode(uint64_t code,int k,int * opt_v,int * opt_n)116 esl_varint_expgol_decode(uint64_t code, int k, int *opt_v, int *opt_n)
117 {
118 int b,n,r,v;
119 int status;
120
121 if (! code) { status = eslEOD; goto ERROR; } // all exp-Golomb codewords contain at least one 1
122
123 for (b = 63; b >= 0; b--)
124 if (code & (1ull << b)) break; // now b is position of leftmost 1 bit (i.e. MSB); 63..0.
125 n = 2*(64-b)+k-1; // ...and from that, we know the size of the codeword.
126 if (n > 64) { status = eslECORRUPT; goto ERROR; } // looks like a code but can't be valid
127
128 // the n-bit codeword consists of the binary representation of q+1 (with leading zeros),
129 // followed by a k-bit remainder.
130 code >>= 64-n;
131 r = code & ((1ull << k) - 1);
132 code >>= k;
133 v = (code-1) * (1ull << k) + r;
134
135 if (opt_v) *opt_v = v;
136 if (opt_n) *opt_n = n;
137 return eslOK;
138
139 ERROR:
140 if (opt_v) *opt_v = 0;
141 if (opt_n) *opt_n = 0;
142 return status;
143 }
144
145
146 /*****************************************************************
147 * 2. Golomb-Rice codes
148 *****************************************************************/
149
150 /* Function: esl_varint_rice()
151 * Synopsis: Golomb-Rice coding
152 * Incept: SRE, Sat 12 Jan 2019 [Ramin Djawadi, Westworld, The Maze]
153 *
154 * Purpose: Encode nonnegative integer <v> in Golomb-Rice-<k> code,
155 * for $k \geq 0$. Optionally return codeword in low-order bits of
156 * <*opt_code> (i.e. right flush) and its length in <*opt_n>.
157 *
158 * Args: v : integer to encode; v >= 0
159 * k : parameter for which Golomb-Rice code; k >= 0
160 * opt_code : optRETURN: encoding, right flushed (low order bits)
161 * opt_n : optRETURN: length of encoding in bits
162 *
163 * Returns: <eslOK> on success.
164 *
165 * Throws: <eslERANGE> if caller requested <*opt_code> to be returned,
166 * but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
167 * and <*opt_n> are set to 0.
168 */
169 int
esl_varint_rice(int v,int k,uint64_t * opt_code,int * opt_n)170 esl_varint_rice(int v, int k, uint64_t *opt_code, int *opt_n)
171 {
172 ESL_DASSERT1(( v >= 0 && k >= 0));
173 int m = (1 << k); // 2^k
174 int q = v / m; // int(v / 2^k), quotient. code leads with this many 1's; then a 0
175 int r = v % m; // remainder. encoded in k bits
176 uint64_t code = 0;
177 int status;
178
179 if (opt_code && q > 64-(k+1)) ESL_XEXCEPTION(eslERANGE, "Golomb-Rice codeword length > 64");
180
181 code = (1ull << q) - 1; // sets q low-order bits to 1; we know 2^{q+1} fits in uint64_t
182 code = code << (1 + k); // bit k+1 is a 0; low-order k bits hold r
183 code |= r;
184
185 if (opt_code) *opt_code = code;
186 if (opt_n) *opt_n = q + k + 1;
187 return eslOK;
188
189 ERROR:
190 if (opt_code) *opt_code = 0;
191 if (opt_n) *opt_n = 0;
192 return status;
193 }
194
195
196 /* Function: esl_varint_rice_decode()
197 * Synopsis: Decode a Golomb-Rice codeword
198 * Incept: SRE, Sat 12 Jan 2019 [Real Madrid v. Leganes]
199 *
200 * Purpose: Decode the leading (high order) prefix bits of <code>
201 * according to Golomb-Rice-<k> code. Optionally return the
202 * decoded integer value in <*opt_v>, and the length
203 * of the decoded codeword in <*opt_n>.
204 *
205 * Note that <code> is _left flush_ for decoding:
206 * the prefix codeword is decoded from the leftmost,
207 * most significant bit.
208 *
209 * If <code> does not contain a 0 bit, return <eslEOD>; it
210 * cannot contain a valid codeword. A caller can shift on
211 * a pattern of all 1 bits to signal EOD.
212 *
213 * There is no check for overflowing <v>. We assume that
214 * <code> encodes a value that fits in an int.
215 *
216 * Args: code : bit string to decode, starting from MSB
217 * k : parameter for which Google varint code; k >= 2
218 * opt_v: optRETURN: integer value
219 * opt_n: optRETURN: length of decoded prefix codeword, in bits
220 *
221 * Returns: <eslOK> on success.
222 * <eslEOD> if <code> has no completed codeword.
223 *
224 * Throws: (no abnormal error conditions)
225 */
226 int
esl_varint_rice_decode(uint64_t code,int k,int * opt_v,int * opt_n)227 esl_varint_rice_decode(uint64_t code, int k, int *opt_v, int *opt_n)
228 {
229 int q;
230 int status;
231
232 if (~code == 0) { status = eslEOD; goto ERROR; } // valid rice code has at least one 0 bit
233
234 for (q = 0; q <= 64-k; q++)
235 if ( (~code) & (1ull << (63-q))) break; // q = # of leading 1 bits. +1, +k: n = q + k + 1.
236
237 code >>= 64-(q+k+1); // shift 64 - (q+k+1) bits off to the right, right-flushing the codeword.
238 /* q * 2^k + r, obtained by masking low k bits */
239 if (opt_v) *opt_v = (int) (q * (1ull << k) + (code & ( (1ull << (k+1)) - 1)));
240 if (opt_n) *opt_n = q + k + 1;
241 return eslOK;
242
243 ERROR:
244 if (opt_v) *opt_v = 0;
245 if (opt_n) *opt_n = 0;
246 return status;
247 }
248
249
250
251 /*****************************************************************
252 * 3. Elias delta code
253 *****************************************************************/
254
255 /* Function: esl_varint_delta
256 * Synopsis: Elias delta coding
257 * Incept: SRE, Sat 12 Jan 2019
258 *
259 * Purpose: Encode positive integer <v> in Elias delta code.
260 * Optionally return codeword in low-order bits of
261 * <*opt_code> (i.e. right flush), and its length
262 * in <*opt_n>.
263 *
264 * Args: v : integer to encode; v >= 1
265 * opt_code : optRETURN: encoding, right flushed (low order bits)
266 * opt_n : optRETURN: length of encoding in bits
267 *
268 * Returns: <eslOK> on success.
269 *
270 * Throws: <eslERANGE> if caller requested <*opt_code> to be returned,
271 * but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
272 * and <*opt_n> are set to 0.
273 */
274 int
esl_varint_delta(int v,uint64_t * opt_code,int * opt_n)275 esl_varint_delta(int v, uint64_t *opt_code, int *opt_n)
276 {
277 ESL_DASSERT1(( v >= 1 ));
278 int a = 0; // \floor log_2 v 2^a is the highest power of 2 in v
279 int b = 0; // \floor log_2 (a+1)
280 int n = 0;
281 uint64_t code = 0;
282 uint64_t tmp;
283 int status;
284
285 tmp = ((uint64_t) v >> 1); while (tmp) { tmp = tmp >> 1; a++; }
286 tmp = ((uint64_t) (a+1) >> 1); while (tmp) { tmp = tmp >> 1; b++; }
287
288 n = b; // b leading zeros
289 n += b+1; code = a+1; // b+1 bits, representation of a+1
290 n += a; code = (code << a) | (v & ((1 << a) - 1)); // a bits, v % 2^a
291
292 if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "Elias delta codeword length > 64");
293
294 if (opt_code) *opt_code = code;
295 if (opt_n) *opt_n = n;
296 return eslOK;
297
298 ERROR:
299 if (opt_code) *opt_code = 0;
300 if (opt_n) *opt_n = 0;
301 return status;
302 }
303
304
305 /* Function: esl_varint_delta_decode()
306 * Synopsis: Decode an Elias delta codeword.
307 * Incept: SRE, Sat 12 Jan 2019
308 *
309 * Purpose: Decode the leading (high order) prefix bits of <code>
310 * using Elias delta code. Optionally return the decoded
311 * integer value in <*opt_v>, and/or the length of the
312 * decoded codeword in <*opt_n>.
313 *
314 * Note that <code> is _left flush_ for decoding: the
315 * prefix codeword is decoded from the leftmost, most
316 * significant bit.
317 *
318 * If <code> consists of only 0's, and therefore has
319 * no complete codeword, return <eslEOD>. This can be
320 * used to detect the normal end of a bit string.
321 *
322 * If <code> does contain a 1 bit but does not contain a
323 * complete codeword, assume it is corrupted somehow, and
324 * throw <eslECORRUPT>.
325 *
326 * Args: code : bit string to decode, starting from MSB
327 * opt_v: optRETURN: integer value
328 * opt_n: optRETURN: length of decoded prefix codeword, in bits
329 *
330 * Returns: <eslOK> on success.
331 *
332 * <eslEOD> if <code> is all 0 bits.
333 * <eslECORRUPT> if at least one 1 bit is set, yet no codeword
334 * is present. On errors, <*opt_v> and <*opt_n> are 0.
335 */
336 int
esl_varint_delta_decode(uint64_t code,int * opt_v,int * opt_n)337 esl_varint_delta_decode(uint64_t code, int *opt_v, int *opt_n)
338 {
339 int a,b,n,v, status;
340
341 if (code == 0) { status = eslEOD; goto ERROR; }
342
343 for (b = 0; b < 64; b++)
344 if (code & (1ull << (63-b))) break; // stop on first 1. b = # of leading 0 bits
345 a = (code >> (63 - 2*b)) - 1; // After b leading 0's, b+1 bits encode a+1
346 n = 2*b + a + 1;
347
348 if (n > 64) { status = eslECORRUPT; goto ERROR; }
349
350 v = (code >> (63 - 2*b -a)); // ... and a bits encodes v mod 2^a, all but its highest order bit
351 v += (1 << a); // put the 2^a bit back
352
353 // Last a bits encodes v mod 2^a, all but the highest order bit;
354 // shift and mask to get those a bits, then add 2^a back.
355 if (opt_v) *opt_v = (1<<a) + ((code >> (64 - n)) & ( (1<<a)-1));
356 if (opt_n) *opt_n = n;
357 return eslOK;
358
359 ERROR:
360 if (opt_v) *opt_v = 0;
361 if (opt_n) *opt_n = 0;
362 return status;
363 }
364
365
366
367
368 /*****************************************************************
369 * 4. Google varint codes
370 *****************************************************************/
371
372 /* Function: esl_varint_google()
373 * Synopsis: Google varint coding
374 * Incept: SRE, Fri 11 Jan 2019 [Eminem, Lose Yourself]
375 *
376 * Purpose: Encode nonnegative integer <v> in Google varint-<k> code;
377 * $k \geq 2$; $k=8$ gives you one-byte codewords, the code
378 * used by Google Protobuf. Optionally return code in
379 * low-order bits of <*opt_code> (i.e. right flushed), and
380 * length of that code in <*opt_n>.
381 *
382 * Args: v : integer to encode; v >= 0
383 * k : parameter for which Google varint code; k >= 2
384 * opt_code : optRETURN: encoding, right flushed (low order bits)
385 * opt_n : optRETURN: length of encoding in bits
386 *
387 * Returns: <eslOK> on success.
388 *
389 * Throws: <eslERANGE> if caller requested <*opt_code> to be returned,
390 * but binary encoding won't fit in a single
391 * <uint64_t>. <*opt_code> and <*opt_n> are set to 0.
392 */
393 int
esl_varint_google(int v,int k,uint64_t * opt_code,int * opt_n)394 esl_varint_google(int v, int k, uint64_t *opt_code, int *opt_n)
395 {
396 ESL_DASSERT1(( k >= 2 ));
397 ESL_DASSERT1(( v >= 0 ));
398
399 int base = 1 << (k-1); // 2^(k-1)
400 uint64_t code = 0;
401 int n = 0;
402 int status;
403
404 code |= v % base;
405 v /= base;
406 n += k;
407 while (v > 0)
408 {
409 code |= (1 << (k-1));
410 code = code << k;
411
412 code |= v % base;
413 v /= base;
414 n += k;
415 }
416 if (n > 64 && opt_code) ESL_XEXCEPTION(eslERANGE, "google varint code length > 64");
417
418 if (opt_code) *opt_code = code;
419 if (opt_n) *opt_n = n;
420 return eslOK;
421
422 ERROR:
423 if (opt_code) *opt_code = 0;
424 if (opt_n) *opt_n = 0;
425 return status;
426 }
427
428
429 /* Function: esl_varint_google_decode()
430 * Synopsis: Decode a Google varint codeword
431 * Incept: SRE, Fri 11 Jan 2019
432 *
433 * Purpose: Decode the leading (high order) prefix bits of <code>
434 * according to Google varint-<k> code. Optionally
435 * return the decoded integer value in <*opt_v>, and the
436 * length of the decoded codeword in <*opt_n>.
437 *
438 * Note that <code> is _left flush_ for decoding: the
439 * prefix codeword is decoded from the leftmost, most
440 * significant bit.
441 *
442 * If <code> does not contain a termination codegroup (i.e.
443 * it consists only of continuation groups that start with
444 * 1), return <eslEOD>. A caller can shift on a pattern of
445 * all 1 bits to signal EOD.
446 *
447 * There is no check for overflowing <v>. We assume that
448 * <code> encodes a value that fits in an int.
449 *
450 * Args: code : bit string to decode, starting from MSB
451 * k : parameter for which Google varint code; k >= 2
452 * opt_v: optRETURN: integer value
453 * opt_n: optRETURN: length of decoded prefix codeword, in bits
454 *
455 * Returns: <eslOK> on success.
456 * <eslEOD> if <code> has no completed codeword.
457 */
458 int
esl_varint_google_decode(uint64_t code,int k,int * opt_v,int * opt_n)459 esl_varint_google_decode(uint64_t code, int k, int *opt_v, int *opt_n)
460 {
461 uint64_t cmask = 1ull << (k-1);
462 uint64_t vmask = cmask-1;
463 int v = 0;
464 uint64_t grp; // current k-bit code group
465 int g; // which group (least significant group first)
466 int status;
467
468 for (g = 0; g < 64/k; code <<= k, g++)
469 {
470 grp = code >> (64-k);
471 v += (grp & vmask) << (g*(k-1));
472 if (!(grp & cmask)) break;
473 }
474 if (grp & cmask) { status = eslEOD; goto ERROR; }
475
476 if (opt_v) *opt_v = v;
477 if (opt_n) *opt_n = (g+1)*k;
478 return eslOK;
479
480 ERROR:
481 if (opt_v) *opt_v = 0;
482 if (opt_n) *opt_n = 0;
483 return status;
484 }
485
486
487
488
489 /*****************************************************************
490 * 5. Unit tests
491 *****************************************************************/
492 #ifdef eslVARINT_TESTDRIVE
493
494 #include "esl_random.h"
495
496 /* floor_log2()
497 * Returns $w = \lfloor \log_2 v \rfloor$
498 * w is the highest power of 2 in v;
499 * equivalently, w is the MSB set in v
500 *
501 * Example: v=53 ==> 00110101 ==> w=5 (2^5 = 32)
502 */
503 static int
floor_log2(int v)504 floor_log2(int v)
505 {
506 return (v == 0 ? 0 : (int) floor(log2(v)));
507 }
508
509 /* floor_log2k()
510 * Returns $\lfloor \log_{2^k} v \rfloor$
511 */
512 static int
floor_log2k(int v,int k)513 floor_log2k(int v, int k)
514 {
515 return ( v == 0 ? 0 : (int) floor( log2(v)) / k);
516 }
517
len_expgol(int v,int k)518 int len_expgol(int v, int k) { return (2 * floor_log2((v / (1<<k)) + 1) + 1 + k); }
max_expgol(int b,int k)519 int64_t max_expgol(int b, int k) { return (((1ull << (((b-k-1)/2) + 1)) - 1) << k) - 1; }
len_rice(int v,int k)520 int len_rice (int v, int k) { return (1 + k + (v / (1ull << k))); }
max_rice(int b,int k)521 int max_rice (int b, int k) { return ( (1ull << k) * (b-k) - 1); }
len_delta(int v)522 int len_delta (int v) { return (floor_log2(v) + 2 * floor_log2( (floor_log2(v) + 1)) + 1); }
len_google(int v,int k)523 int len_google(int v, int k) { return ( k * (floor_log2k(v,k-1) + 1)); }
max_google(int b,int k)524 int64_t max_google(int b, int k) { return ((1ull << ((b/k) * (k-1))) - 1); }
525
526
527 static void
utest_expgol(ESL_RANDOMNESS * rng)528 utest_expgol(ESL_RANDOMNESS *rng)
529 {
530 char msg[] = "utest_expgol exponential Golomb tests failed";
531 uint64_t code;
532 int64_t maxv;
533 int v, v2;
534 int n, n2;
535 int k;
536 int b;
537 int i;
538
539 /* Spot check exponential Golomb (standard, k=0) against
540 * documentation's example table.
541 * 9 encodes as 0001010, len 7, e.g. 0xa
542 */
543 if ( esl_varint_expgol( 9, 0, &code, &n) != eslOK) esl_fatal(msg);
544 if ( code != 0xall || n != 7) esl_fatal(msg);
545
546 /* Also spot check documented exp-Golomb-2 code
547 * 9 encodes as 01101, len 5, e.g. 0xd
548 */
549 if ( esl_varint_expgol( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
550 if ( code != 0xdll || n != 5) esl_fatal(msg);
551
552 /* Systematically check exp-Golomb-k codes from k=0..10 */
553 for (k = 0; k <= 10; k++)
554 {
555 /* Check encoding of v=0..999 */
556 for (v = 0; v < 1000; v++)
557 {
558 if ( esl_varint_expgol(v, k, &code, &n) != eslOK) esl_fatal(msg);
559 if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
560 if (n != len_expgol(v, k)) esl_fatal(msg);
561 if (v != v2 || n != n2) esl_fatal(msg);
562 }
563
564 /* Check a bunch of random values on valid range */
565 maxv = ESL_MIN(INT_MAX, max_expgol(64, k)); // this is always INT_MAX
566 for (i = 0; i < 1000; i++)
567 {
568 v = esl_rnd_Roll(rng, (int) maxv); // 0..maxv-1, anyway.
569 if ( esl_varint_expgol(v, k, &code, &n) != eslOK) esl_fatal(msg);
570 if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
571 if (n != len_expgol(v, k)) esl_fatal(msg);
572 if (v != v2 || n != n2) esl_fatal(msg);
573 }
574
575 /* Check our documented maxv calculations, and encoding at each max */
576 for (b = k+1; b <= 62; b++) // 62, because maxv+1 will need larger code, and exp-golomb codes grow in steps of 2
577 {
578 maxv = max_expgol(b, k);
579 if (maxv <= INT_MAX) // check that we can represent maxv in an int
580 {
581 if ( esl_varint_expgol((int) maxv, k, &code, &n) != eslOK) esl_fatal(msg);
582 if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
583 if (n != len_expgol(maxv, k)) esl_fatal(msg);
584 if (maxv != v2 || n != n2) esl_fatal(msg);
585 }
586 if (maxv < INT_MAX) // maxv+1 can be coded
587 {
588 if ( esl_varint_expgol((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
589 if (n2 <= n) esl_fatal(msg);
590 }
591 }
592 }
593 }
594
595
596 static void
utest_rice(ESL_RANDOMNESS * rng)597 utest_rice(ESL_RANDOMNESS *rng)
598 {
599 char msg[] = "utest_rice Golomb-Rice tests failed";
600 uint64_t code;
601 int64_t maxv;
602 int v, v2;
603 int n, n2;
604 int k;
605 int b;
606 int i;
607
608 /* Spot check against documentation's example table for Golomb-Rice-2.
609 * 9 encodes as 11001, len 5, e.g. 0x19
610 */
611 if ( esl_varint_rice( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
612 if ( code != 0x19ll || n != 5) esl_fatal(msg);
613
614 /* Systematically check Golomb-Rice-k codes from k=0..10 */
615 for (k = 0; k <= 10; k++)
616 {
617 /* Check encoding of v=0..1000 */
618 maxv = ESL_MIN(1000, max_rice(64, k));
619 for (v = 0; v <= maxv; v++)
620 {
621 if ( esl_varint_rice(v, k, &code, &n) != eslOK) esl_fatal(msg);
622 if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
623 if (n != len_rice(v, k)) esl_fatal(msg);
624 if (v != v2 || n != n2) esl_fatal(msg);
625 }
626
627 /* Check a bunch of random values on valid range */
628 maxv = ESL_MIN(INT_MAX, max_rice(64, k)); // Rice codes have limited range.
629 for (i = 0; i < 1000; i++)
630 {
631 v = esl_rnd_Roll(rng, (int) maxv); // 0..maxv-1, anyway.
632 if ( esl_varint_rice(v, k, &code, &n) != eslOK) esl_fatal(msg);
633 if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
634 if (n != len_rice(v, k)) esl_fatal(msg);
635 if (v != v2 || n != n2) esl_fatal(msg);
636 }
637
638 /* Check our documented maxv calculations, and encoding at each max */
639 for (b = k+1; b <= 63; b++) // 63, because maxv+1 will need larger code, and Golomb-Rice codes grow in steps of 1
640 {
641 maxv = max_rice(b, k);
642 if (maxv <= INT_MAX) // check that we can represent maxv in an int
643 {
644 if ( esl_varint_rice((int) maxv, k, &code, &n) != eslOK) esl_fatal(msg);
645 if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
646 if (n != len_rice(maxv, k)) esl_fatal(msg);
647 if (maxv != v2 || n != n2) esl_fatal(msg);
648 }
649 if (maxv < INT_MAX) // maxv+1 can be coded
650 {
651 if ( esl_varint_rice((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
652 if (n2 <= n) esl_fatal(msg);
653 }
654 }
655 }
656 }
657
658 static void
utest_delta(ESL_RANDOMNESS * rng)659 utest_delta(ESL_RANDOMNESS *rng)
660 {
661 char msg[] = "utest_delta Elias delta tests failed";
662 uint64_t code;
663 int64_t maxv;
664 int v, v2;
665 int n, n2;
666 int i;
667
668 /* Spot check against documentation's example table for Elias delta.
669 * 9 encodes as 00100001, len 8, e.g. 0x21
670 */
671 if ( esl_varint_delta( 9, &code, &n) != eslOK) esl_fatal(msg);
672 if ( code != 0x21ll || n != 8) esl_fatal(msg);
673
674 /* Check encoding of v=1..999 */
675 for (v = 1; v < 1000; v++)
676 {
677 if ( esl_varint_delta(v, &code, &n) != eslOK) esl_fatal(msg);
678 if ( esl_varint_delta_decode(code << (64-n), &v2, &n2) != eslOK) esl_fatal(msg);
679 if (n != len_delta(v)) esl_fatal(msg);
680 if (v != v2 || n != n2) esl_fatal(msg);
681 }
682
683 /* Check a bunch of random values on valid range */
684 maxv = INT_MAX;
685 for (i = 0; i < 1000; i++)
686 {
687 v = 1 + esl_rnd_Roll(rng, (int) maxv); // 1..INT_MAX
688 if ( esl_varint_delta(v, &code, &n) != eslOK) esl_fatal(msg);
689 if ( esl_varint_delta_decode(code << (64-n), &v2, &n2) != eslOK) esl_fatal(msg);
690 if (n != len_delta(v)) esl_fatal(msg);
691 if (v != v2 || n != n2) esl_fatal(msg);
692 }
693
694 /* (we didn't document or even bother to analyze maxv given b bits for Elias delta) */
695 }
696
697
698
699 static void
utest_google(ESL_RANDOMNESS * rng)700 utest_google(ESL_RANDOMNESS *rng)
701 {
702 char msg[] = "utest_google Google varint tests failed";
703 uint64_t code;
704 int v, v2;
705 int n, n2;
706 int k;
707 int b;
708 int i;
709 int64_t maxv;
710
711 /* Spot check against our documentation's example table:
712 * 9, in google-2, encodes as 11101001, len 8; e.g. 0xe9
713 */
714 if ( esl_varint_google( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
715 if ( code != 0xe9ll) esl_fatal(msg);
716 if ( n != 8) esl_fatal(msg);
717
718 /* Systematically check all codes k=2..10 */
719 for (k = 2; k <= 10; k++)
720 {
721 /* Check encoding of all values up to 999 */
722 for (v = 0; v < 1000; v++)
723 {
724 if ( esl_varint_google(v, k, &code, &n) != eslOK) esl_fatal(msg);
725 if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
726 if (n != len_google(v, k)) esl_fatal(msg);
727 if (v != v2) esl_fatal(msg);
728 if (n != n2) esl_fatal(msg);
729 }
730
731 /* Check a bunch of random values on valid range
732 */
733 maxv = ESL_MIN(INT_MAX, max_google(64, k)); // this calculation is always INT_MAX but its heart is in the right place
734 for (i = 0; i < 1000; i++)
735 {
736 v = esl_rnd_Roll(rng, (int) maxv); // well, that's 0..maxv-1 anyway. getting 0..maxv, when maxv is INT_MAX, is more trouble than it's worth.
737 if ( esl_varint_google(v, k, &code, &n) != eslOK) esl_fatal(msg);
738 if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
739 if (n != len_google(v, k)) esl_fatal(msg);
740 if (v != v2) esl_fatal(msg);
741 if (n != n2) esl_fatal(msg);
742 }
743
744 /* Check our maxv calculations, and encoding at each max
745 */
746 for (b = k; b <= 64-k; b++) // 64-k because maxv+1 needs larger code, and google codes grow in steps of k
747 {
748 maxv = max_google(b, k);
749 if (maxv <= INT_MAX) // check that we can represent maxv in an int
750 {
751 if ( esl_varint_google((int) maxv, k, &code, &n) != eslOK) esl_fatal(msg);
752 if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
753 if (n != len_google(maxv, k)) esl_fatal(msg);
754 if (maxv != v2) esl_fatal(msg);
755 if (n != n2) esl_fatal(msg);
756 }
757 if (maxv < INT_MAX) // and does maxv+1 fit in an int - check that size indeed steps up
758 {
759 if ( esl_varint_google((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
760 if (n2 <= n) esl_fatal(msg);
761 }
762 }
763 }
764 }
765 #endif // eslVARINT_TESTDRIVE
766
767
768 /*****************************************************************
769 * 6. Test driver
770 *****************************************************************/
771 #ifdef eslVARINT_TESTDRIVE
772
773 #include "easel.h"
774 #include "esl_getopts.h"
775 #include "esl_random.h"
776
777 static ESL_OPTIONS options[] = {
778 /* name type default env range toggles reqs incomp help docgroup*/
779 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
780 { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
781 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
782 };
783 static char usage[] = "[-options]";
784 static char banner[] = "test driver for varint module";
785
786
787 int
main(int argc,char ** argv)788 main(int argc, char **argv)
789 {
790 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
791 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
792
793 fprintf(stderr, "## %s\n", argv[0]);
794 fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
795
796 utest_expgol(rng);
797 utest_rice (rng);
798 utest_delta (rng);
799 utest_google(rng);
800
801 fprintf(stderr, "# status = ok\n");
802
803 esl_randomness_Destroy(rng);
804 esl_getopts_Destroy(go);
805 return eslOK;
806 }
807 #endif // eslVARINT_TESTDRIVE
808
809 /*****************************************************************
810 * 7. Example
811 *****************************************************************/
812 #ifdef eslVARINT_EXAMPLE
813
814 #include "esl_getopts.h"
815
816 /* `./esl_varint` or `./esl_varint --tbl` (--tbl is default)
817 * makes tables of int v | codelen | codeword, as used in documentation.
818 * Markdown table format.
819 * `./esl_varint -1 --fig1`
820 * makes xy data file of value vs. codelen, XMGRACE XY format.
821 * `./esl_varint -1 --fig2`
822 * makes xy data file of value vs. implicit probability, XMGRACE XY format.
823 *
824 * The -1 option is because when we encode runlengths, we never need
825 * to encode 0. We get shorter codes if we offset our integer
826 * runlength by -1 before encoding it.
827 */
828
829 #define CODEOPTS "--expgol,--rice,--google,--delta"
830 #define OUTOPTS "--tbl,--fig1,--fig2"
831
832 static ESL_OPTIONS options[] = {
833 /* name type default env range toggles reqs incomp help docgroup*/
834 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
835 { "-k", eslARG_INT, "2", NULL, NULL, NULL, NULL,"--delta", "k parameter for an encoding", 0},
836 { "-L", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "only show lengths, not code (for large -n)",0},
837 { "-n", eslARG_INT, "200", NULL, NULL, NULL, NULL, NULL, "show table for encoding of up to v=<n>", 0},
838 { "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "format table for LaTeX tabular", 0},
839 { "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL,"--delta", "encode v-1 (i.e. 1..n not 0..n)", 0},
840 { "--expgol", eslARG_NONE,"default",NULL, NULL, CODEOPTS, NULL, NULL, "Use exp-Golomb-k encoding", 0},
841 { "--rice", eslARG_NONE, FALSE, NULL, NULL, CODEOPTS, NULL, NULL, "Use Golomb-Rice-k encoding", 0},
842 { "--delta", eslARG_NONE, FALSE, NULL, NULL, CODEOPTS, NULL, NULL, "Use Elias delta encoding", 0},
843 { "--google", eslARG_NONE, FALSE, NULL, NULL, CODEOPTS, NULL, NULL, "Use Varint-k encoding", 0},
844 { "--tbl", eslARG_NONE,"default",NULL, NULL, OUTOPTS, NULL, NULL, "output table of ints and their codes", 0},
845 { "--fig1", eslARG_NONE, FALSE, NULL, NULL, OUTOPTS, NULL, NULL, "output XY fig data for v vs codelen", 0},
846 { "--fig2", eslARG_NONE, FALSE, NULL, NULL, OUTOPTS, NULL, NULL, "output XY fig data for v vs implicit prob", 0},
847 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
848 };
849
850
851 static char banner[] = "table and figure data for varint prefix codes";
852 static char usage[] = "[-options]";
853
854 static void
dump_uint64(FILE * fp,uint64_t code,int n,int fieldwidth)855 dump_uint64(FILE *fp, uint64_t code, int n, int fieldwidth)
856 {
857 ESL_DASSERT1(( n > 0 ));
858 ESL_DASSERT1(( fieldwidth > 0 ));
859 int i;
860
861 fprintf(fp, "%*s", fieldwidth-n, ""); // i.e. right justification of the output
862 for (i = n-1; i >= 0; i--)
863 putc( (code & (1ull<<i)) ? '1' : '0', fp); // note the 1ull, not 1; else C99 will try to do this in a 32bit int and fail.
864 }
865
866 int
main(int argc,char ** argv)867 main(int argc, char **argv)
868 {
869 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
870 int k = esl_opt_GetInteger(go, "-k");
871 int vmin = esl_opt_GetBoolean(go, "--delta") ? 1 : 0; // All codes except Elias delta code start with 0.
872 int vmax = esl_opt_GetInteger(go, "-n");
873 int do_offset = esl_opt_GetBoolean(go, "-1");
874 int v;
875 uint64_t code;
876 int n;
877 int do_encoding = esl_opt_GetBoolean(go, "--tbl") ? TRUE : FALSE;
878
879 for (v = vmin; v <= vmax; v++)
880 {
881 /* If we're printing the code table, we need the code (obviously); otherwise just retrieve the length.
882 * Golomb-Rice codes have limited range, and blow up a 64-bit codelen limit, and we want to see that in our plots.
883 */
884 if (esl_opt_GetBoolean(go, "--expgol")) esl_varint_expgol(v, k, do_encoding? &code : NULL, &n);
885 else if (esl_opt_GetBoolean(go, "--rice")) esl_varint_rice (v, k, do_encoding? &code : NULL, &n);
886 else if (esl_opt_GetBoolean(go, "--google")) esl_varint_google(v, k, do_encoding? &code : NULL, &n);
887 else if (esl_opt_GetBoolean(go, "--delta")) esl_varint_delta (v, do_encoding? &code : NULL, &n);
888
889 if (esl_opt_GetBoolean(go, "--tbl"))
890 {
891 printf("| %10d | %2d | ", do_offset ? v+1 : v, n);
892 dump_uint64(stdout, code, n, 30);
893 printf("\n");
894 }
895 else if (esl_opt_GetBoolean(go, "--fig1")) printf("%d %d\n", do_offset? v+1 : v, n);
896 else if (esl_opt_GetBoolean(go, "--fig2")) printf("%d %g\n", do_offset? v+1 : v, pow(2., -1. * n));
897 }
898
899 if (esl_opt_GetBoolean(go, "--fig1") || esl_opt_GetBoolean(go, "--fig2"))
900 printf("&\n"); // Termination of an XY set in XMGRACE formats
901
902 esl_getopts_Destroy(go);
903 return eslOK;
904 }
905 #endif // eslVARINT_EXAMPLE
906
907
908