1 /* Variable-length binary prefix codes for integers
2  *
3  * Contents:
4  *    1. Exponential Golomb codes
5  *    2. Golomb-Rice codes
6  *    3. Elias delta code
7  *    4. Google varint codes
8  *    5. Unit tests
9  *    6. Test driver
10  *    7. Example
11  *
12  * See also:
13  *    esl_huffman : Huffman coding
14  */
15 #include "esl_config.h"
16 
17 #include <stdio.h>
18 #include <limits.h>
19 
20 #include "easel.h"
21 #include "esl_varint.h"
22 
23 /*****************************************************************
24  * 1. Exponential Golomb codes
25  *****************************************************************/
26 
27 /* Function:  esl_varint_expgol()
28  * Synopsis:  Generalized exponential Golomb coding
29  * Incept:    SRE, Wed 09 Jan 2019
30  *
31  * Purpose:   Encode nonnegative integer <v> in exponential-Golomb-<k>
32  *            code.  $k=0$ is standard exponential Golomb code; $k>0$
33  *            are the generalized Golomb codes. Optionally return codeword in
34  *            low-order bits of <*opt_code> (i.e. right flush), and
35  *            its length in <*opt_n>.
36  *
37  * Args:      v        : integer to encode; v >= 0
38  *            k        : parameter for which exp-Golomb code; k >= 0
39  *            opt_code : optRETURN: encoding, right flushed (low order bits)
40  *            opt_n    : optRETURN: length of encoding in bits
41  *
42  * Returns:   <eslOK> on success.
43  *
44  * Throws:    <eslERANGE> if caller requested <*opt_code> to be returned,
45  *            but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
46  *            and <*opt_n> are set to 0.
47  */
48 int
esl_varint_expgol(int v,int k,uint64_t * opt_code,int * opt_n)49 esl_varint_expgol(int v, int k, uint64_t *opt_code, int *opt_n)
50 {
51   ESL_DASSERT1(( v >= 0 && k >= 0 ));
52 
53   uint64_t m    = (1 << k);  // 2^k.                                                      For k=0 standard expgol: m=1,
54   uint64_t q    = v / m;     // \floor (v / 2^k) piece, to be encoded in exp-golomb-0      ... q=v, all bits here,
55   uint64_t r    = v % m;     // will become rightmost bits                                 ... r=0, no bits stored
56   int      n    = 0;
57   uint64_t code = q+1;
58   int      status;
59 
60   // q stored in exp-golomb-0.  We already set low order bits to q+1; now we just need to determine width of q+1.
61   q = q+1;
62   while (q) { q >>= 1; n++; }   // Let a = # of bits needed to represent q+1
63   n = 2 * n - 1;                // now a-1 leading bits are 0, followed by a bits of q+1, for an exp-golomb-0 code.
64 
65   if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "exponential Golomb codeword length > 64");
66 
67   // r stored in its binary representation, k bits.
68   code = (code << k) | r;
69   n   += k;
70 
71   if (opt_code) *opt_code = code;
72   if (opt_n)    *opt_n    = n;
73   return eslOK;
74 
75  ERROR:
76   if (opt_code) *opt_code = 0;
77   if (opt_n)    *opt_n    = 0;
78   return status;
79 }
80 
81 
82 /* Function:  esl_varint_expgol_decode()
83  * Synopsis:  Decode an exponential Golomb codeword.
84  * Incept:    SRE, Thu 10 Jan 2019 [Slaid Cleaves, Still Fighting the War]
85  *
86  * Purpose:   Decode the leading (high order) prefix bits of <code>
87  *            according to exponential-Golomb-<k> code. Optionally
88  *            return the decoded integer value in <*opt_v>, and the
89  *            length of the decoded codeword in <*opt_n>.
90  *
91  *            Note that <code> is _left flush_ for decoding: the
92  *            prefix codeword is decoded from the leftmost, most
93  *            significant bit.
94  *
95  *            If <code> consists of only 0's, and therefore has no
96  *            complete codeword, return <eslEOD>. This can be used to
97  *            detect normal end of a bit string.
98  *
99  *            If <code> does contain a 1 bit but does not contain a
100  *            complete codeword, assume it is corrupted somehow,
101  *            and return <eslECORRUPT>.
102  *
103  * Args:      code : bit string to decode, starting from MSB
104  *            k    : parameter for which exp-Golomb code; k >= 0
105  *            opt_v: optRETURN: integer value
106  *            opt_n: optRETURN: length of decoded prefix codeword, in bits
107  *
108  * Returns:   <eslOK> on success.
109  *
110  *            <eslEOD> if <code> is all 0 bits.
111  *            <eslECORRUPT> if at least one 1 bit is set, yet no
112  *            codeword is present. On errors, <*opt_v> and <*opt_n>
113  *            are 0.
114  */
115 int
esl_varint_expgol_decode(uint64_t code,int k,int * opt_v,int * opt_n)116 esl_varint_expgol_decode(uint64_t code, int k, int *opt_v, int *opt_n)
117 {
118   int b,n,r,v;
119   int status;
120 
121   if (! code) { status = eslEOD; goto ERROR; }   // all exp-Golomb codewords contain at least one 1
122 
123   for (b = 63; b >= 0; b--)
124     if (code & (1ull << b)) break;  // now b is position of leftmost 1 bit (i.e. MSB); 63..0.
125   n = 2*(64-b)+k-1;                 // ...and from that, we know the size of the codeword.
126   if (n > 64) { status = eslECORRUPT; goto ERROR; } // looks like a code but can't be valid
127 
128   // the n-bit codeword consists of the binary representation of q+1 (with leading zeros),
129   // followed by a k-bit remainder.
130   code >>= 64-n;
131   r    = code & ((1ull << k) - 1);
132   code >>= k;
133   v    = (code-1) * (1ull << k) + r;
134 
135   if (opt_v) *opt_v = v;
136   if (opt_n) *opt_n = n;
137   return eslOK;
138 
139  ERROR:
140   if (opt_v) *opt_v = 0;
141   if (opt_n) *opt_n = 0;
142   return status;
143 }
144 
145 
146 /*****************************************************************
147  * 2. Golomb-Rice codes
148  *****************************************************************/
149 
150 /* Function:  esl_varint_rice()
151  * Synopsis:  Golomb-Rice coding
152  * Incept:    SRE, Sat 12 Jan 2019 [Ramin Djawadi, Westworld, The Maze]
153  *
154  * Purpose:   Encode nonnegative integer <v> in Golomb-Rice-<k> code,
155  *            for $k \geq 0$. Optionally return codeword in low-order bits of
156  *            <*opt_code> (i.e. right flush) and its length in <*opt_n>.
157  *
158  * Args:      v        : integer to encode; v >= 0
159  *            k        : parameter for which Golomb-Rice code; k >= 0
160  *            opt_code : optRETURN: encoding, right flushed (low order bits)
161  *            opt_n    : optRETURN: length of encoding in bits
162  *
163  * Returns:   <eslOK> on success.
164  *
165  * Throws:    <eslERANGE> if caller requested <*opt_code> to be returned,
166  *            but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
167  *            and <*opt_n> are set to 0.
168  */
169 int
esl_varint_rice(int v,int k,uint64_t * opt_code,int * opt_n)170 esl_varint_rice(int v, int k, uint64_t *opt_code, int *opt_n)
171 {
172   ESL_DASSERT1(( v >= 0 && k >= 0));
173   int      m    = (1 << k); // 2^k
174   int      q    = v / m;    // int(v / 2^k), quotient.  code leads with this many 1's; then a 0
175   int      r    = v % m;    // remainder. encoded in k bits
176   uint64_t code = 0;
177   int      status;
178 
179   if (opt_code && q > 64-(k+1)) ESL_XEXCEPTION(eslERANGE, "Golomb-Rice codeword length > 64");
180 
181   code = (1ull << q) - 1;       // sets q low-order bits to 1; we know 2^{q+1} fits in uint64_t
182   code  = code << (1 + k);      // bit k+1 is a 0; low-order k bits hold r
183   code |= r;
184 
185   if (opt_code) *opt_code = code;
186   if (opt_n)    *opt_n    = q + k + 1;
187   return eslOK;
188 
189  ERROR:
190   if (opt_code) *opt_code = 0;
191   if (opt_n)    *opt_n    = 0;
192   return status;
193 }
194 
195 
196 /* Function:  esl_varint_rice_decode()
197  * Synopsis:  Decode a Golomb-Rice codeword
198  * Incept:    SRE, Sat 12 Jan 2019 [Real Madrid v. Leganes]
199  *
200  * Purpose:   Decode the leading (high order) prefix bits of <code>
201  *            according to Golomb-Rice-<k> code. Optionally return the
202  *            decoded integer value in <*opt_v>, and the length
203  *            of the decoded codeword in <*opt_n>.
204  *
205  *            Note that <code> is _left flush_ for decoding:
206  *            the prefix codeword is decoded from the leftmost,
207  *            most significant bit.
208  *
209  *            If <code> does not contain a 0 bit, return <eslEOD>; it
210  *            cannot contain a valid codeword.  A caller can shift on
211  *            a pattern of all 1 bits to signal EOD.
212  *
213  *            There is no check for overflowing <v>. We assume that
214  *            <code> encodes a value that fits in an int.
215  *
216  * Args:      code : bit string to decode, starting from MSB
217  *            k    : parameter for which Google varint code; k >= 2
218  *            opt_v: optRETURN: integer value
219  *            opt_n: optRETURN: length of decoded prefix codeword, in bits
220  *
221  * Returns:   <eslOK> on success.
222  *            <eslEOD> if <code> has no completed codeword.
223  *
224  * Throws:    (no abnormal error conditions)
225  */
226 int
esl_varint_rice_decode(uint64_t code,int k,int * opt_v,int * opt_n)227 esl_varint_rice_decode(uint64_t code, int k, int *opt_v, int *opt_n)
228 {
229   int q;
230   int status;
231 
232   if (~code == 0) { status = eslEOD; goto ERROR; } // valid rice code has at least one 0 bit
233 
234   for (q = 0; q <= 64-k; q++)
235     if ( (~code) & (1ull << (63-q))) break;  // q = # of leading 1 bits. +1, +k: n = q + k + 1.
236 
237   code >>= 64-(q+k+1);  // shift 64 - (q+k+1) bits off to the right, right-flushing the codeword.
238   /*                   q  *    2^k    +    r, obtained by masking low k bits */
239   if (opt_v) *opt_v =  (int) (q * (1ull << k) + (code & ( (1ull << (k+1)) - 1)));
240   if (opt_n) *opt_n =  q + k + 1;
241   return eslOK;
242 
243  ERROR:
244   if (opt_v) *opt_v = 0;
245   if (opt_n) *opt_n = 0;
246   return status;
247 }
248 
249 
250 
251 /*****************************************************************
252  * 3. Elias delta code
253  *****************************************************************/
254 
255 /* Function:  esl_varint_delta
256  * Synopsis:  Elias delta coding
257  * Incept:    SRE, Sat 12 Jan 2019
258  *
259  * Purpose:   Encode positive integer <v> in Elias delta code.
260  *            Optionally return codeword in low-order bits of
261  *            <*opt_code> (i.e. right flush), and its length
262  *            in <*opt_n>.
263  *
264  * Args:      v        : integer to encode; v >= 1
265  *            opt_code : optRETURN: encoding, right flushed (low order bits)
266  *            opt_n    : optRETURN: length of encoding in bits
267  *
268  * Returns:   <eslOK> on success.
269  *
270  * Throws:    <eslERANGE> if caller requested <*opt_code> to be returned,
271  *            but encoding doesn't fit in a single <uint64_t>. Now <*opt_code>
272  *            and <*opt_n> are set to 0.
273  */
274 int
esl_varint_delta(int v,uint64_t * opt_code,int * opt_n)275 esl_varint_delta(int v, uint64_t *opt_code, int *opt_n)
276 {
277   ESL_DASSERT1(( v >= 1 ));
278   int  a = 0;  // \floor log_2 v      2^a is the highest power of 2 in v
279   int  b = 0;  // \floor log_2 (a+1)
280   int  n = 0;
281   uint64_t code = 0;
282   uint64_t tmp;
283   int  status;
284 
285   tmp = ((uint64_t) v     >> 1); while (tmp) { tmp = tmp >> 1; a++; }
286   tmp = ((uint64_t) (a+1) >> 1); while (tmp) { tmp = tmp >> 1; b++; }
287 
288   n =  b;                                               // b leading zeros
289   n += b+1; code = a+1;                                 // b+1 bits, representation of a+1
290   n += a;   code = (code << a) | (v & ((1 << a) - 1));  // a bits, v % 2^a
291 
292   if (opt_code && n > 64) ESL_XEXCEPTION(eslERANGE, "Elias delta codeword length > 64");
293 
294   if (opt_code) *opt_code = code;
295   if (opt_n)    *opt_n    = n;
296   return eslOK;
297 
298  ERROR:
299   if (opt_code) *opt_code = 0;
300   if (opt_n)    *opt_n    = 0;
301   return status;
302 }
303 
304 
305 /* Function:  esl_varint_delta_decode()
306  * Synopsis:  Decode an Elias delta codeword.
307  * Incept:    SRE, Sat 12 Jan 2019
308  *
309  * Purpose:   Decode the leading (high order) prefix bits of <code>
310  *            using Elias delta code. Optionally return the decoded
311  *            integer value in <*opt_v>, and/or the length of the
312  *            decoded codeword in <*opt_n>.
313  *
314  *            Note that <code> is _left flush_ for decoding: the
315  *            prefix codeword is decoded from the leftmost, most
316  *            significant bit.
317  *
318  *            If <code> consists of only 0's, and therefore has
319  *            no complete codeword, return <eslEOD>. This can be
320  *            used to detect the normal end of a bit string.
321  *
322  *            If <code> does contain a 1 bit but does not contain a
323  *            complete codeword, assume it is corrupted somehow, and
324  *            throw <eslECORRUPT>.
325  *
326  * Args:      code : bit string to decode, starting from MSB
327  *            opt_v: optRETURN: integer value
328  *            opt_n: optRETURN: length of decoded prefix codeword, in bits
329  *
330  * Returns:   <eslOK> on success.
331  *
332  *            <eslEOD> if <code> is all 0 bits.
333  *            <eslECORRUPT> if at least one 1 bit is set, yet no codeword
334  *            is present. On errors, <*opt_v> and <*opt_n> are 0.
335  */
336 int
esl_varint_delta_decode(uint64_t code,int * opt_v,int * opt_n)337 esl_varint_delta_decode(uint64_t code, int *opt_v, int *opt_n)
338 {
339   int a,b,n,v, status;
340 
341   if (code == 0) { status = eslEOD; goto ERROR; }
342 
343   for (b = 0; b < 64; b++)
344     if (code & (1ull << (63-b))) break;  // stop on first 1.  b = # of leading 0 bits
345   a = (code >> (63 - 2*b)) - 1;          // After b leading 0's, b+1 bits encode a+1
346   n = 2*b + a + 1;
347 
348   if (n > 64) { status = eslECORRUPT; goto ERROR; }
349 
350   v = (code >> (63 - 2*b -a));           //  ... and a bits encodes v mod 2^a, all but its highest order bit
351   v += (1 << a);                         // put the 2^a bit back
352 
353   // Last a bits encodes v mod 2^a, all but the highest order bit;
354   // shift and mask to get those a bits, then add 2^a back.
355   if (opt_v) *opt_v = (1<<a) + ((code >> (64 - n)) & ( (1<<a)-1));
356   if (opt_n) *opt_n = n;
357   return eslOK;
358 
359  ERROR:
360   if (opt_v) *opt_v = 0;
361   if (opt_n) *opt_n = 0;
362   return status;
363 }
364 
365 
366 
367 
368 /*****************************************************************
369  * 4. Google varint codes
370  *****************************************************************/
371 
372 /* Function:  esl_varint_google()
373  * Synopsis:  Google varint coding
374  * Incept:    SRE, Fri 11 Jan 2019 [Eminem, Lose Yourself]
375  *
376  * Purpose:   Encode nonnegative integer <v> in Google varint-<k> code;
377  *            $k \geq 2$; $k=8$ gives you one-byte codewords, the code
378  *            used by Google Protobuf. Optionally return code in
379  *            low-order bits of <*opt_code> (i.e. right flushed), and
380  *            length of that code in <*opt_n>.
381  *
382  * Args:      v        : integer to encode; v >= 0
383  *            k        : parameter for which Google varint code; k >= 2
384  *            opt_code : optRETURN: encoding, right flushed (low order bits)
385  *            opt_n    : optRETURN: length of encoding in bits
386  *
387  * Returns:   <eslOK> on success.
388  *
389  * Throws:    <eslERANGE> if caller requested <*opt_code> to be returned,
390  *            but binary encoding won't fit in a single
391  *            <uint64_t>. <*opt_code> and <*opt_n> are set to 0.
392  */
393 int
esl_varint_google(int v,int k,uint64_t * opt_code,int * opt_n)394 esl_varint_google(int v, int k, uint64_t *opt_code, int *opt_n)
395 {
396   ESL_DASSERT1(( k >= 2 ));
397   ESL_DASSERT1(( v >= 0 ));
398 
399   int      base = 1 << (k-1);  // 2^(k-1)
400   uint64_t code = 0;
401   int      n    = 0;
402   int      status;
403 
404   code |= v % base;
405   v /= base;
406   n += k;
407   while (v > 0)
408     {
409       code |= (1 << (k-1));
410       code = code << k;
411 
412       code |= v % base;
413       v /= base;
414       n += k;
415     }
416   if (n > 64 && opt_code) ESL_XEXCEPTION(eslERANGE, "google varint code length > 64");
417 
418   if (opt_code) *opt_code = code;
419   if (opt_n)    *opt_n    = n;
420   return eslOK;
421 
422  ERROR:
423   if (opt_code) *opt_code = 0;
424   if (opt_n)    *opt_n    = 0;
425   return status;
426 }
427 
428 
429 /* Function:  esl_varint_google_decode()
430  * Synopsis:  Decode a Google varint codeword
431  * Incept:    SRE, Fri 11 Jan 2019
432  *
433  * Purpose:   Decode the leading (high order) prefix bits of <code>
434  *            according to Google varint-<k> code. Optionally
435  *            return the decoded integer value in <*opt_v>, and the
436  *            length of the decoded codeword in <*opt_n>.
437  *
438  *            Note that <code> is _left flush_ for decoding: the
439  *            prefix codeword is decoded from the leftmost, most
440  *            significant bit.
441  *
442  *            If <code> does not contain a termination codegroup (i.e.
443  *            it consists only of continuation groups that start with
444  *            1), return <eslEOD>. A caller can shift on a pattern of
445  *            all 1 bits to signal EOD.
446  *
447  *            There is no check for overflowing <v>. We assume that
448  *            <code> encodes a value that fits in an int.
449  *
450  * Args:      code : bit string to decode, starting from MSB
451  *            k    : parameter for which Google varint code; k >= 2
452  *            opt_v: optRETURN: integer value
453  *            opt_n: optRETURN: length of decoded prefix codeword, in bits
454  *
455  * Returns:   <eslOK> on success.
456  *            <eslEOD> if <code> has no completed codeword.
457  */
458 int
esl_varint_google_decode(uint64_t code,int k,int * opt_v,int * opt_n)459 esl_varint_google_decode(uint64_t code, int k, int *opt_v, int *opt_n)
460 {
461   uint64_t cmask = 1ull << (k-1);
462   uint64_t vmask = cmask-1;
463   int      v     = 0;
464   uint64_t grp;                     // current k-bit code group
465   int      g;                       // which group (least significant group first)
466   int      status;
467 
468   for (g = 0; g < 64/k; code <<= k, g++)
469     {
470       grp = code >> (64-k);
471       v  += (grp & vmask) << (g*(k-1));
472       if (!(grp & cmask)) break;
473     }
474   if (grp & cmask) { status = eslEOD; goto ERROR; }
475 
476   if (opt_v) *opt_v = v;
477   if (opt_n) *opt_n = (g+1)*k;
478   return eslOK;
479 
480  ERROR:
481   if (opt_v) *opt_v = 0;
482   if (opt_n) *opt_n = 0;
483   return status;
484 }
485 
486 
487 
488 
489 /*****************************************************************
490  * 5. Unit tests
491  *****************************************************************/
492 #ifdef eslVARINT_TESTDRIVE
493 
494 #include "esl_random.h"
495 
496 /* floor_log2()
497  * Returns $w = \lfloor \log_2 v \rfloor$
498  * w is the highest power of 2 in v;
499  * equivalently, w is the MSB set in v
500  *
501  * Example: v=53 ==> 00110101 ==> w=5  (2^5 = 32)
502  */
503 static int
floor_log2(int v)504 floor_log2(int v)
505 {
506   return (v == 0 ? 0 : (int) floor(log2(v)));
507 }
508 
509 /* floor_log2k()
510  * Returns $\lfloor \log_{2^k} v \rfloor$
511  */
512 static int
floor_log2k(int v,int k)513 floor_log2k(int v, int k)
514 {
515   return ( v == 0 ? 0 : (int) floor( log2(v)) / k);
516 }
517 
len_expgol(int v,int k)518 int     len_expgol(int v, int k) { return (2 * floor_log2((v / (1<<k)) + 1)  + 1 + k); }
max_expgol(int b,int k)519 int64_t max_expgol(int b, int k) { return (((1ull << (((b-k-1)/2) + 1)) - 1) << k) - 1;  }
len_rice(int v,int k)520 int     len_rice  (int v, int k) { return (1 + k + (v / (1ull << k))); }
max_rice(int b,int k)521 int     max_rice  (int b, int k) { return ( (1ull << k) * (b-k) - 1); }
len_delta(int v)522 int     len_delta (int v)        { return (floor_log2(v)  + 2 * floor_log2( (floor_log2(v) + 1)) + 1); }
len_google(int v,int k)523 int     len_google(int v, int k) { return ( k * (floor_log2k(v,k-1) + 1)); }
max_google(int b,int k)524 int64_t max_google(int b, int k) { return ((1ull << ((b/k) * (k-1))) - 1); }
525 
526 
527 static void
utest_expgol(ESL_RANDOMNESS * rng)528 utest_expgol(ESL_RANDOMNESS *rng)
529 {
530   char     msg[] = "utest_expgol exponential Golomb tests failed";
531   uint64_t code;
532   int64_t  maxv;
533   int      v, v2;
534   int      n, n2;
535   int      k;
536   int      b;
537   int      i;
538 
539   /* Spot check exponential Golomb (standard, k=0) against
540    * documentation's example table.
541    * 9 encodes as 0001010, len 7, e.g. 0xa
542    */
543   if ( esl_varint_expgol( 9, 0, &code, &n) != eslOK) esl_fatal(msg);
544   if ( code != 0xall || n != 7)                      esl_fatal(msg);
545 
546   /* Also spot check documented exp-Golomb-2 code
547    * 9 encodes as 01101, len 5, e.g. 0xd
548    */
549   if ( esl_varint_expgol( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
550   if ( code != 0xdll || n != 5)                      esl_fatal(msg);
551 
552   /* Systematically check exp-Golomb-k codes from k=0..10 */
553   for (k = 0; k <= 10; k++)
554     {
555       /* Check encoding of v=0..999 */
556       for (v = 0; v < 1000; v++)
557 	{
558 	  if ( esl_varint_expgol(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
559 	  if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
560 	  if (n != len_expgol(v, k)) esl_fatal(msg);
561 	  if (v != v2 || n != n2)    esl_fatal(msg);
562 	}
563 
564       /* Check a bunch of random values on valid range */
565       maxv = ESL_MIN(INT_MAX, max_expgol(64, k));    // this is always INT_MAX
566       for (i = 0; i < 1000; i++)
567 	{
568 	  v = esl_rnd_Roll(rng, (int) maxv);         // 0..maxv-1, anyway.
569 	  if ( esl_varint_expgol(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
570 	  if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
571 	  if (n != len_expgol(v, k)) esl_fatal(msg);
572 	  if (v != v2 || n != n2)    esl_fatal(msg);
573 	}
574 
575       /* Check our documented maxv calculations, and encoding at each max */
576       for (b = k+1; b <= 62; b++)  // 62, because maxv+1 will need larger code, and exp-golomb codes grow in steps of 2
577 	{
578 	  maxv = max_expgol(b, k);
579 	  if (maxv <= INT_MAX)      // check that we can represent maxv in an int
580 	    {
581 	      if ( esl_varint_expgol((int) maxv, k, &code, &n)           != eslOK) esl_fatal(msg);
582 	      if ( esl_varint_expgol_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
583 	      if (n != len_expgol(maxv, k)) esl_fatal(msg);
584 	      if (maxv != v2 || n != n2)    esl_fatal(msg);
585 	    }
586 	  if (maxv < INT_MAX)  // maxv+1 can be coded
587 	    {
588 	      if ( esl_varint_expgol((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
589 	      if (n2 <= n) esl_fatal(msg);
590 	    }
591 	}
592     }
593 }
594 
595 
596 static void
utest_rice(ESL_RANDOMNESS * rng)597 utest_rice(ESL_RANDOMNESS *rng)
598 {
599   char     msg[] = "utest_rice Golomb-Rice tests failed";
600   uint64_t code;
601   int64_t  maxv;
602   int      v, v2;
603   int      n, n2;
604   int      k;
605   int      b;
606   int      i;
607 
608   /* Spot check against documentation's example table for Golomb-Rice-2.
609    * 9 encodes as 11001, len 5, e.g. 0x19
610    */
611   if ( esl_varint_rice( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
612   if ( code != 0x19ll || n != 5)                   esl_fatal(msg);
613 
614   /* Systematically check Golomb-Rice-k codes from k=0..10 */
615   for (k = 0; k <= 10; k++)
616     {
617       /* Check encoding of v=0..1000 */
618       maxv = ESL_MIN(1000, max_rice(64, k));
619       for (v = 0; v <= maxv; v++)
620 	{
621 	  if ( esl_varint_rice(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
622 	  if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
623 	  if (n != len_rice(v, k))   esl_fatal(msg);
624 	  if (v != v2 || n != n2)    esl_fatal(msg);
625 	}
626 
627       /* Check a bunch of random values on valid range */
628       maxv = ESL_MIN(INT_MAX, max_rice(64, k));    // Rice codes have limited range.
629       for (i = 0; i < 1000; i++)
630 	{
631 	  v = esl_rnd_Roll(rng, (int) maxv);         // 0..maxv-1, anyway.
632 	  if ( esl_varint_rice(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
633 	  if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
634 	  if (n != len_rice(v, k))   esl_fatal(msg);
635 	  if (v != v2 || n != n2)    esl_fatal(msg);
636 	}
637 
638       /* Check our documented maxv calculations, and encoding at each max */
639       for (b = k+1; b <= 63; b++)  // 63, because maxv+1 will need larger code, and Golomb-Rice codes grow in steps of 1
640 	{
641 	  maxv = max_rice(b, k);
642 	  if (maxv <= INT_MAX)      // check that we can represent maxv in an int
643 	    {
644 	      if ( esl_varint_rice((int) maxv, k, &code, &n)           != eslOK) esl_fatal(msg);
645 	      if ( esl_varint_rice_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
646 	      if (n != len_rice(maxv, k)) esl_fatal(msg);
647 	      if (maxv != v2 || n != n2)    esl_fatal(msg);
648 	    }
649 	  if (maxv < INT_MAX)  // maxv+1 can be coded
650 	    {
651 	      if ( esl_varint_rice((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
652 	      if (n2 <= n) esl_fatal(msg);
653 	    }
654 	}
655     }
656 }
657 
658 static void
utest_delta(ESL_RANDOMNESS * rng)659 utest_delta(ESL_RANDOMNESS *rng)
660 {
661   char     msg[] = "utest_delta Elias delta tests failed";
662   uint64_t code;
663   int64_t  maxv;
664   int      v, v2;
665   int      n, n2;
666   int      i;
667 
668   /* Spot check against documentation's example table for Elias delta.
669    * 9 encodes as 00100001, len 8, e.g. 0x21
670    */
671   if ( esl_varint_delta( 9, &code, &n) != eslOK) esl_fatal(msg);
672   if ( code != 0x21ll || n != 8)                 esl_fatal(msg);
673 
674   /* Check encoding of v=1..999 */
675   for (v = 1; v < 1000; v++)
676     {
677       if ( esl_varint_delta(v, &code, &n)                    != eslOK) esl_fatal(msg);
678       if ( esl_varint_delta_decode(code << (64-n), &v2, &n2) != eslOK) esl_fatal(msg);
679       if (n != len_delta(v))    esl_fatal(msg);
680       if (v != v2 || n != n2)   esl_fatal(msg);
681     }
682 
683   /* Check a bunch of random values on valid range */
684   maxv = INT_MAX;
685   for (i = 0; i < 1000; i++)
686     {
687       v = 1 + esl_rnd_Roll(rng, (int) maxv);   // 1..INT_MAX
688       if ( esl_varint_delta(v, &code, &n)                    != eslOK) esl_fatal(msg);
689       if ( esl_varint_delta_decode(code << (64-n), &v2, &n2) != eslOK) esl_fatal(msg);
690       if (n != len_delta(v))   esl_fatal(msg);
691       if (v != v2 || n != n2)  esl_fatal(msg);
692     }
693 
694   /* (we didn't document or even bother to analyze maxv given b bits for Elias delta) */
695 }
696 
697 
698 
699 static void
utest_google(ESL_RANDOMNESS * rng)700 utest_google(ESL_RANDOMNESS *rng)
701 {
702   char     msg[] = "utest_google Google varint tests failed";
703   uint64_t code;
704   int      v, v2;
705   int      n, n2;
706   int      k;
707   int      b;
708   int      i;
709   int64_t  maxv;
710 
711   /* Spot check against our documentation's example table:
712    * 9, in google-2, encodes as 11101001, len 8; e.g. 0xe9
713    */
714   if ( esl_varint_google( 9, 2, &code, &n) != eslOK) esl_fatal(msg);
715   if ( code != 0xe9ll)                               esl_fatal(msg);
716   if ( n    != 8)                                    esl_fatal(msg);
717 
718   /* Systematically check all codes k=2..10 */
719   for (k = 2; k <= 10; k++)
720     {
721       /* Check encoding of all values up to 999 */
722       for (v = 0; v < 1000; v++)
723 	{
724 	  if ( esl_varint_google(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
725 	  if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
726 	  if (n != len_google(v, k)) esl_fatal(msg);
727 	  if (v != v2)               esl_fatal(msg);
728 	  if (n != n2)               esl_fatal(msg);
729 	}
730 
731       /* Check a bunch of random values on valid range
732        */
733       maxv = ESL_MIN(INT_MAX, max_google(64, k));  // this calculation is always INT_MAX but its heart is in the right place
734       for (i = 0; i < 1000; i++)
735 	{
736 	  v = esl_rnd_Roll(rng, (int) maxv);       // well, that's 0..maxv-1 anyway. getting 0..maxv, when maxv is INT_MAX, is more trouble than it's worth.
737 	  if ( esl_varint_google(v, k, &code, &n)                    != eslOK) esl_fatal(msg);
738 	  if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
739 	  if (n != len_google(v, k)) esl_fatal(msg);
740 	  if (v != v2)               esl_fatal(msg);
741 	  if (n != n2)               esl_fatal(msg);
742 	}
743 
744       /* Check our maxv calculations, and encoding at each max
745        */
746       for (b = k; b <= 64-k; b++)   // 64-k because maxv+1 needs larger code, and google codes grow in steps of k
747 	{
748 	  maxv = max_google(b, k);
749 	  if (maxv <= INT_MAX)  // check that we can represent maxv in an int
750 	    {
751 	      if ( esl_varint_google((int) maxv, k, &code, &n)           != eslOK) esl_fatal(msg);
752 	      if ( esl_varint_google_decode(code << (64-n), k, &v2, &n2) != eslOK) esl_fatal(msg);
753 	      if (n    != len_google(maxv, k)) esl_fatal(msg);
754 	      if (maxv != v2)                  esl_fatal(msg);
755 	      if (n    != n2)                  esl_fatal(msg);
756 	    }
757 	  if (maxv < INT_MAX)  // and does maxv+1 fit in an int - check that size indeed steps up
758 	    {
759 	      if ( esl_varint_google((int) (maxv+1), k, NULL, &n2) != eslOK) esl_fatal(msg);
760 	      if (n2 <= n) esl_fatal(msg);
761 	    }
762 	}
763     }
764 }
765 #endif // eslVARINT_TESTDRIVE
766 
767 
768 /*****************************************************************
769  * 6. Test driver
770  *****************************************************************/
771 #ifdef eslVARINT_TESTDRIVE
772 
773 #include "easel.h"
774 #include "esl_getopts.h"
775 #include "esl_random.h"
776 
777 static ESL_OPTIONS options[] = {
778   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
779   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
780   { "-s",        eslARG_INT,      "0",  NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
781   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
782 };
783 static char usage[]  = "[-options]";
784 static char banner[] = "test driver for varint module";
785 
786 
787 int
main(int argc,char ** argv)788 main(int argc, char **argv)
789 {
790   ESL_GETOPTS    *go   = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
791   ESL_RANDOMNESS *rng  = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
792 
793   fprintf(stderr, "## %s\n", argv[0]);
794   fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
795 
796   utest_expgol(rng);
797   utest_rice  (rng);
798   utest_delta (rng);
799   utest_google(rng);
800 
801   fprintf(stderr, "#  status = ok\n");
802 
803   esl_randomness_Destroy(rng);
804   esl_getopts_Destroy(go);
805   return eslOK;
806 }
807 #endif // eslVARINT_TESTDRIVE
808 
809 /*****************************************************************
810  * 7. Example
811  *****************************************************************/
812 #ifdef eslVARINT_EXAMPLE
813 
814 #include "esl_getopts.h"
815 
816 /* `./esl_varint` or   `./esl_varint --tbl` (--tbl is default)
817  *     makes tables of int v | codelen | codeword, as used in documentation.
818  *     Markdown table format.
819  * `./esl_varint -1 --fig1`
820  *     makes xy data file of value vs. codelen, XMGRACE XY format.
821  * `./esl_varint -1 --fig2`
822  *     makes xy data file of value vs. implicit probability, XMGRACE XY format.
823  *
824  * The -1 option is because when we encode runlengths, we never need
825  * to encode 0. We get shorter codes if we offset our integer
826  * runlength by -1 before encoding it.
827  */
828 
829 #define CODEOPTS "--expgol,--rice,--google,--delta"
830 #define OUTOPTS  "--tbl,--fig1,--fig2"
831 
832 static ESL_OPTIONS options[] = {
833   /* name          type          default  env range  toggles   reqs   incomp     help                   docgroup*/
834   { "-h",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    NULL,  "show help and usage",                       0},
835   { "-k",          eslARG_INT,      "2", NULL, NULL,     NULL, NULL,"--delta", "k parameter for an encoding",               0},
836   { "-L",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    "-t",  "only show lengths, not code (for large -n)",0},
837   { "-n",          eslARG_INT,    "200", NULL, NULL,     NULL, NULL,    NULL,  "show table for encoding of up to v=<n>",    0},
838   { "-t",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,    NULL,  "format table for LaTeX tabular",            0},
839   { "-1",          eslARG_NONE,   FALSE, NULL, NULL,     NULL, NULL,"--delta", "encode v-1 (i.e. 1..n not 0..n)",           0},
840   { "--expgol",    eslARG_NONE,"default",NULL, NULL, CODEOPTS, NULL,    NULL,  "Use exp-Golomb-k encoding",                 0},
841   { "--rice",      eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Golomb-Rice-k encoding",                0},
842   { "--delta",     eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Elias delta encoding",                  0},
843   { "--google",    eslARG_NONE,   FALSE, NULL, NULL, CODEOPTS, NULL,    NULL,  "Use Varint-k encoding",                     0},
844   { "--tbl",       eslARG_NONE,"default",NULL, NULL,  OUTOPTS, NULL,    NULL,  "output table of ints and their codes",      0},
845   { "--fig1",      eslARG_NONE,   FALSE, NULL, NULL,  OUTOPTS, NULL,    NULL,  "output XY fig data for v vs codelen",       0},
846   { "--fig2",      eslARG_NONE,   FALSE, NULL, NULL,  OUTOPTS, NULL,    NULL,  "output XY fig data for v vs implicit prob", 0},
847   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
848 };
849 
850 
851 static char banner[] = "table and figure data for varint prefix codes";
852 static char usage[]  = "[-options]";
853 
854 static void
dump_uint64(FILE * fp,uint64_t code,int n,int fieldwidth)855 dump_uint64(FILE *fp, uint64_t code, int n, int fieldwidth)
856 {
857   ESL_DASSERT1(( n > 0 ));
858   ESL_DASSERT1(( fieldwidth > 0 ));
859   int      i;
860 
861   fprintf(fp, "%*s", fieldwidth-n, "");  // i.e. right justification of the output
862   for (i = n-1; i >= 0; i--)
863     putc( (code & (1ull<<i)) ? '1' : '0', fp); // note the 1ull, not 1; else C99 will try to do this in a 32bit int and fail.
864 }
865 
866 int
main(int argc,char ** argv)867 main(int argc, char **argv)
868 {
869   ESL_GETOPTS *go        = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
870   int          k         = esl_opt_GetInteger(go, "-k");
871   int          vmin      = esl_opt_GetBoolean(go, "--delta") ? 1 : 0; // All codes except Elias delta code start with 0.
872   int          vmax      = esl_opt_GetInteger(go, "-n");
873   int          do_offset = esl_opt_GetBoolean(go, "-1");
874   int          v;
875   uint64_t     code;
876   int          n;
877   int          do_encoding = esl_opt_GetBoolean(go, "--tbl") ? TRUE : FALSE;
878 
879   for (v = vmin; v <= vmax; v++)
880     {
881       /* If we're printing the code table, we need the code (obviously); otherwise just retrieve the length.
882        * Golomb-Rice codes have limited range, and blow up a 64-bit codelen limit, and we want to see that in our plots.
883        */
884       if      (esl_opt_GetBoolean(go, "--expgol"))  esl_varint_expgol(v, k, do_encoding? &code : NULL, &n);
885       else if (esl_opt_GetBoolean(go, "--rice"))    esl_varint_rice  (v, k, do_encoding? &code : NULL, &n);
886       else if (esl_opt_GetBoolean(go, "--google"))  esl_varint_google(v, k, do_encoding? &code : NULL, &n);
887       else if (esl_opt_GetBoolean(go, "--delta"))   esl_varint_delta (v,    do_encoding? &code : NULL, &n);
888 
889       if (esl_opt_GetBoolean(go, "--tbl"))
890 	{
891 	  printf("| %10d | %2d | ", do_offset ? v+1 : v, n);
892 	  dump_uint64(stdout, code, n, 30);
893 	  printf("\n");
894 	}
895       else if (esl_opt_GetBoolean(go, "--fig1")) printf("%d %d\n", do_offset? v+1 : v, n);
896       else if (esl_opt_GetBoolean(go, "--fig2")) printf("%d %g\n", do_offset? v+1 : v, pow(2., -1. * n));
897     }
898 
899   if (esl_opt_GetBoolean(go, "--fig1") || esl_opt_GetBoolean(go, "--fig2"))
900     printf("&\n");   // Termination of an XY set in XMGRACE formats
901 
902   esl_getopts_Destroy(go);
903   return eslOK;
904 }
905 #endif // eslVARINT_EXAMPLE
906 
907 
908