1 /*
2  * Copyright (c) 2017-2020 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *       Institute nor the names of its contributors may be used to endorse
18  *       or promote products derived from this software without specific
19  *       prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 // As per standard rANS_static but using optional RLE or bit-packing
35 // techniques prior to entropy encoding.  This is a significant
36 // reduction in some data sets.
37 
38 // top bits in order byte
39 #define X_PACK   0x80    // Pack 2,4,8 or infinite symbols into a byte.
40 #define X_RLE    0x40    // Run length encoding with runs & lits encoded separately
41 #define X_CAT    0x20    // Nop; for tiny segments where rANS overhead is too big
42 #define X_NOSZ   0x10    // Don't store the original size; used by STRIPE mode
43 #define X_STRIPE 0x08    // For N-byte integer data; rotate & encode N streams.
44 
45 // FIXME Can we get decoder to return the compressed sized read, avoiding
46 // us needing to store it?  Yes we can.  See c-size comments.  If we added all these
47 // together we could get rans_uncompress_to_4x16 to return the number of bytes
48 // consumed, avoiding the calling code from needed to explicitly stored the size.
49 // However the effect on name tokeniser is to save 0.1 to 0.2% so not worth it.
50 
51 /*-------------------------------------------------------------------------- */
52 /*
53  * Example wrapper to use the rans_byte.h functions included above.
54  *
55  * This demonstrates how to use, and unroll, an order-0 and order-1 frequency
56  * model.
57  */
58 
59 #include "config.h"
60 
61 #include <stdint.h>
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <unistd.h>
65 #include <assert.h>
66 #include <string.h>
67 #include <sys/time.h>
68 #include <limits.h>
69 #include <math.h>
70 #ifndef NO_THREADS
71 #include <pthread.h>
72 #endif
73 
74 #include "rANS_word.h"
75 #include "rANS_static4x16.h"
76 #include "varint.h"
77 #include "pack.h"
78 #include "rle.h"
79 #include "utils.h"
80 
81 #define TF_SHIFT 12
82 #define TOTFREQ (1<<TF_SHIFT)
83 
84 // 9-11 is considerably faster in the O1 variant due to reduced table size.
85 // We auto-tune between 10 and 12 though.  Anywhere from 9 to 14 are viable.
86 #ifndef TF_SHIFT_O1
87 #define TF_SHIFT_O1 12
88 #endif
89 #ifndef TF_SHIFT_O1_FAST
90 #define TF_SHIFT_O1_FAST 10
91 #endif
92 #define TOTFREQ_O1 (1<<TF_SHIFT_O1)
93 #define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST)
94 
95 
96 /*-----------------------------------------------------------------------------
97  * Memory to memory compression functions.
98  *
99  * These are original versions without any manual loop unrolling. They
100  * are easier to understand, but can be up to 2x slower.
101  */
102 
103 // Rounds to next power of 2.
104 // credit to http://graphics.stanford.edu/~seander/bithacks.html
round2(uint32_t v)105 static uint32_t round2(uint32_t v) {
106     v--;
107     v |= v >> 1;
108     v |= v >> 2;
109     v |= v >> 4;
110     v |= v >> 8;
111     v |= v >> 16;
112     v++;
113     return v;
114 }
115 
normalise_freq(uint32_t * F,int size,uint32_t tot)116 static int normalise_freq(uint32_t *F, int size, uint32_t tot) {
117     int m, M, j, loop = 0;
118     uint64_t tr;
119     if (!size)
120 	return 0;
121 
122  again:
123     tr = ((uint64_t)tot<<31)/size + (1<<30)/size;
124 
125     for (size = m = M = j = 0; j < 256; j++) {
126 	if (!F[j])
127 	    continue;
128 
129 	if (m < F[j])
130 	    m = F[j], M = j;
131 
132 	if ((F[j] = (F[j]*tr)>>31) == 0)
133 	    F[j] = 1;
134 	size += F[j];
135     }
136 
137     int adjust = tot - size;
138     if (adjust > 0) {
139 	F[M] += adjust;
140     } else if (adjust < 0) {
141 	if (F[M] > -adjust && (loop == 1 || F[M]/2 >= -adjust)) {
142 	    F[M] += adjust;
143 	} else {
144 	    if (loop < 1) {
145 		loop++;
146 		goto again;
147 	    }
148 	    adjust += F[M]-1;
149 	    F[M] = 1;
150 	    for (j = 0; adjust && j < 256; j++) {
151 		if (F[j] < 2) continue;
152 
153 		int d = F[j] > -adjust;
154 		int m = d ? adjust : 1-F[j];
155 		F[j]   += m;
156 		adjust -= m;
157 	    }
158 	}
159     }
160 
161     //printf("F[%d]=%d\n", M, F[M]);
162     return F[M]>0 ? 0 : -1;
163 }
164 
165 // A specialised version of normalise_freq_shift where the input size
166 // is already normalised to a power of 2, meaning we can just perform
167 // shifts instead of hard to define multiplications and adjustments.
normalise_freq_shift(uint32_t * F,uint32_t size,uint32_t max_tot)168 static void normalise_freq_shift(uint32_t *F, uint32_t size,
169 				 uint32_t max_tot) {
170     if (size == 0 || size == max_tot)
171 	return;
172 
173     int shift = 0, i;
174     while (size < max_tot)
175 	size*=2, shift++;
176 
177     for (i = 0; i < 256; i++)
178 	F[i] <<= shift;
179 }
180 
181 // symbols only
encode_alphabet(uint8_t * cp,uint32_t * F)182 static int encode_alphabet(uint8_t *cp, uint32_t *F) {
183     uint8_t *op = cp;
184     int rle, j;
185 
186     for (rle = j = 0; j < 256; j++) {
187 	if (F[j]) {
188 	    // j
189 	    if (rle) {
190 		rle--;
191 	    } else {
192 		*cp++ = j;
193 		if (!rle && j && F[j-1])  {
194 		    for(rle=j+1; rle<256 && F[rle]; rle++)
195 			;
196 		    rle -= j+1;
197 		    *cp++ = rle;
198 		}
199 		//fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
200 	    }
201 	}
202     }
203     *cp++ = 0;
204 
205     return cp - op;
206 }
207 
decode_alphabet(uint8_t * cp,uint8_t * cp_end,uint32_t * F)208 static int decode_alphabet(uint8_t *cp, uint8_t *cp_end, uint32_t *F) {
209     if (cp == cp_end)
210 	return 0;
211 
212     uint8_t *op = cp;
213     int rle = 0;
214     int j = *cp++;
215     if (cp+2 >= cp_end)
216 	goto carefully;
217 
218     do {
219 	F[j] = 1;
220 	if (!rle && j+1 == *cp) {
221 	    j = *cp++;
222 	    rle = *cp++;
223 	} else if (rle) {
224 	    rle--;
225 	    j++;
226 	    if (j > 255)
227 		return 0;
228 	} else {
229 	    j = *cp++;
230 	}
231     } while(j && cp+2 < cp_end);
232 
233  carefully:
234     if (j) {
235 	do {
236 	    F[j] = 1;
237 	    if(cp >= cp_end) return 0;
238 	    if (!rle && j+1 == *cp) {
239 		if (cp+1 >= cp_end) return 0;
240 		j = *cp++;
241 		rle = *cp++;
242 	    } else if (rle) {
243 		rle--;
244 		j++;
245 		if (j > 255)
246 		    return 0;
247 	    } else {
248 		if (cp >= cp_end) return 0;
249 		j = *cp++;
250 	    }
251 	} while(j && cp < cp_end);
252     }
253 
254     return cp - op;
255 }
256 
encode_freq(uint8_t * cp,uint32_t * F)257 static int encode_freq(uint8_t *cp, uint32_t *F) {
258     uint8_t *op = cp;
259     int j;
260 
261     cp += encode_alphabet(cp, F);
262 
263     for (j = 0; j < 256; j++) {
264 	if (F[j])
265 	    cp += var_put_u32(cp, NULL, F[j]);
266     }
267 
268     return cp - op;
269 }
270 
decode_freq(uint8_t * cp,uint8_t * cp_end,uint32_t * F,uint32_t * fsum)271 static int decode_freq(uint8_t *cp, uint8_t *cp_end, uint32_t *F,
272 		       uint32_t *fsum) {
273     if (cp == cp_end)
274 	return 0;
275 
276     uint8_t *op = cp;
277     cp += decode_alphabet(cp, cp_end, F);
278 
279     int j, tot = 0;
280     for (j = 0; j < 256; j++) {
281 	if (F[j]) {
282 	    cp += var_get_u32(cp, cp_end, (unsigned int *)&F[j]);
283 	    tot += F[j];
284 	}
285     }
286 
287     *fsum = tot;
288     return cp - op;
289 }
290 
291 
292 // Use the order-0 freqs in F0 to encode the order-1 stats in F.
293 // All symbols present in F are present in F0, but some in F0 will
294 // be empty in F.  Thus we run-length encode the 0 frequencies.
encode_freq_d(uint8_t * cp,uint32_t * F0,uint32_t * F)295 static int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
296     uint8_t *op = cp;
297     int j, dz;
298 
299     for (dz = j = 0; j < 256; j++) {
300 	if (F0[j]) {
301 	    if (F[j] != 0) {
302 		if (dz) {
303 		    // Replace dz zeros with zero + dz-1 run length
304 		    cp -= dz-1;
305 		    *cp++ = dz-1;
306 		}
307 		dz = 0;
308 		cp += var_put_u32(cp, NULL, F[j]);
309 	    } else {
310 		//fprintf(stderr, "2: j=%d F0[j]=%d, F[j]=%d, dz=%d\n", j, F0[j], F[j], dz);
311 		dz++;
312 		*cp++ = 0;
313 	    }
314 	} else {
315 	    assert(F[j] == 0);
316 	}
317     }
318 
319     if (dz) {
320 	cp -= dz-1;
321 	*cp++ = dz-1;
322     }
323 
324     return cp - op;
325 }
326 
decode_freq_d(uint8_t * cp,uint8_t * cp_end,uint32_t * F0,uint32_t * F,uint32_t * total)327 static int decode_freq_d(uint8_t *cp, uint8_t *cp_end, uint32_t *F0,
328 			 uint32_t *F, uint32_t *total) {
329     if (cp == cp_end)
330 	return 0;
331 
332     uint8_t *op = cp;
333     int j, dz, T = 0;
334 
335     for (j = dz = 0; j < 256 && cp < cp_end; j++) {
336 	//if (F0[j]) fprintf(stderr, "F0[%d]=%d\n", j, F0[j]);
337 	if (!F0[j])
338 	    continue;
339 
340 	uint32_t f;
341 	if (dz) {
342 	    f = 0;
343 	    dz--;
344 	} else {
345 	    if (cp >= cp_end) return 0;
346 	    cp += var_get_u32(cp, cp_end, &f);
347 	    if (f == 0) {
348 		if (cp >= cp_end) return 0;
349 		dz = *cp++;
350 	    }
351 	}
352 	F[j] = f;
353 	T += f;
354     }
355 
356     if (total) *total = T;
357     return cp - op;
358 }
359 
rans_compress_bound_4x16(unsigned int size,int order)360 unsigned int rans_compress_bound_4x16(unsigned int size, int order) {
361     int N = order>>8;
362     if (!N) N=4;
363 
364     order &= 0xff;
365     int sz = (order == 0
366 	? 1.05*size + 257*3 + 4
367 	: 1.05*size + 257*257*3 + 4 + 257*3+4) +
368 	((order & X_PACK) ? 1 : 0) +
369 	((order & X_RLE) ? 1 + 257*3+4: 0) + 20 +
370 	((order & X_STRIPE) ? 1 + 5*N: 0);
371     return sz + (sz&1) + 2; // make this even so buffers are word aligned
372 }
373 
374 // Compresses in_size bytes from 'in' to *out_size bytes in 'out'.
375 //
376 // NB: The output buffer does not hold the original size, so it is up to
377 // the caller to store this.
378 static
rans_compress_O0_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)379 unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
380 				     unsigned char *out, unsigned int *out_size) {
381     unsigned char *cp, *out_end;
382     RansEncSymbol syms[256];
383     RansState rans0;
384     RansState rans2;
385     RansState rans1;
386     RansState rans3;
387     uint8_t* ptr;
388     uint32_t F[256+MAGIC] = {0};
389     int i, j, tab_size = 0, rle, x;
390     int bound = rans_compress_bound_4x16(in_size,0)-20; // -20 for order/size/meta
391 
392     if (!out) {
393 	*out_size = bound;
394 	out = malloc(*out_size);
395     }
396     if (!out || bound > *out_size)
397 	return NULL;
398 
399     // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
400     // We already added more round in bound to allow for this.
401     if (((size_t)out)&1)
402 	bound--;
403     ptr = out_end = out + bound;
404 
405     if (in_size == 0)
406 	goto empty;
407 
408     // Compute statistics
409     hist8(in, in_size, F);
410 
411     // Normalise so frequences sum to power of 2
412     uint32_t fsum = in_size;
413     uint32_t max_val = round2(fsum);
414     if (max_val > TOTFREQ)
415 	max_val = TOTFREQ;
416 
417     if (normalise_freq(F, fsum, max_val) < 0)
418 	return NULL;
419     fsum=max_val;
420 
421     cp = out;
422     cp += encode_freq(cp, F);
423     tab_size = cp-out;
424     //write(2, out+4, cp-(out+4));
425 
426     if (normalise_freq(F, fsum, TOTFREQ) < 0)
427 	return NULL;
428 
429     // Encode statistics.
430     for (x = rle = j = 0; j < 256; j++) {
431 	if (F[j]) {
432 	    RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
433 	    x += F[j];
434 	}
435     }
436 
437     RansEncInit(&rans0);
438     RansEncInit(&rans1);
439     RansEncInit(&rans2);
440     RansEncInit(&rans3);
441 
442     switch (i=(in_size&3)) {
443     case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
444     case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
445     case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
446     case 0:
447 	break;
448     }
449     for (i=(in_size &~3); i>0; i-=4) {
450 	RansEncSymbol *s3 = &syms[in[i-1]];
451 	RansEncSymbol *s2 = &syms[in[i-2]];
452 	RansEncSymbol *s1 = &syms[in[i-3]];
453 	RansEncSymbol *s0 = &syms[in[i-4]];
454 
455 #if 1
456 	RansEncPutSymbol(&rans3, &ptr, s3);
457 	RansEncPutSymbol(&rans2, &ptr, s2);
458 	RansEncPutSymbol(&rans1, &ptr, s1);
459 	RansEncPutSymbol(&rans0, &ptr, s0);
460 #else
461 	// Slightly beter on gcc, much better on clang
462 	uint16_t *ptr16 = (uint16_t *)ptr;
463 
464 	if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
465 	if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
466 	uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
467 	uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
468 	rans3 += s3->bias + q3 * s3->cmpl_freq;
469 	rans2 += s2->bias + q2 * s2->cmpl_freq;
470 
471 	if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
472 	if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
473 	uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
474 	uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
475 	rans1 += s1->bias + q1 * s1->cmpl_freq;
476 	rans0 += s0->bias + q0 * s0->cmpl_freq;
477 
478 	ptr = (uint8_t *)ptr16;
479 #endif
480     }
481 
482     RansEncFlush(&rans3, &ptr);
483     RansEncFlush(&rans2, &ptr);
484     RansEncFlush(&rans1, &ptr);
485     RansEncFlush(&rans0, &ptr);
486 
487  empty:
488     // Finalise block size and return it
489     *out_size = (out_end - ptr) + tab_size;
490 
491     memmove(out + tab_size, ptr, out_end-ptr);
492 
493     return out;
494 }
495 
496 typedef struct {
497     unsigned char R[TOTFREQ];
498 } ari_decoder;
499 
500 static
rans_uncompress_O0_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int out_sz)501 unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
502 				       unsigned char *out, unsigned int out_sz) {
503     if (in_size < 16) // 4-states at least
504 	return NULL;
505 
506     if (out_sz >= INT_MAX)
507 	return NULL; // protect against some overflow cases
508 
509 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
510     if (out_sz > 100000)
511 	return NULL;
512 #endif
513 
514     /* Load in the static tables */
515     unsigned char *cp = in, *out_free = NULL;
516     unsigned char *cp_end = in + in_size - 8; // within 8 => be extra safe
517     int i, j;
518     unsigned int x, y;
519     uint16_t sfreq[TOTFREQ+32];
520     uint16_t sbase[TOTFREQ+32]; // faster to use 32-bit on clang
521     uint8_t  ssym [TOTFREQ+64]; // faster to use 16-bit on clang
522 
523     if (!out)
524 	out_free = out = malloc(out_sz);
525     if (!out)
526 	return NULL;
527 
528     // Precompute reverse lookup of frequency.
529     uint32_t F[256] = {0}, fsum;
530     int fsz = decode_freq(cp, cp_end, F, &fsum);
531     if (!fsz)
532 	goto err;
533     cp += fsz;
534 
535     normalise_freq_shift(F, fsum, TOTFREQ);
536 
537     // Build symbols; fixme, do as part of decode, see the _d variant
538     for (j = x = 0; j < 256; j++) {
539 	if (F[j]) {
540 	    if (F[j] > TOTFREQ - x)
541 		goto err;
542 	    for (y = 0; y < F[j]; y++) {
543 		ssym [y + x] = j;
544 		sfreq[y + x] = F[j];
545 		sbase[y + x] = y;
546 	    }
547 	    x += F[j];
548 	}
549     }
550 
551     if (x != TOTFREQ)
552 	goto err;
553 
554     if (cp+16 > cp_end+8)
555 	goto err;
556 
557     RansState R[4];
558     RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto err;
559     RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto err;
560     RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto err;
561     RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto err;
562 
563 // Simple version is comparable to below, but only with -O3
564 //
565 //    for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
566 //        for(j=0; j<8;j++) {
567 //	    RansState m = RansDecGet(&R[j%4], TF_SHIFT);
568 //	    R[j%4] = sfreq[m] * (R[j%4] >> TF_SHIFT) + sbase[m];
569 //	    out[i+j] = ssym[m];
570 //	    RansDecRenorm(&R[j%4], &cp);
571 //        }
572 //    }
573 
574     for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
575 	for (j = 0; j < 8; j+=4) {
576 	    RansState m0 = RansDecGet(&R[0], TF_SHIFT);
577 	    RansState m1 = RansDecGet(&R[1], TF_SHIFT);
578 	    R[0] = sfreq[m0] * (R[0] >> TF_SHIFT) + sbase[m0];
579 	    R[1] = sfreq[m1] * (R[1] >> TF_SHIFT) + sbase[m1];
580 
581 	    RansDecRenorm(&R[0], &cp);
582 	    RansDecRenorm(&R[1], &cp);
583 
584 	    out[i+j+0] = ssym[m0];
585 	    out[i+j+1] = ssym[m1];
586 
587 	    RansState m3 = RansDecGet(&R[2], TF_SHIFT);
588 	    RansState m4 = RansDecGet(&R[3], TF_SHIFT);
589 
590 	    R[2] = sfreq[m3] * (R[2] >> TF_SHIFT) + sbase[m3];
591 	    R[3] = sfreq[m4] * (R[3] >> TF_SHIFT) + sbase[m4];
592 
593 	    out[i+j+2] = ssym[m3];
594 	    out[i+j+3] = ssym[m4];
595 
596 	    RansDecRenorm(&R[2], &cp);
597 	    RansDecRenorm(&R[3], &cp);
598 	}
599     }
600 
601     // remainder
602     for (; i < out_sz; i++) {
603         RansState m = RansDecGet(&R[i%4], TF_SHIFT);
604 	R[i%4] = sfreq[m] * (R[i%4] >> TF_SHIFT) + sbase[m];
605 	out[i] = ssym[m];
606 	RansDecRenormSafe(&R[i%4], &cp, cp_end+8);
607     }
608 
609     //fprintf(stderr, "    0 Decoded %d bytes\n", (int)(cp-in)); //c-size
610 
611     return out;
612 
613  err:
614     free(out_free);
615     return NULL;
616 }
617 
618 //-----------------------------------------------------------------------------
619 
fast_log(double a)620 double fast_log(double a) {
621   union { double d; long long x; } u = { a };
622   return (u.x - 4606921278410026770) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */
623 }
624 
625 // Compute the entropy of 12-bit vs 10-bit frequency tables.
626 // 10 bit means smaller memory footprint when decoding and
627 // more speed due to cache hits, but it *may* be a poor
628 // compression fit.
compute_shift(uint32_t * F0,uint32_t (* F)[256],uint32_t * T,int * S)629 static int compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
630 			 int *S) {
631     int i, j;
632 
633     double e10 = 0, e12 = 0;
634     int max_tot = 0;
635     for (i = 0; i < 256; i++) {
636 	if (F0[i] == 0)
637 	    continue;
638 	int max_val = round2(T[i]);
639 	int ns = 0;
640 #define MAX(a,b) ((a)>(b)?(a):(b))
641 
642 	// Number of samples that get their freq bumped to 1
643 	int sm10 = 0, sm12 = 0;
644 	for (j = 0; j < 256; j++) {
645 	    if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1_FAST)
646 		sm10++;
647 	    if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1)
648 		sm12++;
649 	}
650 
651 	double l10 = log(TOTFREQ_O1_FAST + sm10);
652 	double l12 = log(TOTFREQ_O1      + sm12);
653 
654 	for (j = 0; j < 256; j++) {
655 	    if (F[i][j]) {
656 		ns++;
657 
658 		int x = (double)TOTFREQ_O1_FAST * F[i][j]/T[i];
659 		e10 -= F[i][j] * (fast_log(MAX(x,1)) - l10);
660 
661 		x = (double)TOTFREQ_O1 * F[i][j]/T[i];
662 		e12 -= F[i][j] * (fast_log(MAX(x,1)) - l12);
663 
664 		// Estimation of compressedf symbol freq table too.
665 		e10 += 4;
666 		e12 += 6;
667 	    }
668 	}
669 
670 	// Order-1 frequencies often end up totalling under TOTFREQ.
671 	// In this case it's smaller to output the real frequencies
672 	// prior to normalisation and normalise after (with an extra
673 	// normalisation step needed in the decoder too).
674 	//
675 	// Thus we normalise to a power of 2 only, store those,
676 	// and renormalise later here (and in decoder) by bit-shift
677 	// to get to the fixed size.
678 	if (ns < 64 && max_val > 128) max_val /= 2;
679 	if (max_val > 1024)           max_val /= 2;
680 	if (max_val > TOTFREQ_O1)     max_val = TOTFREQ_O1;
681 	S[i] = max_val; // scale to max this
682 	if (max_tot < max_val)
683 	    max_tot = max_val;
684     }
685     int shift = e10/e12 < 1.01 || max_tot <= TOTFREQ_O1_FAST ? TF_SHIFT_O1_FAST : TF_SHIFT_O1;
686 
687 //    fprintf(stderr, "e10/12 = %f %f %f, shift %d\n",
688 //    	    e10/log(256), e12/log(256), e10/e12, shift);
689 
690     return shift;
691 }
692 
693 
694 static
rans_compress_O1_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)695 unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size,
696 				     unsigned char *out, unsigned int *out_size) {
697     unsigned char *cp, *out_end, *op;
698     unsigned int tab_size;
699     RansEncSymbol syms[256][256];
700     int bound = rans_compress_bound_4x16(in_size,1)-20; // -20 for order/size/meta
701 
702     if (!out) {
703 	*out_size = bound;
704 	out = malloc(*out_size);
705     }
706     if (!out || bound > *out_size)
707 	return NULL;
708 
709     if (((size_t)out)&1)
710 	bound--;
711     out_end = out + bound;
712 
713     uint32_t F[256][256] = {{0}}, T[256+MAGIC] = {0};
714     int i, j;
715 
716     //memset(F, 0, 256*256*sizeof(int));
717     //memset(T, 0, 256*sizeof(int));
718 
719     hist1_4(in, in_size, F, T);
720     F[0][in[1*(in_size>>2)]]++;
721     F[0][in[2*(in_size>>2)]]++;
722     F[0][in[3*(in_size>>2)]]++;
723     T[0]+=3;
724 
725     op = cp = out;
726     *cp++ = 0; // uncompressed header marker
727 
728     // Encode the order-0 symbols for use in the order-1 frequency tables
729     uint32_t F0[256+MAGIC] = {0};
730     present8(in, in_size, F0);
731     F0[0]=1;
732     cp += encode_alphabet(cp, F0);
733 
734     // Decide between 10-bit and 12-bit freqs.
735     // Fills out S[] to hold the new scaled maximum value.
736     int S[256] = {0};
737     int shift = compute_shift(F0, F, T, S);
738 
739     // Normalise so T[i] == TOTFREQ_O1
740     for (i = 0; i < 256; i++) {
741 	unsigned int x;
742 
743 	if (F0[i] == 0)
744 	    continue;
745 
746 	int max_val = S[i];
747 	if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
748 	    max_val = TOTFREQ_O1_FAST;
749 
750 	if (normalise_freq(F[i], T[i], max_val) < 0)
751 	    return NULL;
752 	T[i]=max_val;
753 
754 	cp += encode_freq_d(cp, F0, F[i]);
755 
756 	normalise_freq_shift(F[i], T[i], 1<<shift); T[i]=1<<shift;
757 
758 	uint32_t *F_i_ = F[i];
759 	for (x = j = 0; j < 256; j++) {
760 	    RansEncSymbolInit(&syms[i][j], x, F_i_[j], shift);
761 	    x += F_i_[j];
762 	}
763 
764     }
765 
766     *op = shift<<4;
767     if (cp - op > 1000) {
768 	// try rans0 compression of header
769 	unsigned int u_freq_sz = cp-(op+1);
770 	unsigned int c_freq_sz;
771 	unsigned char *c_freq = rans_compress_O0_4x16(op+1, u_freq_sz, NULL, &c_freq_sz);
772 	if (c_freq && c_freq_sz + 6 < cp-op) {
773 	    *op++ |= 1; // compressed
774 	    op += var_put_u32(op, NULL, u_freq_sz);
775 	    op += var_put_u32(op, NULL, c_freq_sz);
776 	    memcpy(op, c_freq, c_freq_sz);
777 	    cp = op+c_freq_sz;
778 	}
779 	free(c_freq);
780     }
781 
782     //write(2, out+4, cp-(out+4));
783     tab_size = cp - out;
784     assert(tab_size < 257*257*3);
785 
786     RansState rans0, rans1, rans2, rans3;
787     RansEncInit(&rans0);
788     RansEncInit(&rans1);
789     RansEncInit(&rans2);
790     RansEncInit(&rans3);
791 
792     uint8_t* ptr = out_end;
793 
794     int isz4 = in_size>>2;
795     int i0 = 1*isz4-2;
796     int i1 = 2*isz4-2;
797     int i2 = 3*isz4-2;
798     int i3 = 4*isz4-2;
799 
800     unsigned char l0 = in[i0+1];
801     unsigned char l1 = in[i1+1];
802     unsigned char l2 = in[i2+1];
803     unsigned char l3 = in[i3+1];
804 
805     // Deal with the remainder
806     l3 = in[in_size-1];
807     for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
808 	unsigned char c3 = in[i3];
809 	RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
810 	l3 = c3;
811     }
812 
813     for (; i0 >= 0; i0--, i1--, i2--, i3--) {
814 	unsigned char c0, c1, c2, c3;
815 	RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
816 	RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
817 	RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
818 	RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
819 
820 	RansEncPutSymbol(&rans3, &ptr, s3);
821 	RansEncPutSymbol(&rans2, &ptr, s2);
822 	RansEncPutSymbol(&rans1, &ptr, s1);
823 	RansEncPutSymbol(&rans0, &ptr, s0);
824 
825 	l0 = c0;
826 	l1 = c1;
827 	l2 = c2;
828 	l3 = c3;
829     }
830 
831     RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
832     RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
833     RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
834     RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
835 
836     RansEncFlush(&rans3, &ptr);
837     RansEncFlush(&rans2, &ptr);
838     RansEncFlush(&rans1, &ptr);
839     RansEncFlush(&rans0, &ptr);
840 
841     *out_size = (out_end - ptr) + tab_size;
842 
843     cp = out;
844     memmove(out + tab_size, ptr, out_end-ptr);
845 
846     return out;
847 }
848 
849 #ifndef NO_THREADS
850 /*
851  * Thread local storage per thread in the pool.
852  */
853 pthread_once_t rans_once = PTHREAD_ONCE_INIT;
854 pthread_key_t rans_key;
855 
rans_tls_init(void)856 static void rans_tls_init(void) {
857     pthread_key_create(&rans_key, free);
858 }
859 #endif
860 
861 //#define MAGIC2 111
862 #define MAGIC2 179
863 //#define MAGIC2 0
864 typedef struct {
865     uint16_t f;
866     uint16_t b;
867 } fb_t;
868 
869 static
rans_uncompress_O1_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int out_sz)870 unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
871 				       unsigned char *out, unsigned int out_sz) {
872     if (in_size < 16) // 4-states at least
873 	return NULL;
874 
875     if (out_sz >= INT_MAX)
876 	return NULL; // protect against some overflow cases
877 
878 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
879     if (out_sz > 100000)
880 	return NULL;
881 #endif
882 
883     /* Load in the static tables */
884     unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
885     unsigned char *c_freq = NULL;
886     int i, j = -999;
887     unsigned int x;
888 
889 #ifndef NO_THREADS
890     /*
891      * The calloc below is expensive as it's a large structure.  We
892      * could use malloc, but we're only initialising parts of the structure
893      * that we need to, as dictated by the frequency table.  This is far
894      * faster than initialising everything (ie malloc+memset => calloc).
895      * Not initialising the data means malformed input with mismatching
896      * frequency tables to actual data can lead to accessing of the
897      * uninitialised sfb table and in turn potential leakage of the
898      * uninitialised memory returned by malloc.  That could be anything at
899      * all, including important encryption keys used within a server (for
900      * example).
901      *
902      * However (I hope!) we don't care about leaking about the sfb symbol
903      * frequencies previously computed by an earlier execution of *this*
904      * code.  So calloc once and reuse is the fastest alternative.
905      *
906      * We do this through pthread local storage as we don't know if this
907      * code is being executed in many threads simultaneously.
908      */
909     pthread_once(&rans_once, rans_tls_init);
910 
911     uint8_t *sfb_ = pthread_getspecific(rans_key);
912     if (!sfb_) {
913 	sfb_ = calloc(256*(TOTFREQ_O1+MAGIC2), sizeof(*sfb_));
914 	pthread_setspecific(rans_key, sfb_);
915     }
916 #else
917     uint8_t *sfb_ = calloc(256*(TOTFREQ_O1+MAGIC2), sizeof(*sfb_));
918 #endif
919 
920     if (!sfb_)
921 	return NULL;
922     fb_t fb[256][256];
923     uint8_t *sfb[256];
924     if ((*cp >> 4) == TF_SHIFT_O1) {
925 	for (i = 0; i < 256; i++)
926 	    sfb[i]=  sfb_ + i*(TOTFREQ_O1+MAGIC2);
927     } else {
928 	for (i = 0; i < 256; i++)
929 	    sfb[i]=  sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2);
930     }
931 
932     if (!out)
933 	out_free = out = malloc(out_sz);
934 
935     if (!out)
936 	goto err;
937 
938     //fprintf(stderr, "out_sz=%d\n", out_sz);
939 
940     // compressed header? If so uncompress it
941     unsigned char *tab_end = NULL;
942     unsigned char *c_freq_end = cp_end;
943     unsigned int shift = *cp >> 4;
944     if (*cp++ & 1) {
945 	uint32_t u_freq_sz, c_freq_sz;
946 	cp += var_get_u32(cp, cp_end, &u_freq_sz);
947 	cp += var_get_u32(cp, cp_end, &c_freq_sz);
948 	if (c_freq_sz >= cp_end - cp - 16)
949 	    goto err;
950 	tab_end = cp + c_freq_sz;
951 	if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, u_freq_sz)))
952 	    goto err;
953 	cp = c_freq;
954 	c_freq_end = c_freq + u_freq_sz;
955     }
956 
957     // Decode order-0 symbol list; avoids needing in order-1 tables
958     uint32_t F0[256] = {0};
959     int fsz = decode_alphabet(cp, c_freq_end, F0);
960     if (!fsz)
961 	goto err;
962     cp += fsz;
963 
964     if (cp >= c_freq_end)
965 	goto err;
966 
967     for (i = 0; i < 256; i++) {
968 	if (F0[i] == 0)
969 	    continue;
970 
971 	uint32_t F[256] = {0}, T = 0;
972 	fsz = decode_freq_d(cp, c_freq_end, F0, F, &T);
973 	if (!fsz)
974 	    goto err;
975 	cp += fsz;
976 
977 	if (!T) {
978 	    //fprintf(stderr, "No freq for F_%d\n", i);
979 	    continue;
980 	}
981 
982 	normalise_freq_shift(F, T, 1<<shift);
983 
984 	// Build symbols; fixme, do as part of decode, see the _d variant
985 	for (j = x = 0; j < 256; j++) {
986 	    if (F[j]) {
987 		if (F[j] > (1<<shift) - x)
988 		    goto err;
989 
990 		memset(&sfb[i][x], j, F[j]);
991 		fb[i][j].f = F[j];
992 		fb[i][j].b = x;
993 		x += F[j];
994 	    }
995 	}
996 	if (x != (1<<shift))
997 	    goto err;
998     }
999 
1000     if (tab_end)
1001 	cp = tab_end;
1002     free(c_freq);
1003     c_freq = NULL;
1004 
1005     if (cp+16 > cp_end)
1006 	goto err;
1007 
1008     RansState rans0, rans1, rans2, rans3;
1009     uint8_t *ptr = cp, *ptr_end = in + in_size - 8;
1010     RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto err;
1011     RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto err;
1012     RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto err;
1013     RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto err;
1014 
1015     unsigned int isz4 = out_sz>>2;
1016     int l0 = 0, l1 = 0, l2 = 0, l3 = 0;
1017     unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
1018 
1019     RansState R[4];
1020     R[0] = rans0;
1021     R[1] = rans1;
1022     R[2] = rans2;
1023     R[3] = rans3;
1024 
1025     // Around 15% faster to specialise for 10/12 than to have one
1026     // loop with shift as a variable.
1027     if (shift == TF_SHIFT_O1) {
1028 	// TF_SHIFT_O1 = 12
1029 
1030 	const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
1031 	for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
1032 	    uint16_t m, c;
1033 	    c = sfb[l0][m = R[0] & mask];
1034 	    R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1) + m - fb[l0][c].b;
1035 	    out[i4[0]] = l0 = c;
1036 
1037 	    c = sfb[l1][m = R[1] & mask];
1038 	    R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1) + m - fb[l1][c].b;
1039 	    out[i4[1]] = l1 = c;
1040 
1041 	    c = sfb[l2][m = R[2] & mask];
1042 	    R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1) + m - fb[l2][c].b;
1043 	    out[i4[2]] = l2 = c;
1044 
1045 	    c = sfb[l3][m = R[3] & mask];
1046 	    R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1) + m - fb[l3][c].b;
1047 	    out[i4[3]] = l3 = c;
1048 
1049 	    if (ptr < ptr_end) {
1050 		RansDecRenorm(&R[0], &ptr);
1051 		RansDecRenorm(&R[1], &ptr);
1052 		RansDecRenorm(&R[2], &ptr);
1053 		RansDecRenorm(&R[3], &ptr);
1054 	    } else {
1055 		RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
1056 		RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
1057 		RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
1058 		RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
1059 	    }
1060 	}
1061 
1062 	// Remainder
1063 	for (; i4[3] < out_sz; i4[3]++) {
1064 	    uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1)-1);
1065 	    unsigned char c3 = sfb[l3][m3];
1066 	    out[i4[3]] = c3;
1067 	    R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1) + m3 - fb[l3][c3].b;
1068 	    RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
1069 	    l3 = c3;
1070 	}
1071     } else {
1072 	// TF_SHIFT_O1 = 10
1073 	const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
1074 	for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
1075 	    uint16_t m, c;
1076 	    c = sfb[l0][m = R[0] & mask];
1077 	    R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1_FAST) + m - fb[l0][c].b;
1078 	    out[i4[0]] = l0 = c;
1079 
1080 	    c = sfb[l1][m = R[1] & mask];
1081 	    R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1_FAST) + m - fb[l1][c].b;
1082 	    out[i4[1]] = l1 = c;
1083 
1084 	    c = sfb[l2][m = R[2] & mask];
1085 	    R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1_FAST) + m - fb[l2][c].b;
1086 	    out[i4[2]] = l2 = c;
1087 
1088 	    c = sfb[l3][m = R[3] & mask];
1089 	    R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1_FAST) + m - fb[l3][c].b;
1090 	    out[i4[3]] = l3 = c;
1091 
1092 	    if (ptr < ptr_end) {
1093 		RansDecRenorm(&R[0], &ptr);
1094 		RansDecRenorm(&R[1], &ptr);
1095 		RansDecRenorm(&R[2], &ptr);
1096 		RansDecRenorm(&R[3], &ptr);
1097 	    } else {
1098 		RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
1099 		RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
1100 		RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
1101 		RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
1102 	    }
1103 	}
1104 
1105 	// Remainder
1106 	for (; i4[3] < out_sz; i4[3]++) {
1107 	    uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1_FAST)-1);
1108 	    unsigned char c3 = sfb[l3][m3];
1109 	    out[i4[3]] = c3;
1110 	    R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1_FAST) + m3 - fb[l3][c3].b;
1111 	    RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
1112 	    l3 = c3;
1113 	}
1114     }
1115     //fprintf(stderr, "    1 Decoded %d bytes\n", (int)(ptr-in)); //c-size
1116 
1117 #ifdef NO_THREADS
1118     free(sfb_);
1119 #endif
1120     return out;
1121 
1122  err:
1123 #ifdef NO_THREADS
1124     free(sfb_);
1125 #endif
1126     free(out_free);
1127     free(c_freq);
1128 
1129     return NULL;
1130 }
1131 
1132 
1133 /*-----------------------------------------------------------------------------
1134  * Simple interface to the order-0 vs order-1 encoders and decoders.
1135  *
1136  * Smallest is method, <in_size> <input>, so worst case 2 bytes longer.
1137  */
rans_compress_to_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size,int order)1138 unsigned char *rans_compress_to_4x16(unsigned char *in,  unsigned int in_size,
1139 				     unsigned char *out, unsigned int *out_size,
1140 				     int order) {
1141     unsigned int c_meta_len;
1142     uint8_t *meta = NULL, *rle = NULL, *packed = NULL;
1143 
1144     if (!out) {
1145 	*out_size = rans_compress_bound_4x16(in_size, order);
1146 	if (!(out = malloc(*out_size)))
1147 	    return NULL;
1148     }
1149     unsigned char *out_end = out + *out_size;
1150 
1151     if (in_size <= 20)
1152 	order &= ~X_STRIPE;
1153 
1154     if (order & X_STRIPE) {
1155 	int N = (order>>8);
1156 	if (N == 0) N = 4; // default for compatibility with old tests
1157 
1158 	if (N > 255)
1159 	    return NULL;
1160 
1161 	unsigned char *transposed = malloc(in_size);
1162 	unsigned int part_len[256];
1163 	unsigned int idx[256];
1164 	if (!transposed)
1165 	    return NULL;
1166 	int i, j, x;
1167 
1168 	for (i = 0; i < N; i++) {
1169 	    part_len[i] = in_size / N + ((in_size % N) > i);
1170 	    idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index
1171 	}
1172 
1173 	for (i = x = 0; i < in_size-N; i += N, x++) {
1174 	    for (j = 0; j < N; j++)
1175 		transposed[idx[j]+x] = in[i+j];
1176 	}
1177 	for (; i < in_size; i += N, x++) {
1178 	    for (j = 0; i+j < in_size; j++)
1179 		transposed[idx[j]+x] = in[i+j];
1180 	}
1181 
1182 	unsigned int olen2;
1183 	unsigned char *out2, *out2_start;
1184 	c_meta_len = 1;
1185 	*out = order & ~X_NOSZ;
1186 	c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size);
1187 	out[c_meta_len++] = N;
1188 
1189 	out2_start = out2 = out+2+5*N; // shares a buffer with c_meta
1190         for (i = 0; i < N; i++) {
1191             // Brute force try all methods.
1192             int j, m[] = {1,64,128,0}, best_j = 0, best_sz = in_size+10;
1193             for (j = 0; j < 4; j++) {
1194 		if ((order & m[j]) != m[j])
1195                     continue;
1196                 olen2 = *out_size - (out2 - out);
1197                 rans_compress_to_4x16(transposed+idx[i], part_len[i],
1198 				      out2, &olen2, m[j] | X_NOSZ);
1199                 if (best_sz > olen2) {
1200                     best_sz = olen2;
1201                     best_j = j;
1202                 }
1203             }
1204 	    if (best_j != j-1) {
1205 		olen2 = *out_size - (out2 - out);
1206 		rans_compress_to_4x16(transposed+idx[i], part_len[i],
1207 				      out2, &olen2, m[best_j] | X_NOSZ);
1208 	    }
1209             out2 += olen2;
1210             c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2);
1211         }
1212 	memmove(out+c_meta_len, out2_start, out2-out2_start);
1213 	free(transposed);
1214 	*out_size = c_meta_len + out2-out2_start;
1215 	return out;
1216     }
1217 
1218     if (order & X_CAT) {
1219 	out[0] = X_CAT;
1220 	c_meta_len = 1;
1221 	c_meta_len += var_put_u32(&out[1], out_end, in_size);
1222 	memcpy(out+c_meta_len, in, in_size);
1223 	*out_size = c_meta_len + in_size;
1224 	return out;
1225     }
1226 
1227     int do_pack = order & X_PACK;
1228     int do_rle  = order & X_RLE;
1229     int no_size = order & X_NOSZ;
1230 
1231     out[0] = order;
1232     c_meta_len = 1;
1233 
1234     if (!no_size)
1235 	c_meta_len += var_put_u32(&out[1], out_end, in_size);
1236 
1237     order &= 0xf;
1238 
1239     // Format is compressed meta-data, compressed data.
1240     // Meta-data can be empty, pack, rle lengths, or pack + rle lengths.
1241     // Data is either the original data, bit-packed packed, rle literals or
1242     // packed + rle literals.
1243 
1244     if (do_pack && in_size) {
1245 	// PACK 2, 4 or 8 symbols into one byte.
1246 	int pmeta_len;
1247 	uint64_t packed_len;
1248 	packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len);
1249 	if (!packed || (pmeta_len == 1 && out[c_meta_len] > 16)) {
1250 	    out[0] &= ~X_PACK;
1251 	    do_pack = 0;
1252 	    free(packed);
1253 	    packed = NULL;
1254 	} else {
1255 	    in = packed;
1256 	    in_size = packed_len;
1257 	    c_meta_len += pmeta_len;
1258 
1259 	    // Could derive this rather than storing verbatim.
1260 	    // Orig size * 8/nbits (+1 if not multiple of 8/n)
1261 	    int sz = var_put_u32(out+c_meta_len, out_end, in_size);
1262 	    c_meta_len += sz;
1263 	    *out_size -= sz;
1264 	}
1265     } else if (do_pack) {
1266 	out[0] &= ~X_PACK;
1267     }
1268 
1269     if (do_rle && in_size) {
1270 	// RLE 'in' -> rle_length + rle_literals arrays
1271 	unsigned int rmeta_len, c_rmeta_len;
1272 	uint64_t rle_len;
1273 	c_rmeta_len = in_size+257;
1274 	if (!(meta = malloc(c_rmeta_len)))
1275 	    return NULL;
1276 
1277 	uint8_t rle_syms[256];
1278 	int rle_nsyms = 0;
1279 	uint64_t rmeta_len64;
1280 	rle = rle_encode(in, in_size, meta, &rmeta_len64,
1281 			 rle_syms, &rle_nsyms, NULL, &rle_len);
1282 	memmove(meta+1+rle_nsyms, meta, rmeta_len64);
1283 	meta[0] = rle_nsyms;
1284 	memcpy(meta+1, rle_syms, rle_nsyms);
1285 	rmeta_len = rmeta_len64 + rle_nsyms+1;
1286 
1287 	if (!rle || rle_len + rmeta_len >= .99*in_size) {
1288 	    // Not worth the speed hit.
1289 	    out[0] &= ~X_RLE;
1290 	    do_rle = 0;
1291 	    free(rle);
1292 	    rle = NULL;
1293 	} else {
1294 	    // Compress lengths with O0 and literals with O0/O1 ("order" param)
1295 	    int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2;
1296 	    sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len);
1297 	    c_rmeta_len = *out_size - (c_meta_len+sz+5);
1298 	    rans_compress_O0_4x16(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len);
1299 	    if (c_rmeta_len < rmeta_len) {
1300 		sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len);
1301 		memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len);
1302 	    } else {
1303 		// Uncompressed RLE meta-data as too small
1304 		sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2+1);
1305 		sz2 = var_put_u32(out+c_meta_len+sz, out_end, rle_len);
1306 		memcpy(out+c_meta_len+sz+sz2, meta, rmeta_len);
1307 		c_rmeta_len = rmeta_len;
1308 	    }
1309 
1310 	    c_meta_len += sz + sz2 + c_rmeta_len;
1311 
1312 	    in = rle;
1313 	    in_size = rle_len;
1314 	}
1315 
1316 	free(meta);
1317     } else if (do_rle) {
1318 	out[0] &= ~X_RLE;
1319     }
1320 
1321     *out_size -= c_meta_len;
1322     if (order && in_size < 8) {
1323 	out[0] &= ~1;
1324 	order  &= ~1;
1325     }
1326 
1327     if (order == 1)
1328 	rans_compress_O1_4x16(in, in_size, out+c_meta_len, out_size);
1329     else
1330 	rans_compress_O0_4x16(in, in_size, out+c_meta_len, out_size);
1331 
1332     if (*out_size >= in_size) {
1333 	out[0] &= ~3;
1334 	out[0] |= X_CAT | no_size;
1335 	memcpy(out+c_meta_len, in, in_size);
1336 	*out_size = in_size;
1337     }
1338 
1339     free(rle);
1340     free(packed);
1341 
1342     *out_size += c_meta_len;
1343 
1344     return out;
1345 }
1346 
rans_compress_4x16(unsigned char * in,unsigned int in_size,unsigned int * out_size,int order)1347 unsigned char *rans_compress_4x16(unsigned char *in, unsigned int in_size,
1348 				  unsigned int *out_size, int order) {
1349     return rans_compress_to_4x16(in, in_size, NULL, out_size, order);
1350 }
1351 
rans_uncompress_to_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)1352 unsigned char *rans_uncompress_to_4x16(unsigned char *in,  unsigned int in_size,
1353 				       unsigned char *out, unsigned int *out_size) {
1354     unsigned char *in_end = in + in_size;
1355     unsigned char *out_free = NULL, *tmp_free = NULL, *meta_free = NULL;
1356 
1357     if (in_size == 0)
1358 	return NULL;
1359 
1360     if (*in & X_STRIPE) {
1361 	unsigned int ulen, olen, c_meta_len = 1;
1362 	int i;
1363 	uint64_t clen_tot = 0;
1364 
1365 	// Decode lengths
1366 	c_meta_len += var_get_u32(in+c_meta_len, in_end, &ulen);
1367 	if (c_meta_len >= in_size)
1368 	    return NULL;
1369 	unsigned int N = in[c_meta_len++];
1370 	unsigned int clenN[256], ulenN[256], idxN[256];
1371 	if (!out) {
1372 	    if (ulen >= INT_MAX)
1373 		return NULL;
1374 	    if (!(out_free = out = malloc(ulen))) {
1375 		return NULL;
1376 	    }
1377 	    *out_size = ulen;
1378 	}
1379 	if (ulen != *out_size) {
1380 	    free(out_free);
1381 	    return NULL;
1382 	}
1383 
1384 	for (i = 0; i < N; i++) {
1385 	    ulenN[i] = ulen / N + ((ulen % N) > i);
1386 	    idxN[i] = i ? idxN[i-1] + ulenN[i-1] : 0;
1387 	    c_meta_len += var_get_u32(in+c_meta_len, in_end, &clenN[i]);
1388 	    clen_tot += clenN[i];
1389 	    if (c_meta_len > in_size || clenN[i] > in_size || clenN[i] < 1) {
1390 		free(out_free);
1391 		return NULL;
1392 	    }
1393 	}
1394 
1395 	// We can call this with a larger buffer, but once we've determined
1396 	// how much we really use we limit it so the recursion becomes easier
1397 	// to limit.
1398 	if (c_meta_len + clen_tot > in_size) {
1399 	    free(out_free);
1400 	    return NULL;
1401 	}
1402 	in_size = c_meta_len + clen_tot;
1403 
1404 	//fprintf(stderr, "    stripe meta %d\n", c_meta_len); //c-size
1405 
1406 	// Uncompress the N streams
1407 	unsigned char *outN = malloc(ulen);
1408 	if (!outN) {
1409 	    free(out_free);
1410 	    return NULL;
1411 	}
1412 	for (i = 0; i < N; i++) {
1413 	    olen = ulenN[i];
1414 	    if (in_size < c_meta_len) {
1415 		free(out_free);
1416 		free(outN);
1417 		return NULL;
1418 	    }
1419 	    if (!rans_uncompress_to_4x16(in+c_meta_len, in_size-c_meta_len, outN + idxN[i], &olen)
1420 		|| olen != ulenN[i]) {
1421 		free(out_free);
1422 		free(outN);
1423 		return NULL;
1424 	    }
1425 	    c_meta_len += clenN[i];
1426 	}
1427 
1428 	unstripe(out, outN, ulen, N, idxN);
1429 
1430 	free(outN);
1431 	*out_size = ulen;
1432 	return out;
1433     }
1434 
1435     int order = *in++;  in_size--;
1436     int do_pack = order & X_PACK;
1437     int do_rle  = order & X_RLE;
1438     int do_cat  = order & X_CAT;
1439     int no_size = order & X_NOSZ;
1440     order &= 1;
1441 
1442     int sz = 0;
1443     unsigned int osz;
1444     if (!no_size)
1445 	sz = var_get_u32(in, in_end, &osz);
1446     else
1447 	sz = 0, osz = *out_size;
1448     in += sz;
1449     in_size -= sz;
1450 
1451 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1452     if (osz > 100000)
1453 	return NULL;
1454 #endif
1455 
1456     if (no_size && !out)
1457 	goto err; // Need one or the other
1458 
1459     if (!out) {
1460 	*out_size = osz;
1461 	if (!(out = out_free = malloc(*out_size)))
1462 	    return NULL;
1463     } else {
1464 	if (*out_size < osz)
1465 	goto err;
1466 	*out_size = osz;
1467     }
1468 
1469 //    if (do_pack || do_rle) {
1470 //	in += sz; // size field not needed when pure rANS
1471 //	in_size -= sz;
1472 //    }
1473 
1474     uint32_t c_meta_size = 0;
1475     unsigned int tmp1_size = *out_size;
1476     unsigned int tmp2_size = *out_size;
1477     unsigned int tmp3_size = *out_size;
1478     unsigned char *tmp1 = NULL, *tmp2 = NULL, *tmp3 = NULL, *tmp = NULL;
1479 
1480     // Need In, Out and Tmp buffers with temporary buffer of the same size
1481     // as output.  All use rANS, but with optional transforms (none, RLE,
1482     // Pack, or both).
1483     //
1484     //                    rans   unrle  unpack
1485     // If none:     in -> out
1486     // If RLE:      in -> tmp -> out
1487     // If Pack:     in -> tmp        -> out
1488     // If RLE+Pack: in -> out -> tmp -> out
1489     //                    tmp1   tmp2   tmp3
1490     //
1491     // So rans is in   -> tmp1
1492     // RLE     is tmp1 -> tmp2
1493     // Unpack  is tmp2 -> tmp3
1494 
1495     // Format is meta data (Pack and RLE in that order if present),
1496     // followed by rANS compressed data.
1497 
1498     if (do_pack || do_rle) {
1499 	if (!(tmp = tmp_free = malloc(*out_size)))
1500 	    goto err;
1501 	if (do_pack && do_rle) {
1502 	    tmp1 = out;
1503 	    tmp2 = tmp;
1504 	    tmp3 = out;
1505 	} else if (do_pack) {
1506 	    tmp1 = tmp;
1507 	    tmp2 = tmp1;
1508 	    tmp3 = out;
1509 	} else if (do_rle) {
1510 	    tmp1 = tmp;
1511 	    tmp2 = out;
1512 	    tmp3 = out;
1513 	}
1514     } else {
1515 	// neither
1516 	tmp  = NULL;
1517 	tmp1 = out;
1518 	tmp2 = out;
1519 	tmp3 = out;
1520     }
1521 
1522 
1523     // Decode the bit-packing map.
1524     uint8_t map[16] = {0};
1525     int npacked_sym = 0;
1526     uint64_t unpacked_sz = 0; // FIXME: rename to packed_per_byte
1527     if (do_pack) {
1528 	c_meta_size = hts_unpack_meta(in, in_size, *out_size, map, &npacked_sym);
1529 	if (c_meta_size == 0)
1530 	    goto err;
1531 
1532 	unpacked_sz = osz;
1533 	in      += c_meta_size;
1534 	in_size -= c_meta_size;
1535 
1536 	// New unpacked size.  We could derive this bit from *out_size
1537 	// and npacked_sym.
1538 	unsigned int osz;
1539 	sz = var_get_u32(in, in_end, &osz);
1540 	in += sz;
1541 	in_size -= sz;
1542 	if (osz > tmp1_size)
1543 	    goto err;
1544 	tmp1_size = osz;
1545     }
1546 
1547     uint8_t *meta = NULL;
1548     uint32_t u_meta_size = 0;
1549     if (do_rle) {
1550 	// Uncompress meta data
1551 	uint32_t c_meta_size, rle_len, sz;
1552 	sz  = var_get_u32(in,    in_end, &u_meta_size);
1553 	sz += var_get_u32(in+sz, in_end, &rle_len);
1554 	if (rle_len > tmp1_size) // should never grow
1555 	    goto err;
1556 	if (u_meta_size & 1) {
1557 	    meta = in + sz;
1558 	    u_meta_size = u_meta_size/2 > (in_end-meta) ? (in_end-meta) : u_meta_size/2;
1559 	    c_meta_size = u_meta_size;
1560 	} else {
1561 	    sz += var_get_u32(in+sz, in_end, &c_meta_size);
1562 	    u_meta_size /= 2;
1563 	    meta_free = meta = rans_uncompress_O0_4x16(in+sz, in_size-sz, NULL, u_meta_size);
1564 	    if (!meta)
1565 		goto err;
1566 	}
1567 	if (c_meta_size+sz > in_size)
1568 	    goto err;
1569 	in      += c_meta_size+sz;
1570 	in_size -= c_meta_size+sz;
1571 	tmp1_size = rle_len;
1572     }
1573 
1574     //fprintf(stderr, "    meta_size %d bytes\n", (int)(in - orig_in)); //c-size
1575 
1576     // uncompress RLE data.  in -> tmp1
1577     if (in_size) {
1578 	if (do_cat) {
1579 	    //fprintf(stderr, "    CAT %d\n", tmp1_size); //c-size
1580 	    if (tmp1_size > in_size)
1581 		goto err;
1582 	    if (tmp1_size > *out_size)
1583 		goto err;
1584 	    memcpy(tmp1, in, tmp1_size);
1585 	} else {
1586 	    tmp1 = order
1587 		? rans_uncompress_O1_4x16(in, in_size, tmp1, tmp1_size)
1588 		: rans_uncompress_O0_4x16(in, in_size, tmp1, tmp1_size);
1589 	    if (!tmp1)
1590 		goto err;
1591 	}
1592     } else {
1593 	tmp1 = NULL;
1594 	tmp1_size = 0;
1595     }
1596     tmp2_size = tmp3_size = tmp1_size;
1597 
1598     if (do_rle) {
1599 	// Unpack RLE.  tmp1 -> tmp2.
1600 	if (u_meta_size == 0)
1601 	    goto err;
1602 	uint64_t unrle_size = *out_size;
1603 	int rle_nsyms = *meta ? *meta : 256;
1604 	if (u_meta_size < 1+rle_nsyms)
1605 	    goto err;
1606 	if (!rle_decode(tmp1, tmp1_size,
1607 			meta+1+rle_nsyms, u_meta_size-(1+rle_nsyms),
1608 			meta+1, rle_nsyms, tmp2, &unrle_size))
1609 	    goto err;
1610 	tmp3_size = tmp2_size = unrle_size;
1611 	free(meta_free);
1612 	meta_free = NULL;
1613     }
1614     if (do_pack) {
1615 	// Unpack bits via pack-map.  tmp2 -> tmp3
1616 	if (npacked_sym == 1)
1617 	    unpacked_sz = tmp2_size;
1618 	//uint8_t *porig = unpack(tmp2, tmp2_size, unpacked_sz, npacked_sym, map);
1619 	//memcpy(tmp3, porig, unpacked_sz);
1620 	if (!hts_unpack(tmp2, tmp2_size, tmp3, unpacked_sz, npacked_sym, map))
1621 	    goto err;
1622 	tmp3_size = unpacked_sz;
1623     }
1624 
1625     if (tmp)
1626 	free(tmp);
1627 
1628     *out_size = tmp3_size;
1629     return tmp3;
1630 
1631  err:
1632     free(meta_free);
1633     free(out_free);
1634     free(tmp_free);
1635     return NULL;
1636 }
1637 
rans_uncompress_4x16(unsigned char * in,unsigned int in_size,unsigned int * out_size)1638 unsigned char *rans_uncompress_4x16(unsigned char *in, unsigned int in_size,
1639 				    unsigned int *out_size) {
1640     return rans_uncompress_to_4x16(in, in_size, NULL, out_size);
1641 }
1642