1 /*
2  * Copyright (c) 2014-2020 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *       Institute nor the names of its contributors may be used to endorse
18  *       or promote products derived from this software without specific
19  *       prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "config.h"
35 
36 #if defined(NO_THREADS) && (defined(__APPLE__) || defined(_WIN32))
37 // When pthreads is available, we use a single malloc, otherwise we'll
38 // (normally) use the stack instead.
39 //
40 // However the MacOS X default stack size can be tiny (512K), albeit
41 // I think only when threading?  We request malloc/free for the large
42 // local arrays instead to avoid this, but it does have a performance hit.
43 #define USE_HEAP
44 #endif
45 
46 // Use 11 for order-1?
47 #define TF_SHIFT 12
48 #define TOTFREQ (1<<TF_SHIFT)
49 
50 #include "rANS_byte.h"
51 #include "utils.h"
52 
53 /*-------------------------------------------------------------------------- */
54 /*
55  * Example wrapper to use the rans_byte.h functions included above.
56  *
57  * This demonstrates how to use, and unroll, an order-0 and order-1 frequency
58  * model.
59  */
60 
61 #include <stdint.h>
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <unistd.h>
65 #include <assert.h>
66 #include <string.h>
67 #include <limits.h>
68 #include <sys/time.h>
69 #ifndef NO_THREADS
70 #include <pthread.h>
71 #endif
72 
73 #include "rANS_static.h"
74 
75 #define ABS(a) ((a)>0?(a):-(a))
76 
77 /*-----------------------------------------------------------------------------
78  * Memory to memory compression functions.
79  *
80  * These are original versions without any manual loop unrolling. They
81  * are easier to understand, but can be up to 2x slower.
82  */
83 
84 static
rans_compress_O0(unsigned char * in,unsigned int in_size,unsigned int * out_size)85 unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
86 				unsigned int *out_size) {
87     unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
88     unsigned char *cp, *out_end;
89     RansEncSymbol syms[256];
90     RansState rans0;
91     RansState rans2;
92     RansState rans1;
93     RansState rans3;
94     uint8_t* ptr;
95     int F[256+MAGIC] = {0}, i, j, tab_size, rle, x, fsum = 0;
96     int m = 0, M = 0;
97     uint64_t tr;
98 
99     if (!out_buf)
100 	return NULL;
101 
102     ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
103 
104     // Compute statistics
105     hist8(in, in_size, (uint32_t *)F);
106     tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size;
107 
108  normalise_harder:
109     // Normalise so T[i] == TOTFREQ
110     for (fsum = m = M = j = 0; j < 256; j++) {
111 	if (!F[j])
112 	    continue;
113 
114 	if (m < F[j])
115 	    m = F[j], M = j;
116 
117 	if ((F[j] = (F[j]*tr)>>31) == 0)
118 	    F[j] = 1;
119 	fsum += F[j];
120     }
121 
122     fsum++;
123     if (fsum < TOTFREQ) {
124 	F[M] += TOTFREQ-fsum;
125     } else if (fsum-TOTFREQ > F[M]/2) {
126 	// Corner case to avoid excessive frequency reduction
127 	tr = 2104533975; goto normalise_harder; // equiv to *0.98.
128     } else {
129 	F[M] -= fsum-TOTFREQ;
130     }
131 
132     //printf("F[%d]=%d\n", M, F[M]);
133     assert(F[M]>0);
134 
135     // Encode statistics.
136     cp = out_buf+9;
137 
138     for (x = rle = j = 0; j < 256; j++) {
139 	if (F[j]) {
140 	    // j
141 	    if (rle) {
142 		rle--;
143 	    } else {
144 		*cp++ = j;
145 		if (!rle && j && F[j-1])  {
146 		    for(rle=j+1; rle<256 && F[rle]; rle++)
147 			;
148 		    rle -= j+1;
149 		    *cp++ = rle;
150 		}
151 		//fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
152 	    }
153 
154 	    // F[j]
155 	    if (F[j]<128) {
156 		*cp++ = F[j];
157 	    } else {
158 		*cp++ = 128 | (F[j]>>8);
159 		*cp++ = F[j]&0xff;
160 	    }
161 	    RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
162 	    x += F[j];
163 	}
164     }
165     *cp++ = 0;
166 
167     //write(2, out_buf+4, cp-(out_buf+4));
168     tab_size = cp-out_buf;
169 
170     RansEncInit(&rans0);
171     RansEncInit(&rans1);
172     RansEncInit(&rans2);
173     RansEncInit(&rans3);
174 
175     switch (i=(in_size&3)) {
176     case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
177     case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
178     case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
179     case 0:
180 	break;
181     }
182     for (i=(in_size &~3); i>0; i-=4) {
183 	RansEncSymbol *s3 = &syms[in[i-1]];
184 	RansEncSymbol *s2 = &syms[in[i-2]];
185 	RansEncSymbol *s1 = &syms[in[i-3]];
186 	RansEncSymbol *s0 = &syms[in[i-4]];
187 
188 	RansEncPutSymbol(&rans3, &ptr, s3);
189 	RansEncPutSymbol(&rans2, &ptr, s2);
190 	RansEncPutSymbol(&rans1, &ptr, s1);
191 	RansEncPutSymbol(&rans0, &ptr, s0);
192     }
193 
194     RansEncFlush(&rans3, &ptr);
195     RansEncFlush(&rans2, &ptr);
196     RansEncFlush(&rans1, &ptr);
197     RansEncFlush(&rans0, &ptr);
198 
199     // Finalise block size and return it
200     *out_size = (out_end - ptr) + tab_size;
201 
202     cp = out_buf;
203 
204     *cp++ = 0; // order
205     *cp++ = ((*out_size-9)>> 0) & 0xff;
206     *cp++ = ((*out_size-9)>> 8) & 0xff;
207     *cp++ = ((*out_size-9)>>16) & 0xff;
208     *cp++ = ((*out_size-9)>>24) & 0xff;
209 
210     *cp++ = (in_size>> 0) & 0xff;
211     *cp++ = (in_size>> 8) & 0xff;
212     *cp++ = (in_size>>16) & 0xff;
213     *cp++ = (in_size>>24) & 0xff;
214 
215     memmove(out_buf + tab_size, ptr, out_end-ptr);
216 
217     return out_buf;
218 }
219 
220 typedef struct {
221     unsigned char R[TOTFREQ];
222 } ari_decoder;
223 
224 static
rans_uncompress_O0(unsigned char * in,unsigned int in_size,unsigned int * out_size)225 unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
226 				  unsigned int *out_size) {
227     /* Load in the static tables */
228     unsigned char *cp = in + 9;
229     unsigned char *cp_end = in + in_size;
230     const uint32_t mask = (1u << TF_SHIFT)-1;
231     int i, j, rle;
232     unsigned int x, y;
233     unsigned int out_sz, in_sz;
234     char *out_buf;
235     RansState R[4];
236     RansState m[4];
237     uint16_t sfreq[TOTFREQ+32];
238     uint16_t ssym [TOTFREQ+32]; // faster, but only needs uint8_t
239     uint32_t sbase[TOTFREQ+16]; // faster, but only needs uint16_t
240 
241     if (in_size < 26) // Need at least this many bytes just to start
242         return NULL;
243 
244     if (*in++ != 0) // Order-0 check
245 	return NULL;
246 
247     in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
248     out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
249     if (in_sz != in_size-9)
250 	return NULL;
251 
252     if (out_sz >= INT_MAX)
253 	return NULL; // protect against some overflow cases
254 
255     // For speeding up the fuzzer only.
256     // Small input can lead to large uncompressed data.
257     // We reject this as it just slows things up instead of testing more code
258     // paths (once we've verified a few times for large data).
259 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
260     if (out_sz > 100000)
261 	return NULL;
262 #endif
263 
264     out_buf = malloc(out_sz);
265     if (!out_buf)
266 	return NULL;
267 
268     //fprintf(stderr, "out_sz=%d\n", out_sz);
269 
270     // Precompute reverse lookup of frequency.
271     rle = x = y = 0;
272     j = *cp++;
273     do {
274 	int F, C;
275         if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
276 	if ((F = *cp++) >= 128) {
277 	    F &= ~128;
278 	    F = ((F & 127) << 8) | *cp++;
279 	}
280 	C = x;
281 
282 	if (x + F > TOTFREQ)
283 	    goto cleanup;
284 
285         for (y = 0; y < F; y++) {
286             ssym [y + C] = j;
287             sfreq[y + C] = F;
288             sbase[y + C] = y;
289         }
290 	x += F;
291 
292 	if (!rle && j+1 == *cp) {
293 	    j = *cp++;
294 	    rle = *cp++;
295 	} else if (rle) {
296 	    rle--;
297 	    j++;
298 	    if (j > 255)
299 		goto cleanup;
300 	} else {
301 	    j = *cp++;
302 	}
303     } while(j);
304 
305     if (x < TOTFREQ-1 || x > TOTFREQ)
306 	goto cleanup;
307 
308     // 16 bytes of cp here. Also why cp - 16 in above loop.
309     if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
310 
311     RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto cleanup;
312     RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto cleanup;
313     RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto cleanup;
314     RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto cleanup;
315 
316     int out_end = (out_sz&~3);
317     cp_end -= 8; // within 8 for simplicity of loop below
318     for (i=0; i < out_end; i+=4) {
319 	m[0] = R[0] & mask;
320         out_buf[i+0] = ssym[m[0]];
321         R[0] = sfreq[m[0]] * (R[0] >> TF_SHIFT) + sbase[m[0]];
322 
323         m[1] = R[1] & mask;
324 	out_buf[i+1] = ssym[m[1]];
325         R[1] = sfreq[m[1]] * (R[1] >> TF_SHIFT) + sbase[m[1]];
326 
327         m[2] = R[2] & mask;
328 	out_buf[i+2] = ssym[m[2]];
329         R[2] = sfreq[m[2]] * (R[2] >> TF_SHIFT) + sbase[m[2]];
330 
331         m[3] = R[3] & mask;
332 	out_buf[i+3] = ssym[m[3]];
333         R[3] = sfreq[m[3]] * (R[3] >> TF_SHIFT) + sbase[m[3]];
334 
335 	if (cp < cp_end) {
336 	    RansDecRenorm2(&R[0], &R[1], &cp);
337 	    RansDecRenorm2(&R[2], &R[3], &cp);
338 	} else {
339 	    RansDecRenormSafe(&R[0], &cp, cp_end+8);
340 	    RansDecRenormSafe(&R[1], &cp, cp_end+8);
341 	    RansDecRenormSafe(&R[2], &cp, cp_end+8);
342 	    RansDecRenormSafe(&R[3], &cp, cp_end+8);
343 	}
344     }
345 
346     switch(out_sz&3) {
347     case 3:
348         out_buf[out_end + 2] = ssym[R[2] & mask];
349     case 2:
350         out_buf[out_end + 1] = ssym[R[1] & mask];
351     case 1:
352         out_buf[out_end] = ssym[R[0] & mask];
353     default:
354         break;
355     }
356 
357     *out_size = out_sz;
358     return (unsigned char *)out_buf;
359 
360  cleanup:
361     free(out_buf);
362     return NULL;
363 }
364 
365 
366 #ifndef NO_THREADS
367 /*
368  * Thread local storage per thread in the pool.
369  * This avoids needing to memset/calloc F and syms in the encoder,
370  * which can be speed things this encoder up a little.
371  */
372 static pthread_once_t rans_enc_once = PTHREAD_ONCE_INIT;
373 static pthread_key_t rans_enc_key;
374 
375 typedef struct {
376     RansEncSymbol (*syms)[256];
377     int (*F)[256];
378 } thread_enc_data;
379 
rans_enc_free(void * vp)380 static void rans_enc_free(void *vp) {
381     thread_enc_data *te = (thread_enc_data *)vp;
382     if (!te)
383 	return;
384     free(te->F);
385     free(te->syms);
386     free(te);
387 }
388 
rans_enc_alloc(void)389 static thread_enc_data *rans_enc_alloc(void) {
390     thread_enc_data *te = malloc(sizeof(*te));
391     if (!te)
392 	return NULL;
393     te->F = calloc(256, sizeof(*te->F));
394     te->syms = calloc(256, sizeof(*te->syms));
395     if (!te->F || !te->syms) {
396 	rans_enc_free(te);
397 	return NULL;
398     }
399 
400     return te;
401 }
402 
rans_tls_enc_init(void)403 static void rans_tls_enc_init(void) {
404     pthread_key_create(&rans_enc_key, rans_enc_free);
405 }
406 #endif
407 
408 static
rans_compress_O1(unsigned char * in,unsigned int in_size,unsigned int * out_size)409 unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
410 				unsigned int *out_size) {
411     unsigned char *out_buf = NULL, *out_end, *cp;
412     unsigned int tab_size, rle_i, rle_j;
413 
414 
415 #ifndef NO_THREADS
416     pthread_once(&rans_enc_once, rans_tls_enc_init);
417     thread_enc_data *te = pthread_getspecific(rans_enc_key);
418     if (!te) {
419 	if (!(te = rans_enc_alloc()))
420 	    return NULL;
421 	pthread_setspecific(rans_enc_key, te);
422     }
423     RansEncSymbol (*syms)[256] = te->syms;
424     int (*F)[256] = te->F;
425     memset(F, 0, 256*sizeof(*F));
426 #else
427 #ifdef USE_HEAP
428     RansEncSymbol (*syms)[256] = malloc(256 * sizeof(*syms));
429     int (*F)[256] = calloc(256, sizeof(*F));
430 #else
431     RansEncSymbol syms[256][256];
432     int F[256][256] = {{0}};
433 #endif
434 #endif
435     int T[256+MAGIC] = {0};
436     int i, j;
437 
438     if (in_size < 4)
439 	return rans_compress_O0(in, in_size, out_size);
440 
441 #ifdef USE_HEAP
442     if (!syms) goto cleanup;
443     if (!F) goto cleanup;
444 #endif
445 
446     out_buf = malloc(1.05*in_size + 257*257*3 + 9);
447     if (!out_buf) goto cleanup;
448 
449     out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
450     cp = out_buf+9;
451 
452     hist1_4(in, in_size, (uint32_t (*)[256])F, (uint32_t *)T);
453 
454     F[0][in[1*(in_size>>2)]]++;
455     F[0][in[2*(in_size>>2)]]++;
456     F[0][in[3*(in_size>>2)]]++;
457     T[0]+=3;
458 
459 
460     // Normalise so T[i] == TOTFREQ
461     for (rle_i = i = 0; i < 256; i++) {
462 	int t2, m, M;
463 	unsigned int x;
464 
465 	if (T[i] == 0)
466 	    continue;
467 
468 	//uint64_t p = (TOTFREQ * TOTFREQ) / t;
469 	double p = ((double)TOTFREQ)/T[i];
470     normalise_harder:
471 	for (t2 = m = M = j = 0; j < 256; j++) {
472 	    if (!F[i][j])
473 		continue;
474 
475 	    if (m < F[i][j])
476 		m = F[i][j], M = j;
477 
478 	    //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0)
479 	    if ((F[i][j] *= p) == 0)
480 		F[i][j] = 1;
481 	    t2 += F[i][j];
482 	}
483 
484 	t2++;
485 	if (t2 < TOTFREQ) {
486 	    F[i][M] += TOTFREQ-t2;
487 	} else if (t2-TOTFREQ >= F[i][M]/2) {
488 	    // Corner case to avoid excessive frequency reduction
489 	    p = .98; goto normalise_harder;
490 	} else {
491 	    F[i][M] -= t2-TOTFREQ;
492 	}
493 
494 	// Store frequency table
495 	// i
496 	if (rle_i) {
497 	    rle_i--;
498 	} else {
499 	    *cp++ = i;
500 	    // FIXME: could use order-0 statistics to observe which alphabet
501 	    // symbols are present and base RLE on that ordering instead.
502 	    if (i && T[i-1]) {
503 		for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
504 		    ;
505 		rle_i -= i+1;
506 		*cp++ = rle_i;
507 	    }
508 	}
509 
510 	int *F_i_ = F[i];
511 	x = 0;
512 	rle_j = 0;
513 	for (j = 0; j < 256; j++) {
514 	    if (F_i_[j]) {
515 		//fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x);
516 
517 		// j
518 		if (rle_j) {
519 		    rle_j--;
520 		} else {
521 		    *cp++ = j;
522 		    if (!rle_j && j && F_i_[j-1]) {
523 			for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
524 			    ;
525 			rle_j -= j+1;
526 			*cp++ = rle_j;
527 		    }
528 		}
529 
530 		// F_i_[j]
531 		if (F_i_[j]<128) {
532  		    *cp++ = F_i_[j];
533 		} else {
534 		    *cp++ = 128 | (F_i_[j]>>8);
535 		    *cp++ = F_i_[j]&0xff;
536 		}
537 
538 		RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
539 		x += F_i_[j];
540 	    }
541 	}
542 	*cp++ = 0;
543     }
544     *cp++ = 0;
545 
546     //write(2, out_buf+4, cp-(out_buf+4));
547     tab_size = cp - out_buf;
548     assert(tab_size < 257*257*3);
549 
550     RansState rans0, rans1, rans2, rans3;
551     RansEncInit(&rans0);
552     RansEncInit(&rans1);
553     RansEncInit(&rans2);
554     RansEncInit(&rans3);
555 
556     uint8_t* ptr = out_end;
557 
558     int isz4 = in_size>>2;
559     int i0 = 1*isz4-2;
560     int i1 = 2*isz4-2;
561     int i2 = 3*isz4-2;
562     int i3 = 4*isz4-2;
563 
564     unsigned char l0 = in[i0+1];
565     unsigned char l1 = in[i1+1];
566     unsigned char l2 = in[i2+1];
567     unsigned char l3 = in[i3+1];
568 
569     // Deal with the remainder
570     l3 = in[in_size-1];
571     for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
572 	unsigned char c3 = in[i3];
573 	RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
574 	l3 = c3;
575     }
576 
577     for (; i0 >= 0; i0--, i1--, i2--, i3--) {
578 	unsigned char c3 = in[i3];
579 	unsigned char c2 = in[i2];
580 	unsigned char c1 = in[i1];
581 	unsigned char c0 = in[i0];
582 
583 	RansEncSymbol *s3 = &syms[c3][l3];
584 	RansEncSymbol *s2 = &syms[c2][l2];
585 	RansEncSymbol *s1 = &syms[c1][l1];
586 	RansEncSymbol *s0 = &syms[c0][l0];
587 
588 	RansEncPutSymbol4(&rans3, &rans2, &rans1, &rans0, &ptr,
589 			  s3, s2, s1, s0);
590 
591 	l3 = c3;
592 	l2 = c2;
593 	l1 = c1;
594 	l0 = c0;
595     }
596 
597     RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
598     RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
599     RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
600     RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
601 
602     RansEncFlush(&rans3, &ptr);
603     RansEncFlush(&rans2, &ptr);
604     RansEncFlush(&rans1, &ptr);
605     RansEncFlush(&rans0, &ptr);
606 
607     *out_size = (out_end - ptr) + tab_size;
608 
609     cp = out_buf;
610     *cp++ = 1; // order
611 
612     *cp++ = ((*out_size-9)>> 0) & 0xff;
613     *cp++ = ((*out_size-9)>> 8) & 0xff;
614     *cp++ = ((*out_size-9)>>16) & 0xff;
615     *cp++ = ((*out_size-9)>>24) & 0xff;
616 
617     *cp++ = (in_size>> 0) & 0xff;
618     *cp++ = (in_size>> 8) & 0xff;
619     *cp++ = (in_size>>16) & 0xff;
620     *cp++ = (in_size>>24) & 0xff;
621 
622     memmove(out_buf + tab_size, ptr, out_end-ptr);
623 
624  cleanup:
625 #ifdef USE_HEAP
626     free(syms);
627     free(F);
628 #endif
629 
630     return out_buf;
631 }
632 
633 #ifndef NO_THREADS
634 /*
635  * Thread local storage per thread in the pool.
636  * This avoids needing to memset/calloc D and syms in the decoder,
637  * which can be speed things this decoder up a little (~10%).
638  */
639 static pthread_once_t rans_once = PTHREAD_ONCE_INIT;
640 static pthread_key_t rans_key;
641 
642 typedef struct {
643     ari_decoder *D;
644     RansDecSymbol32 (*syms)[256];
645 } thread_data;
646 
rans_tb_free(void * vp)647 static void rans_tb_free(void *vp) {
648     thread_data *tb = (thread_data *)vp;
649     if (!tb)
650 	return;
651     free(tb->D);
652     free(tb->syms);
653     free(tb);
654 }
655 
rans_tb_alloc(void)656 static thread_data *rans_tb_alloc(void) {
657     thread_data *tb = malloc(sizeof(*tb));
658     if (!tb)
659 	return NULL;
660     tb->D = calloc(256, sizeof(*tb->D));
661     tb->syms = calloc(256, sizeof(*tb->syms));
662     if (!tb->D || !tb->syms) {
663 	rans_tb_free(tb);
664 	return NULL;
665     }
666 
667     return tb;
668 }
669 
rans_tls_init(void)670 static void rans_tls_init(void) {
671     pthread_key_create(&rans_key, rans_tb_free);
672 }
673 #endif
674 
675 static
rans_uncompress_O1(unsigned char * in,unsigned int in_size,unsigned int * out_size)676 unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
677 				  unsigned int *out_size) {
678     /* Load in the static tables */
679     unsigned char *cp = in + 9;
680     unsigned char *ptr_end = in + in_size;
681     int i, j = -999, rle_i, rle_j;
682     unsigned int x;
683     unsigned int out_sz, in_sz;
684     char *out_buf = NULL;
685     // D[] is 1Mb and syms[][] is 0.5Mb.
686 #ifndef NO_THREADS
687     pthread_once(&rans_once, rans_tls_init);
688     thread_data *tb = pthread_getspecific(rans_key);
689     if (!tb) {
690 	if (!(tb = rans_tb_alloc()))
691 	    return NULL;
692 	pthread_setspecific(rans_key, tb);
693     }
694     ari_decoder *const D = tb->D;
695     RansDecSymbol32 (*const syms)[256] = tb->syms;
696 #else
697 #ifdef USE_HEAP
698     //ari_decoder *const D = malloc(256 * sizeof(*D));
699     ari_decoder *const D = calloc(256, sizeof(*D));
700     RansDecSymbol32 (*const syms)[256] = malloc(256 * sizeof(*syms));
701     for (i = 1; i < 256; i++) memset(&syms[i][0], 0, sizeof(syms[0][0]));
702 #else
703     ari_decoder D[256] = {{{0}}}; //256*4k    => 1.0Mb
704     RansDecSymbol32 syms[256][256+6] = {{{0}}}; //256*262*8 => 0.5Mb
705 #endif
706 #endif
707     int16_t map[256], map_i = 0;
708 
709     memset(map, -1, 256*sizeof(*map));
710 
711     if (in_size < 27) // Need at least this many bytes to start
712         return NULL;
713 
714     if (*in++ != 1) // Order-1 check
715 	return NULL;
716 
717     in_sz  = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
718     out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
719     if (in_sz != in_size-9)
720 	return NULL;
721 
722     if (out_sz >= INT_MAX)
723 	return NULL; // protect against some overflow cases
724 
725     // For speeding up the fuzzer only.
726     // Small input can lead to large uncompressed data.
727     // We reject this as it just slows things up instead of testing more code
728     // paths (once we've verified a few times for large data).
729 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
730     if (out_sz > 100000)
731 	return NULL;
732 #endif
733 
734 #if defined(USE_HEAP)
735     if (!D || !syms) goto cleanup;
736     /* These memsets prevent illegal memory access in syms due to
737        broken compressed data.  As D is calloc'd, all illegal transitions
738        will end up in either row or column 0 of syms. */
739     memset(&syms[0], 0, sizeof(syms[0]));
740     for (i = 0; i < 256; i++)
741 	memset(&syms[i][0], 0, sizeof(syms[0][0]));
742 #endif
743 
744     //fprintf(stderr, "out_sz=%d\n", out_sz);
745 
746     //i = *cp++;
747     rle_i = 0;
748     i = *cp++;
749     do {
750 	// Map arbitrary a,b,c to 0,1,2 to improve cache locality.
751 	if (map[i] == -1)
752 	    map[i] = map_i++;
753 	int m_i = map[i];
754 
755 	rle_j = x = 0;
756 	j = *cp++;
757 	do {
758 	    if (map[j] == -1)
759 		map[j] = map_i++;
760 
761 	    int F, C;
762             if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
763 	    if ((F = *cp++) >= 128) {
764 		F &= ~128;
765 		F = ((F & 127) << 8) | *cp++;
766 	    }
767 	    C = x;
768 
769 	    //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C);
770 
771 	    if (!F)
772 		F = TOTFREQ;
773 
774 	    RansDecSymbolInit32(&syms[m_i][j], C, F);
775 
776 	    /* Build reverse lookup table */
777 	    //if (!D[i].R) D[i].R = (unsigned char *)malloc(TOTFREQ);
778 	    if (x + F > TOTFREQ)
779 		goto cleanup;
780 
781 	    memset(&D[m_i].R[x], j, F);
782 	    x += F;
783 
784 	    if (!rle_j && j+1 == *cp) {
785 		j = *cp++;
786 		rle_j = *cp++;
787 	    } else if (rle_j) {
788 		rle_j--;
789 		j++;
790 		if (j > 255)
791 		    goto cleanup;
792 	    } else {
793 		j = *cp++;
794 	    }
795 	} while(j);
796 
797         if (x < TOTFREQ-1 || x > TOTFREQ)
798             goto cleanup;
799         if (x < TOTFREQ) // historically we fill 4095, not 4096
800             D[i].R[x] = D[i].R[x-1];
801 
802 	if (!rle_i && i+1 == *cp) {
803 	    i = *cp++;
804 	    rle_i = *cp++;
805 	} else if (rle_i) {
806 	    rle_i--;
807 	    i++;
808 	    if (i > 255)
809 		goto cleanup;
810 	} else {
811 	    i = *cp++;
812 	}
813     } while (i);
814     for (i = 0; i < 256; i++)
815 	if (map[i] == -1)
816 	    map[i] = 0;
817 
818     RansState rans0, rans1, rans2, rans3;
819     uint8_t *ptr = cp;
820     if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
821     RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) return NULL;
822     RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) return NULL;
823     RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) return NULL;
824     RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) return NULL;
825 
826     RansState R[4];
827     R[0] = rans0;
828     R[1] = rans1;
829     R[2] = rans2;
830     R[3] = rans3;
831 
832     unsigned int isz4 = out_sz>>2;
833     uint32_t l0 = 0;
834     uint32_t l1 = 0;
835     uint32_t l2 = 0;
836     uint32_t l3 = 0;
837 
838     unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
839 
840     /* Allocate output buffer */
841     out_buf = malloc(out_sz);
842     if (!out_buf) goto cleanup;
843 
844     uint8_t cc0 = D[map[l0]].R[R[0] & ((1u << TF_SHIFT)-1)];
845     uint8_t cc1 = D[map[l1]].R[R[1] & ((1u << TF_SHIFT)-1)];
846     uint8_t cc2 = D[map[l2]].R[R[2] & ((1u << TF_SHIFT)-1)];
847     uint8_t cc3 = D[map[l3]].R[R[3] & ((1u << TF_SHIFT)-1)];
848 
849     ptr_end -= 8;
850     for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
851 	out_buf[i4[0]] = cc0;
852 	out_buf[i4[1]] = cc1;
853 	out_buf[i4[2]] = cc2;
854 	out_buf[i4[3]] = cc3;
855 
856 	//RansDecAdvanceStep(&R[0], syms[l0][cc0].start, syms[l0][cc0].freq, TF_SHIFT);
857 	//RansDecAdvanceStep(&R[1], syms[l1][cc1].start, syms[l1][cc1].freq, TF_SHIFT);
858 	//RansDecAdvanceStep(&R[2], syms[l2][cc2].start, syms[l2][cc2].freq, TF_vSHIFT);
859 	//RansDecAdvanceStep(&R[3], syms[l3][cc3].start, syms[l3][cc3].freq, TF_SHIFT);
860 
861 	{
862 	    uint32_t m[4];
863 
864 	    // Ordering to try and improve OoO cpu instructions
865 	    m[0] = R[0] & ((1u << TF_SHIFT)-1);
866 	    R[0] = syms[l0][cc0].freq * (R[0]>>TF_SHIFT);
867 	    m[1] = R[1] & ((1u << TF_SHIFT)-1);
868 	    R[0] += m[0] - syms[l0][cc0].start;
869 	    R[1] = syms[l1][cc1].freq * (R[1]>>TF_SHIFT);
870 	    m[2] = R[2] & ((1u << TF_SHIFT)-1);
871 	    R[1] += m[1] - syms[l1][cc1].start;
872 	    R[2] = syms[l2][cc2].freq * (R[2]>>TF_SHIFT);
873 	    m[3] = R[3] & ((1u << TF_SHIFT)-1);
874 	    R[3] = syms[l3][cc3].freq * (R[3]>>TF_SHIFT);
875 	    R[2] += m[2] - syms[l2][cc2].start;
876 	    R[3] += m[3] - syms[l3][cc3].start;
877 	}
878 
879 	l0 = map[cc0];
880 	l1 = map[cc1];
881 	l2 = map[cc2];
882 	l3 = map[cc3];
883 
884 	if (ptr < ptr_end) {
885 	    RansDecRenorm2(&R[0], &R[1], &ptr);
886 	    RansDecRenorm2(&R[2], &R[3], &ptr);
887 	} else {
888 	    RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
889 	    RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
890 	    RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
891 	    RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
892 	}
893 
894 	cc0 = D[l0].R[R[0] & ((1u << TF_SHIFT)-1)];
895 	cc1 = D[l1].R[R[1] & ((1u << TF_SHIFT)-1)];
896 	cc2 = D[l2].R[R[2] & ((1u << TF_SHIFT)-1)];
897 	cc3 = D[l3].R[R[3] & ((1u << TF_SHIFT)-1)];
898     }
899 
900     // Remainder
901     for (; i4[3] < out_sz; i4[3]++) {
902 	unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)];
903 	out_buf[i4[3]] = c3;
904 
905 	uint32_t m = R[3] & ((1u << TF_SHIFT)-1);
906 	R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start;
907 	RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
908 	l3 = map[c3];
909     }
910 
911     *out_size = out_sz;
912 
913  cleanup:
914 #if defined(USE_HEAP)
915     if (D)
916         free(D);
917 
918     free(syms);
919 #endif
920 
921     return (unsigned char *)out_buf;
922 }
923 
924 /*-----------------------------------------------------------------------------
925  * Simple interface to the order-0 vs order-1 encoders and decoders.
926  */
rans_compress(unsigned char * in,unsigned int in_size,unsigned int * out_size,int order)927 unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
928 			     unsigned int *out_size, int order) {
929     return order
930 	? rans_compress_O1(in, in_size, out_size)
931 	: rans_compress_O0(in, in_size, out_size);
932 }
933 
rans_uncompress(unsigned char * in,unsigned int in_size,unsigned int * out_size)934 unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
935 			       unsigned int *out_size) {
936     /* Both rans_uncompress functions need to be able to read at least 9
937        bytes. */
938     if (in_size < 9)
939         return NULL;
940     return in[0]
941 	? rans_uncompress_O1(in, in_size, out_size)
942 	: rans_uncompress_O0(in, in_size, out_size);
943 }
944