1 /*
2 * Copyright (c) 2014-2020 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "config.h"
35
36 #if defined(NO_THREADS) && (defined(__APPLE__) || defined(_WIN32))
37 // When pthreads is available, we use a single malloc, otherwise we'll
38 // (normally) use the stack instead.
39 //
40 // However the MacOS X default stack size can be tiny (512K), albeit
41 // I think only when threading? We request malloc/free for the large
42 // local arrays instead to avoid this, but it does have a performance hit.
43 #define USE_HEAP
44 #endif
45
46 // Use 11 for order-1?
47 #define TF_SHIFT 12
48 #define TOTFREQ (1<<TF_SHIFT)
49
50 #include "rANS_byte.h"
51 #include "utils.h"
52
53 /*-------------------------------------------------------------------------- */
54 /*
55 * Example wrapper to use the rans_byte.h functions included above.
56 *
57 * This demonstrates how to use, and unroll, an order-0 and order-1 frequency
58 * model.
59 */
60
61 #include <stdint.h>
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <unistd.h>
65 #include <assert.h>
66 #include <string.h>
67 #include <limits.h>
68 #include <sys/time.h>
69 #ifndef NO_THREADS
70 #include <pthread.h>
71 #endif
72
73 #include "rANS_static.h"
74
75 #define ABS(a) ((a)>0?(a):-(a))
76
77 /*-----------------------------------------------------------------------------
78 * Memory to memory compression functions.
79 *
80 * These are original versions without any manual loop unrolling. They
81 * are easier to understand, but can be up to 2x slower.
82 */
83
84 static
rans_compress_O0(unsigned char * in,unsigned int in_size,unsigned int * out_size)85 unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size,
86 unsigned int *out_size) {
87 unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9);
88 unsigned char *cp, *out_end;
89 RansEncSymbol syms[256];
90 RansState rans0;
91 RansState rans2;
92 RansState rans1;
93 RansState rans3;
94 uint8_t* ptr;
95 int F[256+MAGIC] = {0}, i, j, tab_size, rle, x, fsum = 0;
96 int m = 0, M = 0;
97 uint64_t tr;
98
99 if (!out_buf)
100 return NULL;
101
102 ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
103
104 // Compute statistics
105 hist8(in, in_size, (uint32_t *)F);
106 tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size;
107
108 normalise_harder:
109 // Normalise so T[i] == TOTFREQ
110 for (fsum = m = M = j = 0; j < 256; j++) {
111 if (!F[j])
112 continue;
113
114 if (m < F[j])
115 m = F[j], M = j;
116
117 if ((F[j] = (F[j]*tr)>>31) == 0)
118 F[j] = 1;
119 fsum += F[j];
120 }
121
122 fsum++;
123 if (fsum < TOTFREQ) {
124 F[M] += TOTFREQ-fsum;
125 } else if (fsum-TOTFREQ > F[M]/2) {
126 // Corner case to avoid excessive frequency reduction
127 tr = 2104533975; goto normalise_harder; // equiv to *0.98.
128 } else {
129 F[M] -= fsum-TOTFREQ;
130 }
131
132 //printf("F[%d]=%d\n", M, F[M]);
133 assert(F[M]>0);
134
135 // Encode statistics.
136 cp = out_buf+9;
137
138 for (x = rle = j = 0; j < 256; j++) {
139 if (F[j]) {
140 // j
141 if (rle) {
142 rle--;
143 } else {
144 *cp++ = j;
145 if (!rle && j && F[j-1]) {
146 for(rle=j+1; rle<256 && F[rle]; rle++)
147 ;
148 rle -= j+1;
149 *cp++ = rle;
150 }
151 //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
152 }
153
154 // F[j]
155 if (F[j]<128) {
156 *cp++ = F[j];
157 } else {
158 *cp++ = 128 | (F[j]>>8);
159 *cp++ = F[j]&0xff;
160 }
161 RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
162 x += F[j];
163 }
164 }
165 *cp++ = 0;
166
167 //write(2, out_buf+4, cp-(out_buf+4));
168 tab_size = cp-out_buf;
169
170 RansEncInit(&rans0);
171 RansEncInit(&rans1);
172 RansEncInit(&rans2);
173 RansEncInit(&rans3);
174
175 switch (i=(in_size&3)) {
176 case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
177 case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
178 case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
179 case 0:
180 break;
181 }
182 for (i=(in_size &~3); i>0; i-=4) {
183 RansEncSymbol *s3 = &syms[in[i-1]];
184 RansEncSymbol *s2 = &syms[in[i-2]];
185 RansEncSymbol *s1 = &syms[in[i-3]];
186 RansEncSymbol *s0 = &syms[in[i-4]];
187
188 RansEncPutSymbol(&rans3, &ptr, s3);
189 RansEncPutSymbol(&rans2, &ptr, s2);
190 RansEncPutSymbol(&rans1, &ptr, s1);
191 RansEncPutSymbol(&rans0, &ptr, s0);
192 }
193
194 RansEncFlush(&rans3, &ptr);
195 RansEncFlush(&rans2, &ptr);
196 RansEncFlush(&rans1, &ptr);
197 RansEncFlush(&rans0, &ptr);
198
199 // Finalise block size and return it
200 *out_size = (out_end - ptr) + tab_size;
201
202 cp = out_buf;
203
204 *cp++ = 0; // order
205 *cp++ = ((*out_size-9)>> 0) & 0xff;
206 *cp++ = ((*out_size-9)>> 8) & 0xff;
207 *cp++ = ((*out_size-9)>>16) & 0xff;
208 *cp++ = ((*out_size-9)>>24) & 0xff;
209
210 *cp++ = (in_size>> 0) & 0xff;
211 *cp++ = (in_size>> 8) & 0xff;
212 *cp++ = (in_size>>16) & 0xff;
213 *cp++ = (in_size>>24) & 0xff;
214
215 memmove(out_buf + tab_size, ptr, out_end-ptr);
216
217 return out_buf;
218 }
219
220 typedef struct {
221 unsigned char R[TOTFREQ];
222 } ari_decoder;
223
224 static
rans_uncompress_O0(unsigned char * in,unsigned int in_size,unsigned int * out_size)225 unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size,
226 unsigned int *out_size) {
227 /* Load in the static tables */
228 unsigned char *cp = in + 9;
229 unsigned char *cp_end = in + in_size;
230 const uint32_t mask = (1u << TF_SHIFT)-1;
231 int i, j, rle;
232 unsigned int x, y;
233 unsigned int out_sz, in_sz;
234 char *out_buf;
235 RansState R[4];
236 RansState m[4];
237 uint16_t sfreq[TOTFREQ+32];
238 uint16_t ssym [TOTFREQ+32]; // faster, but only needs uint8_t
239 uint32_t sbase[TOTFREQ+16]; // faster, but only needs uint16_t
240
241 if (in_size < 26) // Need at least this many bytes just to start
242 return NULL;
243
244 if (*in++ != 0) // Order-0 check
245 return NULL;
246
247 in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
248 out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
249 if (in_sz != in_size-9)
250 return NULL;
251
252 if (out_sz >= INT_MAX)
253 return NULL; // protect against some overflow cases
254
255 // For speeding up the fuzzer only.
256 // Small input can lead to large uncompressed data.
257 // We reject this as it just slows things up instead of testing more code
258 // paths (once we've verified a few times for large data).
259 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
260 if (out_sz > 100000)
261 return NULL;
262 #endif
263
264 out_buf = malloc(out_sz);
265 if (!out_buf)
266 return NULL;
267
268 //fprintf(stderr, "out_sz=%d\n", out_sz);
269
270 // Precompute reverse lookup of frequency.
271 rle = x = y = 0;
272 j = *cp++;
273 do {
274 int F, C;
275 if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
276 if ((F = *cp++) >= 128) {
277 F &= ~128;
278 F = ((F & 127) << 8) | *cp++;
279 }
280 C = x;
281
282 if (x + F > TOTFREQ)
283 goto cleanup;
284
285 for (y = 0; y < F; y++) {
286 ssym [y + C] = j;
287 sfreq[y + C] = F;
288 sbase[y + C] = y;
289 }
290 x += F;
291
292 if (!rle && j+1 == *cp) {
293 j = *cp++;
294 rle = *cp++;
295 } else if (rle) {
296 rle--;
297 j++;
298 if (j > 255)
299 goto cleanup;
300 } else {
301 j = *cp++;
302 }
303 } while(j);
304
305 if (x < TOTFREQ-1 || x > TOTFREQ)
306 goto cleanup;
307
308 // 16 bytes of cp here. Also why cp - 16 in above loop.
309 if (cp > cp_end - 16) goto cleanup; // Not enough input bytes left
310
311 RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto cleanup;
312 RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto cleanup;
313 RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto cleanup;
314 RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto cleanup;
315
316 int out_end = (out_sz&~3);
317 cp_end -= 8; // within 8 for simplicity of loop below
318 for (i=0; i < out_end; i+=4) {
319 m[0] = R[0] & mask;
320 out_buf[i+0] = ssym[m[0]];
321 R[0] = sfreq[m[0]] * (R[0] >> TF_SHIFT) + sbase[m[0]];
322
323 m[1] = R[1] & mask;
324 out_buf[i+1] = ssym[m[1]];
325 R[1] = sfreq[m[1]] * (R[1] >> TF_SHIFT) + sbase[m[1]];
326
327 m[2] = R[2] & mask;
328 out_buf[i+2] = ssym[m[2]];
329 R[2] = sfreq[m[2]] * (R[2] >> TF_SHIFT) + sbase[m[2]];
330
331 m[3] = R[3] & mask;
332 out_buf[i+3] = ssym[m[3]];
333 R[3] = sfreq[m[3]] * (R[3] >> TF_SHIFT) + sbase[m[3]];
334
335 if (cp < cp_end) {
336 RansDecRenorm2(&R[0], &R[1], &cp);
337 RansDecRenorm2(&R[2], &R[3], &cp);
338 } else {
339 RansDecRenormSafe(&R[0], &cp, cp_end+8);
340 RansDecRenormSafe(&R[1], &cp, cp_end+8);
341 RansDecRenormSafe(&R[2], &cp, cp_end+8);
342 RansDecRenormSafe(&R[3], &cp, cp_end+8);
343 }
344 }
345
346 switch(out_sz&3) {
347 case 3:
348 out_buf[out_end + 2] = ssym[R[2] & mask];
349 case 2:
350 out_buf[out_end + 1] = ssym[R[1] & mask];
351 case 1:
352 out_buf[out_end] = ssym[R[0] & mask];
353 default:
354 break;
355 }
356
357 *out_size = out_sz;
358 return (unsigned char *)out_buf;
359
360 cleanup:
361 free(out_buf);
362 return NULL;
363 }
364
365
366 #ifndef NO_THREADS
367 /*
368 * Thread local storage per thread in the pool.
369 * This avoids needing to memset/calloc F and syms in the encoder,
370 * which can be speed things this encoder up a little.
371 */
372 static pthread_once_t rans_enc_once = PTHREAD_ONCE_INIT;
373 static pthread_key_t rans_enc_key;
374
375 typedef struct {
376 RansEncSymbol (*syms)[256];
377 int (*F)[256];
378 } thread_enc_data;
379
rans_enc_free(void * vp)380 static void rans_enc_free(void *vp) {
381 thread_enc_data *te = (thread_enc_data *)vp;
382 if (!te)
383 return;
384 free(te->F);
385 free(te->syms);
386 free(te);
387 }
388
rans_enc_alloc(void)389 static thread_enc_data *rans_enc_alloc(void) {
390 thread_enc_data *te = malloc(sizeof(*te));
391 if (!te)
392 return NULL;
393 te->F = calloc(256, sizeof(*te->F));
394 te->syms = calloc(256, sizeof(*te->syms));
395 if (!te->F || !te->syms) {
396 rans_enc_free(te);
397 return NULL;
398 }
399
400 return te;
401 }
402
rans_tls_enc_init(void)403 static void rans_tls_enc_init(void) {
404 pthread_key_create(&rans_enc_key, rans_enc_free);
405 }
406 #endif
407
408 static
rans_compress_O1(unsigned char * in,unsigned int in_size,unsigned int * out_size)409 unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size,
410 unsigned int *out_size) {
411 unsigned char *out_buf = NULL, *out_end, *cp;
412 unsigned int tab_size, rle_i, rle_j;
413
414
415 #ifndef NO_THREADS
416 pthread_once(&rans_enc_once, rans_tls_enc_init);
417 thread_enc_data *te = pthread_getspecific(rans_enc_key);
418 if (!te) {
419 if (!(te = rans_enc_alloc()))
420 return NULL;
421 pthread_setspecific(rans_enc_key, te);
422 }
423 RansEncSymbol (*syms)[256] = te->syms;
424 int (*F)[256] = te->F;
425 memset(F, 0, 256*sizeof(*F));
426 #else
427 #ifdef USE_HEAP
428 RansEncSymbol (*syms)[256] = malloc(256 * sizeof(*syms));
429 int (*F)[256] = calloc(256, sizeof(*F));
430 #else
431 RansEncSymbol syms[256][256];
432 int F[256][256] = {{0}};
433 #endif
434 #endif
435 int T[256+MAGIC] = {0};
436 int i, j;
437
438 if (in_size < 4)
439 return rans_compress_O0(in, in_size, out_size);
440
441 #ifdef USE_HEAP
442 if (!syms) goto cleanup;
443 if (!F) goto cleanup;
444 #endif
445
446 out_buf = malloc(1.05*in_size + 257*257*3 + 9);
447 if (!out_buf) goto cleanup;
448
449 out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9;
450 cp = out_buf+9;
451
452 hist1_4(in, in_size, (uint32_t (*)[256])F, (uint32_t *)T);
453
454 F[0][in[1*(in_size>>2)]]++;
455 F[0][in[2*(in_size>>2)]]++;
456 F[0][in[3*(in_size>>2)]]++;
457 T[0]+=3;
458
459
460 // Normalise so T[i] == TOTFREQ
461 for (rle_i = i = 0; i < 256; i++) {
462 int t2, m, M;
463 unsigned int x;
464
465 if (T[i] == 0)
466 continue;
467
468 //uint64_t p = (TOTFREQ * TOTFREQ) / t;
469 double p = ((double)TOTFREQ)/T[i];
470 normalise_harder:
471 for (t2 = m = M = j = 0; j < 256; j++) {
472 if (!F[i][j])
473 continue;
474
475 if (m < F[i][j])
476 m = F[i][j], M = j;
477
478 //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0)
479 if ((F[i][j] *= p) == 0)
480 F[i][j] = 1;
481 t2 += F[i][j];
482 }
483
484 t2++;
485 if (t2 < TOTFREQ) {
486 F[i][M] += TOTFREQ-t2;
487 } else if (t2-TOTFREQ >= F[i][M]/2) {
488 // Corner case to avoid excessive frequency reduction
489 p = .98; goto normalise_harder;
490 } else {
491 F[i][M] -= t2-TOTFREQ;
492 }
493
494 // Store frequency table
495 // i
496 if (rle_i) {
497 rle_i--;
498 } else {
499 *cp++ = i;
500 // FIXME: could use order-0 statistics to observe which alphabet
501 // symbols are present and base RLE on that ordering instead.
502 if (i && T[i-1]) {
503 for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++)
504 ;
505 rle_i -= i+1;
506 *cp++ = rle_i;
507 }
508 }
509
510 int *F_i_ = F[i];
511 x = 0;
512 rle_j = 0;
513 for (j = 0; j < 256; j++) {
514 if (F_i_[j]) {
515 //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x);
516
517 // j
518 if (rle_j) {
519 rle_j--;
520 } else {
521 *cp++ = j;
522 if (!rle_j && j && F_i_[j-1]) {
523 for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++)
524 ;
525 rle_j -= j+1;
526 *cp++ = rle_j;
527 }
528 }
529
530 // F_i_[j]
531 if (F_i_[j]<128) {
532 *cp++ = F_i_[j];
533 } else {
534 *cp++ = 128 | (F_i_[j]>>8);
535 *cp++ = F_i_[j]&0xff;
536 }
537
538 RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT);
539 x += F_i_[j];
540 }
541 }
542 *cp++ = 0;
543 }
544 *cp++ = 0;
545
546 //write(2, out_buf+4, cp-(out_buf+4));
547 tab_size = cp - out_buf;
548 assert(tab_size < 257*257*3);
549
550 RansState rans0, rans1, rans2, rans3;
551 RansEncInit(&rans0);
552 RansEncInit(&rans1);
553 RansEncInit(&rans2);
554 RansEncInit(&rans3);
555
556 uint8_t* ptr = out_end;
557
558 int isz4 = in_size>>2;
559 int i0 = 1*isz4-2;
560 int i1 = 2*isz4-2;
561 int i2 = 3*isz4-2;
562 int i3 = 4*isz4-2;
563
564 unsigned char l0 = in[i0+1];
565 unsigned char l1 = in[i1+1];
566 unsigned char l2 = in[i2+1];
567 unsigned char l3 = in[i3+1];
568
569 // Deal with the remainder
570 l3 = in[in_size-1];
571 for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
572 unsigned char c3 = in[i3];
573 RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
574 l3 = c3;
575 }
576
577 for (; i0 >= 0; i0--, i1--, i2--, i3--) {
578 unsigned char c3 = in[i3];
579 unsigned char c2 = in[i2];
580 unsigned char c1 = in[i1];
581 unsigned char c0 = in[i0];
582
583 RansEncSymbol *s3 = &syms[c3][l3];
584 RansEncSymbol *s2 = &syms[c2][l2];
585 RansEncSymbol *s1 = &syms[c1][l1];
586 RansEncSymbol *s0 = &syms[c0][l0];
587
588 RansEncPutSymbol4(&rans3, &rans2, &rans1, &rans0, &ptr,
589 s3, s2, s1, s0);
590
591 l3 = c3;
592 l2 = c2;
593 l1 = c1;
594 l0 = c0;
595 }
596
597 RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
598 RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
599 RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
600 RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
601
602 RansEncFlush(&rans3, &ptr);
603 RansEncFlush(&rans2, &ptr);
604 RansEncFlush(&rans1, &ptr);
605 RansEncFlush(&rans0, &ptr);
606
607 *out_size = (out_end - ptr) + tab_size;
608
609 cp = out_buf;
610 *cp++ = 1; // order
611
612 *cp++ = ((*out_size-9)>> 0) & 0xff;
613 *cp++ = ((*out_size-9)>> 8) & 0xff;
614 *cp++ = ((*out_size-9)>>16) & 0xff;
615 *cp++ = ((*out_size-9)>>24) & 0xff;
616
617 *cp++ = (in_size>> 0) & 0xff;
618 *cp++ = (in_size>> 8) & 0xff;
619 *cp++ = (in_size>>16) & 0xff;
620 *cp++ = (in_size>>24) & 0xff;
621
622 memmove(out_buf + tab_size, ptr, out_end-ptr);
623
624 cleanup:
625 #ifdef USE_HEAP
626 free(syms);
627 free(F);
628 #endif
629
630 return out_buf;
631 }
632
633 #ifndef NO_THREADS
634 /*
635 * Thread local storage per thread in the pool.
636 * This avoids needing to memset/calloc D and syms in the decoder,
637 * which can be speed things this decoder up a little (~10%).
638 */
639 static pthread_once_t rans_once = PTHREAD_ONCE_INIT;
640 static pthread_key_t rans_key;
641
642 typedef struct {
643 ari_decoder *D;
644 RansDecSymbol32 (*syms)[256];
645 } thread_data;
646
rans_tb_free(void * vp)647 static void rans_tb_free(void *vp) {
648 thread_data *tb = (thread_data *)vp;
649 if (!tb)
650 return;
651 free(tb->D);
652 free(tb->syms);
653 free(tb);
654 }
655
rans_tb_alloc(void)656 static thread_data *rans_tb_alloc(void) {
657 thread_data *tb = malloc(sizeof(*tb));
658 if (!tb)
659 return NULL;
660 tb->D = calloc(256, sizeof(*tb->D));
661 tb->syms = calloc(256, sizeof(*tb->syms));
662 if (!tb->D || !tb->syms) {
663 rans_tb_free(tb);
664 return NULL;
665 }
666
667 return tb;
668 }
669
rans_tls_init(void)670 static void rans_tls_init(void) {
671 pthread_key_create(&rans_key, rans_tb_free);
672 }
673 #endif
674
675 static
rans_uncompress_O1(unsigned char * in,unsigned int in_size,unsigned int * out_size)676 unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size,
677 unsigned int *out_size) {
678 /* Load in the static tables */
679 unsigned char *cp = in + 9;
680 unsigned char *ptr_end = in + in_size;
681 int i, j = -999, rle_i, rle_j;
682 unsigned int x;
683 unsigned int out_sz, in_sz;
684 char *out_buf = NULL;
685 // D[] is 1Mb and syms[][] is 0.5Mb.
686 #ifndef NO_THREADS
687 pthread_once(&rans_once, rans_tls_init);
688 thread_data *tb = pthread_getspecific(rans_key);
689 if (!tb) {
690 if (!(tb = rans_tb_alloc()))
691 return NULL;
692 pthread_setspecific(rans_key, tb);
693 }
694 ari_decoder *const D = tb->D;
695 RansDecSymbol32 (*const syms)[256] = tb->syms;
696 #else
697 #ifdef USE_HEAP
698 //ari_decoder *const D = malloc(256 * sizeof(*D));
699 ari_decoder *const D = calloc(256, sizeof(*D));
700 RansDecSymbol32 (*const syms)[256] = malloc(256 * sizeof(*syms));
701 for (i = 1; i < 256; i++) memset(&syms[i][0], 0, sizeof(syms[0][0]));
702 #else
703 ari_decoder D[256] = {{{0}}}; //256*4k => 1.0Mb
704 RansDecSymbol32 syms[256][256+6] = {{{0}}}; //256*262*8 => 0.5Mb
705 #endif
706 #endif
707 int16_t map[256], map_i = 0;
708
709 memset(map, -1, 256*sizeof(*map));
710
711 if (in_size < 27) // Need at least this many bytes to start
712 return NULL;
713
714 if (*in++ != 1) // Order-1 check
715 return NULL;
716
717 in_sz = ((in[0])<<0) | ((in[1])<<8) | ((in[2])<<16) | (((uint32_t)in[3])<<24);
718 out_sz = ((in[4])<<0) | ((in[5])<<8) | ((in[6])<<16) | (((uint32_t)in[7])<<24);
719 if (in_sz != in_size-9)
720 return NULL;
721
722 if (out_sz >= INT_MAX)
723 return NULL; // protect against some overflow cases
724
725 // For speeding up the fuzzer only.
726 // Small input can lead to large uncompressed data.
727 // We reject this as it just slows things up instead of testing more code
728 // paths (once we've verified a few times for large data).
729 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
730 if (out_sz > 100000)
731 return NULL;
732 #endif
733
734 #if defined(USE_HEAP)
735 if (!D || !syms) goto cleanup;
736 /* These memsets prevent illegal memory access in syms due to
737 broken compressed data. As D is calloc'd, all illegal transitions
738 will end up in either row or column 0 of syms. */
739 memset(&syms[0], 0, sizeof(syms[0]));
740 for (i = 0; i < 256; i++)
741 memset(&syms[i][0], 0, sizeof(syms[0][0]));
742 #endif
743
744 //fprintf(stderr, "out_sz=%d\n", out_sz);
745
746 //i = *cp++;
747 rle_i = 0;
748 i = *cp++;
749 do {
750 // Map arbitrary a,b,c to 0,1,2 to improve cache locality.
751 if (map[i] == -1)
752 map[i] = map_i++;
753 int m_i = map[i];
754
755 rle_j = x = 0;
756 j = *cp++;
757 do {
758 if (map[j] == -1)
759 map[j] = map_i++;
760
761 int F, C;
762 if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
763 if ((F = *cp++) >= 128) {
764 F &= ~128;
765 F = ((F & 127) << 8) | *cp++;
766 }
767 C = x;
768
769 //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C);
770
771 if (!F)
772 F = TOTFREQ;
773
774 RansDecSymbolInit32(&syms[m_i][j], C, F);
775
776 /* Build reverse lookup table */
777 //if (!D[i].R) D[i].R = (unsigned char *)malloc(TOTFREQ);
778 if (x + F > TOTFREQ)
779 goto cleanup;
780
781 memset(&D[m_i].R[x], j, F);
782 x += F;
783
784 if (!rle_j && j+1 == *cp) {
785 j = *cp++;
786 rle_j = *cp++;
787 } else if (rle_j) {
788 rle_j--;
789 j++;
790 if (j > 255)
791 goto cleanup;
792 } else {
793 j = *cp++;
794 }
795 } while(j);
796
797 if (x < TOTFREQ-1 || x > TOTFREQ)
798 goto cleanup;
799 if (x < TOTFREQ) // historically we fill 4095, not 4096
800 D[i].R[x] = D[i].R[x-1];
801
802 if (!rle_i && i+1 == *cp) {
803 i = *cp++;
804 rle_i = *cp++;
805 } else if (rle_i) {
806 rle_i--;
807 i++;
808 if (i > 255)
809 goto cleanup;
810 } else {
811 i = *cp++;
812 }
813 } while (i);
814 for (i = 0; i < 256; i++)
815 if (map[i] == -1)
816 map[i] = 0;
817
818 RansState rans0, rans1, rans2, rans3;
819 uint8_t *ptr = cp;
820 if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left
821 RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) return NULL;
822 RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) return NULL;
823 RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) return NULL;
824 RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) return NULL;
825
826 RansState R[4];
827 R[0] = rans0;
828 R[1] = rans1;
829 R[2] = rans2;
830 R[3] = rans3;
831
832 unsigned int isz4 = out_sz>>2;
833 uint32_t l0 = 0;
834 uint32_t l1 = 0;
835 uint32_t l2 = 0;
836 uint32_t l3 = 0;
837
838 unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
839
840 /* Allocate output buffer */
841 out_buf = malloc(out_sz);
842 if (!out_buf) goto cleanup;
843
844 uint8_t cc0 = D[map[l0]].R[R[0] & ((1u << TF_SHIFT)-1)];
845 uint8_t cc1 = D[map[l1]].R[R[1] & ((1u << TF_SHIFT)-1)];
846 uint8_t cc2 = D[map[l2]].R[R[2] & ((1u << TF_SHIFT)-1)];
847 uint8_t cc3 = D[map[l3]].R[R[3] & ((1u << TF_SHIFT)-1)];
848
849 ptr_end -= 8;
850 for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
851 out_buf[i4[0]] = cc0;
852 out_buf[i4[1]] = cc1;
853 out_buf[i4[2]] = cc2;
854 out_buf[i4[3]] = cc3;
855
856 //RansDecAdvanceStep(&R[0], syms[l0][cc0].start, syms[l0][cc0].freq, TF_SHIFT);
857 //RansDecAdvanceStep(&R[1], syms[l1][cc1].start, syms[l1][cc1].freq, TF_SHIFT);
858 //RansDecAdvanceStep(&R[2], syms[l2][cc2].start, syms[l2][cc2].freq, TF_vSHIFT);
859 //RansDecAdvanceStep(&R[3], syms[l3][cc3].start, syms[l3][cc3].freq, TF_SHIFT);
860
861 {
862 uint32_t m[4];
863
864 // Ordering to try and improve OoO cpu instructions
865 m[0] = R[0] & ((1u << TF_SHIFT)-1);
866 R[0] = syms[l0][cc0].freq * (R[0]>>TF_SHIFT);
867 m[1] = R[1] & ((1u << TF_SHIFT)-1);
868 R[0] += m[0] - syms[l0][cc0].start;
869 R[1] = syms[l1][cc1].freq * (R[1]>>TF_SHIFT);
870 m[2] = R[2] & ((1u << TF_SHIFT)-1);
871 R[1] += m[1] - syms[l1][cc1].start;
872 R[2] = syms[l2][cc2].freq * (R[2]>>TF_SHIFT);
873 m[3] = R[3] & ((1u << TF_SHIFT)-1);
874 R[3] = syms[l3][cc3].freq * (R[3]>>TF_SHIFT);
875 R[2] += m[2] - syms[l2][cc2].start;
876 R[3] += m[3] - syms[l3][cc3].start;
877 }
878
879 l0 = map[cc0];
880 l1 = map[cc1];
881 l2 = map[cc2];
882 l3 = map[cc3];
883
884 if (ptr < ptr_end) {
885 RansDecRenorm2(&R[0], &R[1], &ptr);
886 RansDecRenorm2(&R[2], &R[3], &ptr);
887 } else {
888 RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
889 RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
890 RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
891 RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
892 }
893
894 cc0 = D[l0].R[R[0] & ((1u << TF_SHIFT)-1)];
895 cc1 = D[l1].R[R[1] & ((1u << TF_SHIFT)-1)];
896 cc2 = D[l2].R[R[2] & ((1u << TF_SHIFT)-1)];
897 cc3 = D[l3].R[R[3] & ((1u << TF_SHIFT)-1)];
898 }
899
900 // Remainder
901 for (; i4[3] < out_sz; i4[3]++) {
902 unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)];
903 out_buf[i4[3]] = c3;
904
905 uint32_t m = R[3] & ((1u << TF_SHIFT)-1);
906 R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start;
907 RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
908 l3 = map[c3];
909 }
910
911 *out_size = out_sz;
912
913 cleanup:
914 #if defined(USE_HEAP)
915 if (D)
916 free(D);
917
918 free(syms);
919 #endif
920
921 return (unsigned char *)out_buf;
922 }
923
924 /*-----------------------------------------------------------------------------
925 * Simple interface to the order-0 vs order-1 encoders and decoders.
926 */
rans_compress(unsigned char * in,unsigned int in_size,unsigned int * out_size,int order)927 unsigned char *rans_compress(unsigned char *in, unsigned int in_size,
928 unsigned int *out_size, int order) {
929 return order
930 ? rans_compress_O1(in, in_size, out_size)
931 : rans_compress_O0(in, in_size, out_size);
932 }
933
rans_uncompress(unsigned char * in,unsigned int in_size,unsigned int * out_size)934 unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size,
935 unsigned int *out_size) {
936 /* Both rans_uncompress functions need to be able to read at least 9
937 bytes. */
938 if (in_size < 9)
939 return NULL;
940 return in[0]
941 ? rans_uncompress_O1(in, in_size, out_size)
942 : rans_uncompress_O0(in, in_size, out_size);
943 }
944