1 /*
2 * Copyright (c) 2017-2020 Genome Research Ltd.
3 * Author(s): James Bonfield
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 // As per standard rANS_static but using optional RLE or bit-packing
35 // techniques prior to entropy encoding. This is a significant
36 // reduction in some data sets.
37
38 // top bits in order byte
39 #define X_PACK 0x80 // Pack 2,4,8 or infinite symbols into a byte.
40 #define X_RLE 0x40 // Run length encoding with runs & lits encoded separately
41 #define X_CAT 0x20 // Nop; for tiny segments where rANS overhead is too big
42 #define X_NOSZ 0x10 // Don't store the original size; used by STRIPE mode
43 #define X_STRIPE 0x08 // For N-byte integer data; rotate & encode N streams.
44
45 // FIXME Can we get decoder to return the compressed sized read, avoiding
46 // us needing to store it? Yes we can. See c-size comments. If we added all these
47 // together we could get rans_uncompress_to_4x16 to return the number of bytes
48 // consumed, avoiding the calling code from needed to explicitly stored the size.
49 // However the effect on name tokeniser is to save 0.1 to 0.2% so not worth it.
50
51 /*-------------------------------------------------------------------------- */
52 /*
53 * Example wrapper to use the rans_byte.h functions included above.
54 *
55 * This demonstrates how to use, and unroll, an order-0 and order-1 frequency
56 * model.
57 */
58
59 #include "config.h"
60
61 #include <stdint.h>
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <unistd.h>
65 #include <assert.h>
66 #include <string.h>
67 #include <sys/time.h>
68 #include <limits.h>
69 #include <math.h>
70 #ifndef NO_THREADS
71 #include <pthread.h>
72 #endif
73
74 #include "rANS_word.h"
75 #include "rANS_static4x16.h"
76 #include "varint.h"
77 #include "pack.h"
78 #include "rle.h"
79 #include "utils.h"
80
81 #define TF_SHIFT 12
82 #define TOTFREQ (1<<TF_SHIFT)
83
84 // 9-11 is considerably faster in the O1 variant due to reduced table size.
85 // We auto-tune between 10 and 12 though. Anywhere from 9 to 14 are viable.
86 #ifndef TF_SHIFT_O1
87 #define TF_SHIFT_O1 12
88 #endif
89 #ifndef TF_SHIFT_O1_FAST
90 #define TF_SHIFT_O1_FAST 10
91 #endif
92 #define TOTFREQ_O1 (1<<TF_SHIFT_O1)
93 #define TOTFREQ_O1_FAST (1<<TF_SHIFT_O1_FAST)
94
95
96 /*-----------------------------------------------------------------------------
97 * Memory to memory compression functions.
98 *
99 * These are original versions without any manual loop unrolling. They
100 * are easier to understand, but can be up to 2x slower.
101 */
102
103 // Rounds to next power of 2.
104 // credit to http://graphics.stanford.edu/~seander/bithacks.html
round2(uint32_t v)105 static uint32_t round2(uint32_t v) {
106 v--;
107 v |= v >> 1;
108 v |= v >> 2;
109 v |= v >> 4;
110 v |= v >> 8;
111 v |= v >> 16;
112 v++;
113 return v;
114 }
115
normalise_freq(uint32_t * F,int size,uint32_t tot)116 static int normalise_freq(uint32_t *F, int size, uint32_t tot) {
117 int m, M, j, loop = 0;
118 uint64_t tr;
119 if (!size)
120 return 0;
121
122 again:
123 tr = ((uint64_t)tot<<31)/size + (1<<30)/size;
124
125 for (size = m = M = j = 0; j < 256; j++) {
126 if (!F[j])
127 continue;
128
129 if (m < F[j])
130 m = F[j], M = j;
131
132 if ((F[j] = (F[j]*tr)>>31) == 0)
133 F[j] = 1;
134 size += F[j];
135 }
136
137 int adjust = tot - size;
138 if (adjust > 0) {
139 F[M] += adjust;
140 } else if (adjust < 0) {
141 if (F[M] > -adjust && (loop == 1 || F[M]/2 >= -adjust)) {
142 F[M] += adjust;
143 } else {
144 if (loop < 1) {
145 loop++;
146 goto again;
147 }
148 adjust += F[M]-1;
149 F[M] = 1;
150 for (j = 0; adjust && j < 256; j++) {
151 if (F[j] < 2) continue;
152
153 int d = F[j] > -adjust;
154 int m = d ? adjust : 1-F[j];
155 F[j] += m;
156 adjust -= m;
157 }
158 }
159 }
160
161 //printf("F[%d]=%d\n", M, F[M]);
162 return F[M]>0 ? 0 : -1;
163 }
164
165 // A specialised version of normalise_freq_shift where the input size
166 // is already normalised to a power of 2, meaning we can just perform
167 // shifts instead of hard to define multiplications and adjustments.
normalise_freq_shift(uint32_t * F,uint32_t size,uint32_t max_tot)168 static void normalise_freq_shift(uint32_t *F, uint32_t size,
169 uint32_t max_tot) {
170 if (size == 0 || size == max_tot)
171 return;
172
173 int shift = 0, i;
174 while (size < max_tot)
175 size*=2, shift++;
176
177 for (i = 0; i < 256; i++)
178 F[i] <<= shift;
179 }
180
181 // symbols only
encode_alphabet(uint8_t * cp,uint32_t * F)182 static int encode_alphabet(uint8_t *cp, uint32_t *F) {
183 uint8_t *op = cp;
184 int rle, j;
185
186 for (rle = j = 0; j < 256; j++) {
187 if (F[j]) {
188 // j
189 if (rle) {
190 rle--;
191 } else {
192 *cp++ = j;
193 if (!rle && j && F[j-1]) {
194 for(rle=j+1; rle<256 && F[rle]; rle++)
195 ;
196 rle -= j+1;
197 *cp++ = rle;
198 }
199 //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]);
200 }
201 }
202 }
203 *cp++ = 0;
204
205 return cp - op;
206 }
207
decode_alphabet(uint8_t * cp,uint8_t * cp_end,uint32_t * F)208 static int decode_alphabet(uint8_t *cp, uint8_t *cp_end, uint32_t *F) {
209 if (cp == cp_end)
210 return 0;
211
212 uint8_t *op = cp;
213 int rle = 0;
214 int j = *cp++;
215 if (cp+2 >= cp_end)
216 goto carefully;
217
218 do {
219 F[j] = 1;
220 if (!rle && j+1 == *cp) {
221 j = *cp++;
222 rle = *cp++;
223 } else if (rle) {
224 rle--;
225 j++;
226 if (j > 255)
227 return 0;
228 } else {
229 j = *cp++;
230 }
231 } while(j && cp+2 < cp_end);
232
233 carefully:
234 if (j) {
235 do {
236 F[j] = 1;
237 if(cp >= cp_end) return 0;
238 if (!rle && j+1 == *cp) {
239 if (cp+1 >= cp_end) return 0;
240 j = *cp++;
241 rle = *cp++;
242 } else if (rle) {
243 rle--;
244 j++;
245 if (j > 255)
246 return 0;
247 } else {
248 if (cp >= cp_end) return 0;
249 j = *cp++;
250 }
251 } while(j && cp < cp_end);
252 }
253
254 return cp - op;
255 }
256
encode_freq(uint8_t * cp,uint32_t * F)257 static int encode_freq(uint8_t *cp, uint32_t *F) {
258 uint8_t *op = cp;
259 int j;
260
261 cp += encode_alphabet(cp, F);
262
263 for (j = 0; j < 256; j++) {
264 if (F[j])
265 cp += var_put_u32(cp, NULL, F[j]);
266 }
267
268 return cp - op;
269 }
270
decode_freq(uint8_t * cp,uint8_t * cp_end,uint32_t * F,uint32_t * fsum)271 static int decode_freq(uint8_t *cp, uint8_t *cp_end, uint32_t *F,
272 uint32_t *fsum) {
273 if (cp == cp_end)
274 return 0;
275
276 uint8_t *op = cp;
277 cp += decode_alphabet(cp, cp_end, F);
278
279 int j, tot = 0;
280 for (j = 0; j < 256; j++) {
281 if (F[j]) {
282 cp += var_get_u32(cp, cp_end, (unsigned int *)&F[j]);
283 tot += F[j];
284 }
285 }
286
287 *fsum = tot;
288 return cp - op;
289 }
290
291
292 // Use the order-0 freqs in F0 to encode the order-1 stats in F.
293 // All symbols present in F are present in F0, but some in F0 will
294 // be empty in F. Thus we run-length encode the 0 frequencies.
encode_freq_d(uint8_t * cp,uint32_t * F0,uint32_t * F)295 static int encode_freq_d(uint8_t *cp, uint32_t *F0, uint32_t *F) {
296 uint8_t *op = cp;
297 int j, dz;
298
299 for (dz = j = 0; j < 256; j++) {
300 if (F0[j]) {
301 if (F[j] != 0) {
302 if (dz) {
303 // Replace dz zeros with zero + dz-1 run length
304 cp -= dz-1;
305 *cp++ = dz-1;
306 }
307 dz = 0;
308 cp += var_put_u32(cp, NULL, F[j]);
309 } else {
310 //fprintf(stderr, "2: j=%d F0[j]=%d, F[j]=%d, dz=%d\n", j, F0[j], F[j], dz);
311 dz++;
312 *cp++ = 0;
313 }
314 } else {
315 assert(F[j] == 0);
316 }
317 }
318
319 if (dz) {
320 cp -= dz-1;
321 *cp++ = dz-1;
322 }
323
324 return cp - op;
325 }
326
decode_freq_d(uint8_t * cp,uint8_t * cp_end,uint32_t * F0,uint32_t * F,uint32_t * total)327 static int decode_freq_d(uint8_t *cp, uint8_t *cp_end, uint32_t *F0,
328 uint32_t *F, uint32_t *total) {
329 if (cp == cp_end)
330 return 0;
331
332 uint8_t *op = cp;
333 int j, dz, T = 0;
334
335 for (j = dz = 0; j < 256 && cp < cp_end; j++) {
336 //if (F0[j]) fprintf(stderr, "F0[%d]=%d\n", j, F0[j]);
337 if (!F0[j])
338 continue;
339
340 uint32_t f;
341 if (dz) {
342 f = 0;
343 dz--;
344 } else {
345 if (cp >= cp_end) return 0;
346 cp += var_get_u32(cp, cp_end, &f);
347 if (f == 0) {
348 if (cp >= cp_end) return 0;
349 dz = *cp++;
350 }
351 }
352 F[j] = f;
353 T += f;
354 }
355
356 if (total) *total = T;
357 return cp - op;
358 }
359
rans_compress_bound_4x16(unsigned int size,int order)360 unsigned int rans_compress_bound_4x16(unsigned int size, int order) {
361 int N = order>>8;
362 if (!N) N=4;
363
364 order &= 0xff;
365 int sz = (order == 0
366 ? 1.05*size + 257*3 + 4
367 : 1.05*size + 257*257*3 + 4 + 257*3+4) +
368 ((order & X_PACK) ? 1 : 0) +
369 ((order & X_RLE) ? 1 + 257*3+4: 0) + 20 +
370 ((order & X_STRIPE) ? 1 + 5*N: 0);
371 return sz + (sz&1) + 2; // make this even so buffers are word aligned
372 }
373
374 // Compresses in_size bytes from 'in' to *out_size bytes in 'out'.
375 //
376 // NB: The output buffer does not hold the original size, so it is up to
377 // the caller to store this.
378 static
rans_compress_O0_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)379 unsigned char *rans_compress_O0_4x16(unsigned char *in, unsigned int in_size,
380 unsigned char *out, unsigned int *out_size) {
381 unsigned char *cp, *out_end;
382 RansEncSymbol syms[256];
383 RansState rans0;
384 RansState rans2;
385 RansState rans1;
386 RansState rans3;
387 uint8_t* ptr;
388 uint32_t F[256+MAGIC] = {0};
389 int i, j, tab_size = 0, rle, x;
390 int bound = rans_compress_bound_4x16(in_size,0)-20; // -20 for order/size/meta
391
392 if (!out) {
393 *out_size = bound;
394 out = malloc(*out_size);
395 }
396 if (!out || bound > *out_size)
397 return NULL;
398
399 // If "out" isn't word aligned, tweak out_end/ptr to ensure it is.
400 // We already added more round in bound to allow for this.
401 if (((size_t)out)&1)
402 bound--;
403 ptr = out_end = out + bound;
404
405 if (in_size == 0)
406 goto empty;
407
408 // Compute statistics
409 hist8(in, in_size, F);
410
411 // Normalise so frequences sum to power of 2
412 uint32_t fsum = in_size;
413 uint32_t max_val = round2(fsum);
414 if (max_val > TOTFREQ)
415 max_val = TOTFREQ;
416
417 if (normalise_freq(F, fsum, max_val) < 0)
418 return NULL;
419 fsum=max_val;
420
421 cp = out;
422 cp += encode_freq(cp, F);
423 tab_size = cp-out;
424 //write(2, out+4, cp-(out+4));
425
426 if (normalise_freq(F, fsum, TOTFREQ) < 0)
427 return NULL;
428
429 // Encode statistics.
430 for (x = rle = j = 0; j < 256; j++) {
431 if (F[j]) {
432 RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT);
433 x += F[j];
434 }
435 }
436
437 RansEncInit(&rans0);
438 RansEncInit(&rans1);
439 RansEncInit(&rans2);
440 RansEncInit(&rans3);
441
442 switch (i=(in_size&3)) {
443 case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]);
444 case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]);
445 case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]);
446 case 0:
447 break;
448 }
449 for (i=(in_size &~3); i>0; i-=4) {
450 RansEncSymbol *s3 = &syms[in[i-1]];
451 RansEncSymbol *s2 = &syms[in[i-2]];
452 RansEncSymbol *s1 = &syms[in[i-3]];
453 RansEncSymbol *s0 = &syms[in[i-4]];
454
455 #if 1
456 RansEncPutSymbol(&rans3, &ptr, s3);
457 RansEncPutSymbol(&rans2, &ptr, s2);
458 RansEncPutSymbol(&rans1, &ptr, s1);
459 RansEncPutSymbol(&rans0, &ptr, s0);
460 #else
461 // Slightly beter on gcc, much better on clang
462 uint16_t *ptr16 = (uint16_t *)ptr;
463
464 if (rans3 >= s3->x_max) *--ptr16 = (uint16_t)rans3, rans3 >>= 16;
465 if (rans2 >= s2->x_max) *--ptr16 = (uint16_t)rans2, rans2 >>= 16;
466 uint32_t q3 = (uint32_t) (((uint64_t)rans3 * s3->rcp_freq) >> s3->rcp_shift);
467 uint32_t q2 = (uint32_t) (((uint64_t)rans2 * s2->rcp_freq) >> s2->rcp_shift);
468 rans3 += s3->bias + q3 * s3->cmpl_freq;
469 rans2 += s2->bias + q2 * s2->cmpl_freq;
470
471 if (rans1 >= s1->x_max) *--ptr16 = (uint16_t)rans1, rans1 >>= 16;
472 if (rans0 >= s0->x_max) *--ptr16 = (uint16_t)rans0, rans0 >>= 16;
473 uint32_t q1 = (uint32_t) (((uint64_t)rans1 * s1->rcp_freq) >> s1->rcp_shift);
474 uint32_t q0 = (uint32_t) (((uint64_t)rans0 * s0->rcp_freq) >> s0->rcp_shift);
475 rans1 += s1->bias + q1 * s1->cmpl_freq;
476 rans0 += s0->bias + q0 * s0->cmpl_freq;
477
478 ptr = (uint8_t *)ptr16;
479 #endif
480 }
481
482 RansEncFlush(&rans3, &ptr);
483 RansEncFlush(&rans2, &ptr);
484 RansEncFlush(&rans1, &ptr);
485 RansEncFlush(&rans0, &ptr);
486
487 empty:
488 // Finalise block size and return it
489 *out_size = (out_end - ptr) + tab_size;
490
491 memmove(out + tab_size, ptr, out_end-ptr);
492
493 return out;
494 }
495
496 typedef struct {
497 unsigned char R[TOTFREQ];
498 } ari_decoder;
499
500 static
rans_uncompress_O0_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int out_sz)501 unsigned char *rans_uncompress_O0_4x16(unsigned char *in, unsigned int in_size,
502 unsigned char *out, unsigned int out_sz) {
503 if (in_size < 16) // 4-states at least
504 return NULL;
505
506 if (out_sz >= INT_MAX)
507 return NULL; // protect against some overflow cases
508
509 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
510 if (out_sz > 100000)
511 return NULL;
512 #endif
513
514 /* Load in the static tables */
515 unsigned char *cp = in, *out_free = NULL;
516 unsigned char *cp_end = in + in_size - 8; // within 8 => be extra safe
517 int i, j;
518 unsigned int x, y;
519 uint16_t sfreq[TOTFREQ+32];
520 uint16_t sbase[TOTFREQ+32]; // faster to use 32-bit on clang
521 uint8_t ssym [TOTFREQ+64]; // faster to use 16-bit on clang
522
523 if (!out)
524 out_free = out = malloc(out_sz);
525 if (!out)
526 return NULL;
527
528 // Precompute reverse lookup of frequency.
529 uint32_t F[256] = {0}, fsum;
530 int fsz = decode_freq(cp, cp_end, F, &fsum);
531 if (!fsz)
532 goto err;
533 cp += fsz;
534
535 normalise_freq_shift(F, fsum, TOTFREQ);
536
537 // Build symbols; fixme, do as part of decode, see the _d variant
538 for (j = x = 0; j < 256; j++) {
539 if (F[j]) {
540 if (F[j] > TOTFREQ - x)
541 goto err;
542 for (y = 0; y < F[j]; y++) {
543 ssym [y + x] = j;
544 sfreq[y + x] = F[j];
545 sbase[y + x] = y;
546 }
547 x += F[j];
548 }
549 }
550
551 if (x != TOTFREQ)
552 goto err;
553
554 if (cp+16 > cp_end+8)
555 goto err;
556
557 RansState R[4];
558 RansDecInit(&R[0], &cp); if (R[0] < RANS_BYTE_L) goto err;
559 RansDecInit(&R[1], &cp); if (R[1] < RANS_BYTE_L) goto err;
560 RansDecInit(&R[2], &cp); if (R[2] < RANS_BYTE_L) goto err;
561 RansDecInit(&R[3], &cp); if (R[3] < RANS_BYTE_L) goto err;
562
563 // Simple version is comparable to below, but only with -O3
564 //
565 // for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
566 // for(j=0; j<8;j++) {
567 // RansState m = RansDecGet(&R[j%4], TF_SHIFT);
568 // R[j%4] = sfreq[m] * (R[j%4] >> TF_SHIFT) + sbase[m];
569 // out[i+j] = ssym[m];
570 // RansDecRenorm(&R[j%4], &cp);
571 // }
572 // }
573
574 for (i = 0; cp < cp_end-8 && i < (out_sz&~7); i+=8) {
575 for (j = 0; j < 8; j+=4) {
576 RansState m0 = RansDecGet(&R[0], TF_SHIFT);
577 RansState m1 = RansDecGet(&R[1], TF_SHIFT);
578 R[0] = sfreq[m0] * (R[0] >> TF_SHIFT) + sbase[m0];
579 R[1] = sfreq[m1] * (R[1] >> TF_SHIFT) + sbase[m1];
580
581 RansDecRenorm(&R[0], &cp);
582 RansDecRenorm(&R[1], &cp);
583
584 out[i+j+0] = ssym[m0];
585 out[i+j+1] = ssym[m1];
586
587 RansState m3 = RansDecGet(&R[2], TF_SHIFT);
588 RansState m4 = RansDecGet(&R[3], TF_SHIFT);
589
590 R[2] = sfreq[m3] * (R[2] >> TF_SHIFT) + sbase[m3];
591 R[3] = sfreq[m4] * (R[3] >> TF_SHIFT) + sbase[m4];
592
593 out[i+j+2] = ssym[m3];
594 out[i+j+3] = ssym[m4];
595
596 RansDecRenorm(&R[2], &cp);
597 RansDecRenorm(&R[3], &cp);
598 }
599 }
600
601 // remainder
602 for (; i < out_sz; i++) {
603 RansState m = RansDecGet(&R[i%4], TF_SHIFT);
604 R[i%4] = sfreq[m] * (R[i%4] >> TF_SHIFT) + sbase[m];
605 out[i] = ssym[m];
606 RansDecRenormSafe(&R[i%4], &cp, cp_end+8);
607 }
608
609 //fprintf(stderr, " 0 Decoded %d bytes\n", (int)(cp-in)); //c-size
610
611 return out;
612
613 err:
614 free(out_free);
615 return NULL;
616 }
617
618 //-----------------------------------------------------------------------------
619
fast_log(double a)620 double fast_log(double a) {
621 union { double d; long long x; } u = { a };
622 return (u.x - 4606921278410026770) * 1.539095918623324e-16; /* 1 / 6497320848556798.0; */
623 }
624
625 // Compute the entropy of 12-bit vs 10-bit frequency tables.
626 // 10 bit means smaller memory footprint when decoding and
627 // more speed due to cache hits, but it *may* be a poor
628 // compression fit.
compute_shift(uint32_t * F0,uint32_t (* F)[256],uint32_t * T,int * S)629 static int compute_shift(uint32_t *F0, uint32_t (*F)[256], uint32_t *T,
630 int *S) {
631 int i, j;
632
633 double e10 = 0, e12 = 0;
634 int max_tot = 0;
635 for (i = 0; i < 256; i++) {
636 if (F0[i] == 0)
637 continue;
638 int max_val = round2(T[i]);
639 int ns = 0;
640 #define MAX(a,b) ((a)>(b)?(a):(b))
641
642 // Number of samples that get their freq bumped to 1
643 int sm10 = 0, sm12 = 0;
644 for (j = 0; j < 256; j++) {
645 if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1_FAST)
646 sm10++;
647 if (F[i][j] && max_val / F[i][j] > TOTFREQ_O1)
648 sm12++;
649 }
650
651 double l10 = log(TOTFREQ_O1_FAST + sm10);
652 double l12 = log(TOTFREQ_O1 + sm12);
653
654 for (j = 0; j < 256; j++) {
655 if (F[i][j]) {
656 ns++;
657
658 int x = (double)TOTFREQ_O1_FAST * F[i][j]/T[i];
659 e10 -= F[i][j] * (fast_log(MAX(x,1)) - l10);
660
661 x = (double)TOTFREQ_O1 * F[i][j]/T[i];
662 e12 -= F[i][j] * (fast_log(MAX(x,1)) - l12);
663
664 // Estimation of compressedf symbol freq table too.
665 e10 += 4;
666 e12 += 6;
667 }
668 }
669
670 // Order-1 frequencies often end up totalling under TOTFREQ.
671 // In this case it's smaller to output the real frequencies
672 // prior to normalisation and normalise after (with an extra
673 // normalisation step needed in the decoder too).
674 //
675 // Thus we normalise to a power of 2 only, store those,
676 // and renormalise later here (and in decoder) by bit-shift
677 // to get to the fixed size.
678 if (ns < 64 && max_val > 128) max_val /= 2;
679 if (max_val > 1024) max_val /= 2;
680 if (max_val > TOTFREQ_O1) max_val = TOTFREQ_O1;
681 S[i] = max_val; // scale to max this
682 if (max_tot < max_val)
683 max_tot = max_val;
684 }
685 int shift = e10/e12 < 1.01 || max_tot <= TOTFREQ_O1_FAST ? TF_SHIFT_O1_FAST : TF_SHIFT_O1;
686
687 // fprintf(stderr, "e10/12 = %f %f %f, shift %d\n",
688 // e10/log(256), e12/log(256), e10/e12, shift);
689
690 return shift;
691 }
692
693
694 static
rans_compress_O1_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)695 unsigned char *rans_compress_O1_4x16(unsigned char *in, unsigned int in_size,
696 unsigned char *out, unsigned int *out_size) {
697 unsigned char *cp, *out_end, *op;
698 unsigned int tab_size;
699 RansEncSymbol syms[256][256];
700 int bound = rans_compress_bound_4x16(in_size,1)-20; // -20 for order/size/meta
701
702 if (!out) {
703 *out_size = bound;
704 out = malloc(*out_size);
705 }
706 if (!out || bound > *out_size)
707 return NULL;
708
709 if (((size_t)out)&1)
710 bound--;
711 out_end = out + bound;
712
713 uint32_t F[256][256] = {{0}}, T[256+MAGIC] = {0};
714 int i, j;
715
716 //memset(F, 0, 256*256*sizeof(int));
717 //memset(T, 0, 256*sizeof(int));
718
719 hist1_4(in, in_size, F, T);
720 F[0][in[1*(in_size>>2)]]++;
721 F[0][in[2*(in_size>>2)]]++;
722 F[0][in[3*(in_size>>2)]]++;
723 T[0]+=3;
724
725 op = cp = out;
726 *cp++ = 0; // uncompressed header marker
727
728 // Encode the order-0 symbols for use in the order-1 frequency tables
729 uint32_t F0[256+MAGIC] = {0};
730 present8(in, in_size, F0);
731 F0[0]=1;
732 cp += encode_alphabet(cp, F0);
733
734 // Decide between 10-bit and 12-bit freqs.
735 // Fills out S[] to hold the new scaled maximum value.
736 int S[256] = {0};
737 int shift = compute_shift(F0, F, T, S);
738
739 // Normalise so T[i] == TOTFREQ_O1
740 for (i = 0; i < 256; i++) {
741 unsigned int x;
742
743 if (F0[i] == 0)
744 continue;
745
746 int max_val = S[i];
747 if (shift == TF_SHIFT_O1_FAST && max_val > TOTFREQ_O1_FAST)
748 max_val = TOTFREQ_O1_FAST;
749
750 if (normalise_freq(F[i], T[i], max_val) < 0)
751 return NULL;
752 T[i]=max_val;
753
754 cp += encode_freq_d(cp, F0, F[i]);
755
756 normalise_freq_shift(F[i], T[i], 1<<shift); T[i]=1<<shift;
757
758 uint32_t *F_i_ = F[i];
759 for (x = j = 0; j < 256; j++) {
760 RansEncSymbolInit(&syms[i][j], x, F_i_[j], shift);
761 x += F_i_[j];
762 }
763
764 }
765
766 *op = shift<<4;
767 if (cp - op > 1000) {
768 // try rans0 compression of header
769 unsigned int u_freq_sz = cp-(op+1);
770 unsigned int c_freq_sz;
771 unsigned char *c_freq = rans_compress_O0_4x16(op+1, u_freq_sz, NULL, &c_freq_sz);
772 if (c_freq && c_freq_sz + 6 < cp-op) {
773 *op++ |= 1; // compressed
774 op += var_put_u32(op, NULL, u_freq_sz);
775 op += var_put_u32(op, NULL, c_freq_sz);
776 memcpy(op, c_freq, c_freq_sz);
777 cp = op+c_freq_sz;
778 }
779 free(c_freq);
780 }
781
782 //write(2, out+4, cp-(out+4));
783 tab_size = cp - out;
784 assert(tab_size < 257*257*3);
785
786 RansState rans0, rans1, rans2, rans3;
787 RansEncInit(&rans0);
788 RansEncInit(&rans1);
789 RansEncInit(&rans2);
790 RansEncInit(&rans3);
791
792 uint8_t* ptr = out_end;
793
794 int isz4 = in_size>>2;
795 int i0 = 1*isz4-2;
796 int i1 = 2*isz4-2;
797 int i2 = 3*isz4-2;
798 int i3 = 4*isz4-2;
799
800 unsigned char l0 = in[i0+1];
801 unsigned char l1 = in[i1+1];
802 unsigned char l2 = in[i2+1];
803 unsigned char l3 = in[i3+1];
804
805 // Deal with the remainder
806 l3 = in[in_size-1];
807 for (i3 = in_size-2; i3 > 4*isz4-2; i3--) {
808 unsigned char c3 = in[i3];
809 RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]);
810 l3 = c3;
811 }
812
813 for (; i0 >= 0; i0--, i1--, i2--, i3--) {
814 unsigned char c0, c1, c2, c3;
815 RansEncSymbol *s3 = &syms[c3 = in[i3]][l3];
816 RansEncSymbol *s2 = &syms[c2 = in[i2]][l2];
817 RansEncSymbol *s1 = &syms[c1 = in[i1]][l1];
818 RansEncSymbol *s0 = &syms[c0 = in[i0]][l0];
819
820 RansEncPutSymbol(&rans3, &ptr, s3);
821 RansEncPutSymbol(&rans2, &ptr, s2);
822 RansEncPutSymbol(&rans1, &ptr, s1);
823 RansEncPutSymbol(&rans0, &ptr, s0);
824
825 l0 = c0;
826 l1 = c1;
827 l2 = c2;
828 l3 = c3;
829 }
830
831 RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]);
832 RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]);
833 RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]);
834 RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]);
835
836 RansEncFlush(&rans3, &ptr);
837 RansEncFlush(&rans2, &ptr);
838 RansEncFlush(&rans1, &ptr);
839 RansEncFlush(&rans0, &ptr);
840
841 *out_size = (out_end - ptr) + tab_size;
842
843 cp = out;
844 memmove(out + tab_size, ptr, out_end-ptr);
845
846 return out;
847 }
848
849 #ifndef NO_THREADS
850 /*
851 * Thread local storage per thread in the pool.
852 */
853 pthread_once_t rans_once = PTHREAD_ONCE_INIT;
854 pthread_key_t rans_key;
855
rans_tls_init(void)856 static void rans_tls_init(void) {
857 pthread_key_create(&rans_key, free);
858 }
859 #endif
860
861 //#define MAGIC2 111
862 #define MAGIC2 179
863 //#define MAGIC2 0
864 typedef struct {
865 uint16_t f;
866 uint16_t b;
867 } fb_t;
868
869 static
rans_uncompress_O1_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int out_sz)870 unsigned char *rans_uncompress_O1_4x16(unsigned char *in, unsigned int in_size,
871 unsigned char *out, unsigned int out_sz) {
872 if (in_size < 16) // 4-states at least
873 return NULL;
874
875 if (out_sz >= INT_MAX)
876 return NULL; // protect against some overflow cases
877
878 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
879 if (out_sz > 100000)
880 return NULL;
881 #endif
882
883 /* Load in the static tables */
884 unsigned char *cp = in, *cp_end = in+in_size, *out_free = NULL;
885 unsigned char *c_freq = NULL;
886 int i, j = -999;
887 unsigned int x;
888
889 #ifndef NO_THREADS
890 /*
891 * The calloc below is expensive as it's a large structure. We
892 * could use malloc, but we're only initialising parts of the structure
893 * that we need to, as dictated by the frequency table. This is far
894 * faster than initialising everything (ie malloc+memset => calloc).
895 * Not initialising the data means malformed input with mismatching
896 * frequency tables to actual data can lead to accessing of the
897 * uninitialised sfb table and in turn potential leakage of the
898 * uninitialised memory returned by malloc. That could be anything at
899 * all, including important encryption keys used within a server (for
900 * example).
901 *
902 * However (I hope!) we don't care about leaking about the sfb symbol
903 * frequencies previously computed by an earlier execution of *this*
904 * code. So calloc once and reuse is the fastest alternative.
905 *
906 * We do this through pthread local storage as we don't know if this
907 * code is being executed in many threads simultaneously.
908 */
909 pthread_once(&rans_once, rans_tls_init);
910
911 uint8_t *sfb_ = pthread_getspecific(rans_key);
912 if (!sfb_) {
913 sfb_ = calloc(256*(TOTFREQ_O1+MAGIC2), sizeof(*sfb_));
914 pthread_setspecific(rans_key, sfb_);
915 }
916 #else
917 uint8_t *sfb_ = calloc(256*(TOTFREQ_O1+MAGIC2), sizeof(*sfb_));
918 #endif
919
920 if (!sfb_)
921 return NULL;
922 fb_t fb[256][256];
923 uint8_t *sfb[256];
924 if ((*cp >> 4) == TF_SHIFT_O1) {
925 for (i = 0; i < 256; i++)
926 sfb[i]= sfb_ + i*(TOTFREQ_O1+MAGIC2);
927 } else {
928 for (i = 0; i < 256; i++)
929 sfb[i]= sfb_ + i*(TOTFREQ_O1_FAST+MAGIC2);
930 }
931
932 if (!out)
933 out_free = out = malloc(out_sz);
934
935 if (!out)
936 goto err;
937
938 //fprintf(stderr, "out_sz=%d\n", out_sz);
939
940 // compressed header? If so uncompress it
941 unsigned char *tab_end = NULL;
942 unsigned char *c_freq_end = cp_end;
943 unsigned int shift = *cp >> 4;
944 if (*cp++ & 1) {
945 uint32_t u_freq_sz, c_freq_sz;
946 cp += var_get_u32(cp, cp_end, &u_freq_sz);
947 cp += var_get_u32(cp, cp_end, &c_freq_sz);
948 if (c_freq_sz >= cp_end - cp - 16)
949 goto err;
950 tab_end = cp + c_freq_sz;
951 if (!(c_freq = rans_uncompress_O0_4x16(cp, c_freq_sz, NULL, u_freq_sz)))
952 goto err;
953 cp = c_freq;
954 c_freq_end = c_freq + u_freq_sz;
955 }
956
957 // Decode order-0 symbol list; avoids needing in order-1 tables
958 uint32_t F0[256] = {0};
959 int fsz = decode_alphabet(cp, c_freq_end, F0);
960 if (!fsz)
961 goto err;
962 cp += fsz;
963
964 if (cp >= c_freq_end)
965 goto err;
966
967 for (i = 0; i < 256; i++) {
968 if (F0[i] == 0)
969 continue;
970
971 uint32_t F[256] = {0}, T = 0;
972 fsz = decode_freq_d(cp, c_freq_end, F0, F, &T);
973 if (!fsz)
974 goto err;
975 cp += fsz;
976
977 if (!T) {
978 //fprintf(stderr, "No freq for F_%d\n", i);
979 continue;
980 }
981
982 normalise_freq_shift(F, T, 1<<shift);
983
984 // Build symbols; fixme, do as part of decode, see the _d variant
985 for (j = x = 0; j < 256; j++) {
986 if (F[j]) {
987 if (F[j] > (1<<shift) - x)
988 goto err;
989
990 memset(&sfb[i][x], j, F[j]);
991 fb[i][j].f = F[j];
992 fb[i][j].b = x;
993 x += F[j];
994 }
995 }
996 if (x != (1<<shift))
997 goto err;
998 }
999
1000 if (tab_end)
1001 cp = tab_end;
1002 free(c_freq);
1003 c_freq = NULL;
1004
1005 if (cp+16 > cp_end)
1006 goto err;
1007
1008 RansState rans0, rans1, rans2, rans3;
1009 uint8_t *ptr = cp, *ptr_end = in + in_size - 8;
1010 RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto err;
1011 RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto err;
1012 RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto err;
1013 RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto err;
1014
1015 unsigned int isz4 = out_sz>>2;
1016 int l0 = 0, l1 = 0, l2 = 0, l3 = 0;
1017 unsigned int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4};
1018
1019 RansState R[4];
1020 R[0] = rans0;
1021 R[1] = rans1;
1022 R[2] = rans2;
1023 R[3] = rans3;
1024
1025 // Around 15% faster to specialise for 10/12 than to have one
1026 // loop with shift as a variable.
1027 if (shift == TF_SHIFT_O1) {
1028 // TF_SHIFT_O1 = 12
1029
1030 const uint32_t mask = ((1u << TF_SHIFT_O1)-1);
1031 for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
1032 uint16_t m, c;
1033 c = sfb[l0][m = R[0] & mask];
1034 R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1) + m - fb[l0][c].b;
1035 out[i4[0]] = l0 = c;
1036
1037 c = sfb[l1][m = R[1] & mask];
1038 R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1) + m - fb[l1][c].b;
1039 out[i4[1]] = l1 = c;
1040
1041 c = sfb[l2][m = R[2] & mask];
1042 R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1) + m - fb[l2][c].b;
1043 out[i4[2]] = l2 = c;
1044
1045 c = sfb[l3][m = R[3] & mask];
1046 R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1) + m - fb[l3][c].b;
1047 out[i4[3]] = l3 = c;
1048
1049 if (ptr < ptr_end) {
1050 RansDecRenorm(&R[0], &ptr);
1051 RansDecRenorm(&R[1], &ptr);
1052 RansDecRenorm(&R[2], &ptr);
1053 RansDecRenorm(&R[3], &ptr);
1054 } else {
1055 RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
1056 RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
1057 RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
1058 RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
1059 }
1060 }
1061
1062 // Remainder
1063 for (; i4[3] < out_sz; i4[3]++) {
1064 uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1)-1);
1065 unsigned char c3 = sfb[l3][m3];
1066 out[i4[3]] = c3;
1067 R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1) + m3 - fb[l3][c3].b;
1068 RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
1069 l3 = c3;
1070 }
1071 } else {
1072 // TF_SHIFT_O1 = 10
1073 const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
1074 for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) {
1075 uint16_t m, c;
1076 c = sfb[l0][m = R[0] & mask];
1077 R[0] = fb[l0][c].f * (R[0]>>TF_SHIFT_O1_FAST) + m - fb[l0][c].b;
1078 out[i4[0]] = l0 = c;
1079
1080 c = sfb[l1][m = R[1] & mask];
1081 R[1] = fb[l1][c].f * (R[1]>>TF_SHIFT_O1_FAST) + m - fb[l1][c].b;
1082 out[i4[1]] = l1 = c;
1083
1084 c = sfb[l2][m = R[2] & mask];
1085 R[2] = fb[l2][c].f * (R[2]>>TF_SHIFT_O1_FAST) + m - fb[l2][c].b;
1086 out[i4[2]] = l2 = c;
1087
1088 c = sfb[l3][m = R[3] & mask];
1089 R[3] = fb[l3][c].f * (R[3]>>TF_SHIFT_O1_FAST) + m - fb[l3][c].b;
1090 out[i4[3]] = l3 = c;
1091
1092 if (ptr < ptr_end) {
1093 RansDecRenorm(&R[0], &ptr);
1094 RansDecRenorm(&R[1], &ptr);
1095 RansDecRenorm(&R[2], &ptr);
1096 RansDecRenorm(&R[3], &ptr);
1097 } else {
1098 RansDecRenormSafe(&R[0], &ptr, ptr_end+8);
1099 RansDecRenormSafe(&R[1], &ptr, ptr_end+8);
1100 RansDecRenormSafe(&R[2], &ptr, ptr_end+8);
1101 RansDecRenormSafe(&R[3], &ptr, ptr_end+8);
1102 }
1103 }
1104
1105 // Remainder
1106 for (; i4[3] < out_sz; i4[3]++) {
1107 uint32_t m3 = R[3] & ((1u<<TF_SHIFT_O1_FAST)-1);
1108 unsigned char c3 = sfb[l3][m3];
1109 out[i4[3]] = c3;
1110 R[3] = fb[l3][c3].f * (R[3]>>TF_SHIFT_O1_FAST) + m3 - fb[l3][c3].b;
1111 RansDecRenormSafe(&R[3], &ptr, ptr_end + 8);
1112 l3 = c3;
1113 }
1114 }
1115 //fprintf(stderr, " 1 Decoded %d bytes\n", (int)(ptr-in)); //c-size
1116
1117 #ifdef NO_THREADS
1118 free(sfb_);
1119 #endif
1120 return out;
1121
1122 err:
1123 #ifdef NO_THREADS
1124 free(sfb_);
1125 #endif
1126 free(out_free);
1127 free(c_freq);
1128
1129 return NULL;
1130 }
1131
1132
1133 /*-----------------------------------------------------------------------------
1134 * Simple interface to the order-0 vs order-1 encoders and decoders.
1135 *
1136 * Smallest is method, <in_size> <input>, so worst case 2 bytes longer.
1137 */
rans_compress_to_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size,int order)1138 unsigned char *rans_compress_to_4x16(unsigned char *in, unsigned int in_size,
1139 unsigned char *out, unsigned int *out_size,
1140 int order) {
1141 unsigned int c_meta_len;
1142 uint8_t *meta = NULL, *rle = NULL, *packed = NULL;
1143
1144 if (!out) {
1145 *out_size = rans_compress_bound_4x16(in_size, order);
1146 if (!(out = malloc(*out_size)))
1147 return NULL;
1148 }
1149 unsigned char *out_end = out + *out_size;
1150
1151 if (in_size <= 20)
1152 order &= ~X_STRIPE;
1153
1154 if (order & X_STRIPE) {
1155 int N = (order>>8);
1156 if (N == 0) N = 4; // default for compatibility with old tests
1157
1158 if (N > 255)
1159 return NULL;
1160
1161 unsigned char *transposed = malloc(in_size);
1162 unsigned int part_len[256];
1163 unsigned int idx[256];
1164 if (!transposed)
1165 return NULL;
1166 int i, j, x;
1167
1168 for (i = 0; i < N; i++) {
1169 part_len[i] = in_size / N + ((in_size % N) > i);
1170 idx[i] = i ? idx[i-1] + part_len[i-1] : 0; // cumulative index
1171 }
1172
1173 for (i = x = 0; i < in_size-N; i += N, x++) {
1174 for (j = 0; j < N; j++)
1175 transposed[idx[j]+x] = in[i+j];
1176 }
1177 for (; i < in_size; i += N, x++) {
1178 for (j = 0; i+j < in_size; j++)
1179 transposed[idx[j]+x] = in[i+j];
1180 }
1181
1182 unsigned int olen2;
1183 unsigned char *out2, *out2_start;
1184 c_meta_len = 1;
1185 *out = order & ~X_NOSZ;
1186 c_meta_len += var_put_u32(out+c_meta_len, out_end, in_size);
1187 out[c_meta_len++] = N;
1188
1189 out2_start = out2 = out+2+5*N; // shares a buffer with c_meta
1190 for (i = 0; i < N; i++) {
1191 // Brute force try all methods.
1192 int j, m[] = {1,64,128,0}, best_j = 0, best_sz = in_size+10;
1193 for (j = 0; j < 4; j++) {
1194 if ((order & m[j]) != m[j])
1195 continue;
1196 olen2 = *out_size - (out2 - out);
1197 rans_compress_to_4x16(transposed+idx[i], part_len[i],
1198 out2, &olen2, m[j] | X_NOSZ);
1199 if (best_sz > olen2) {
1200 best_sz = olen2;
1201 best_j = j;
1202 }
1203 }
1204 if (best_j != j-1) {
1205 olen2 = *out_size - (out2 - out);
1206 rans_compress_to_4x16(transposed+idx[i], part_len[i],
1207 out2, &olen2, m[best_j] | X_NOSZ);
1208 }
1209 out2 += olen2;
1210 c_meta_len += var_put_u32(out+c_meta_len, out_end, olen2);
1211 }
1212 memmove(out+c_meta_len, out2_start, out2-out2_start);
1213 free(transposed);
1214 *out_size = c_meta_len + out2-out2_start;
1215 return out;
1216 }
1217
1218 if (order & X_CAT) {
1219 out[0] = X_CAT;
1220 c_meta_len = 1;
1221 c_meta_len += var_put_u32(&out[1], out_end, in_size);
1222 memcpy(out+c_meta_len, in, in_size);
1223 *out_size = c_meta_len + in_size;
1224 return out;
1225 }
1226
1227 int do_pack = order & X_PACK;
1228 int do_rle = order & X_RLE;
1229 int no_size = order & X_NOSZ;
1230
1231 out[0] = order;
1232 c_meta_len = 1;
1233
1234 if (!no_size)
1235 c_meta_len += var_put_u32(&out[1], out_end, in_size);
1236
1237 order &= 0xf;
1238
1239 // Format is compressed meta-data, compressed data.
1240 // Meta-data can be empty, pack, rle lengths, or pack + rle lengths.
1241 // Data is either the original data, bit-packed packed, rle literals or
1242 // packed + rle literals.
1243
1244 if (do_pack && in_size) {
1245 // PACK 2, 4 or 8 symbols into one byte.
1246 int pmeta_len;
1247 uint64_t packed_len;
1248 packed = hts_pack(in, in_size, out+c_meta_len, &pmeta_len, &packed_len);
1249 if (!packed || (pmeta_len == 1 && out[c_meta_len] > 16)) {
1250 out[0] &= ~X_PACK;
1251 do_pack = 0;
1252 free(packed);
1253 packed = NULL;
1254 } else {
1255 in = packed;
1256 in_size = packed_len;
1257 c_meta_len += pmeta_len;
1258
1259 // Could derive this rather than storing verbatim.
1260 // Orig size * 8/nbits (+1 if not multiple of 8/n)
1261 int sz = var_put_u32(out+c_meta_len, out_end, in_size);
1262 c_meta_len += sz;
1263 *out_size -= sz;
1264 }
1265 } else if (do_pack) {
1266 out[0] &= ~X_PACK;
1267 }
1268
1269 if (do_rle && in_size) {
1270 // RLE 'in' -> rle_length + rle_literals arrays
1271 unsigned int rmeta_len, c_rmeta_len;
1272 uint64_t rle_len;
1273 c_rmeta_len = in_size+257;
1274 if (!(meta = malloc(c_rmeta_len)))
1275 return NULL;
1276
1277 uint8_t rle_syms[256];
1278 int rle_nsyms = 0;
1279 uint64_t rmeta_len64;
1280 rle = rle_encode(in, in_size, meta, &rmeta_len64,
1281 rle_syms, &rle_nsyms, NULL, &rle_len);
1282 memmove(meta+1+rle_nsyms, meta, rmeta_len64);
1283 meta[0] = rle_nsyms;
1284 memcpy(meta+1, rle_syms, rle_nsyms);
1285 rmeta_len = rmeta_len64 + rle_nsyms+1;
1286
1287 if (!rle || rle_len + rmeta_len >= .99*in_size) {
1288 // Not worth the speed hit.
1289 out[0] &= ~X_RLE;
1290 do_rle = 0;
1291 free(rle);
1292 rle = NULL;
1293 } else {
1294 // Compress lengths with O0 and literals with O0/O1 ("order" param)
1295 int sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2), sz2;
1296 sz += var_put_u32(out+c_meta_len+sz, out_end, rle_len);
1297 c_rmeta_len = *out_size - (c_meta_len+sz+5);
1298 rans_compress_O0_4x16(meta, rmeta_len, out+c_meta_len+sz+5, &c_rmeta_len);
1299 if (c_rmeta_len < rmeta_len) {
1300 sz2 = var_put_u32(out+c_meta_len+sz, out_end, c_rmeta_len);
1301 memmove(out+c_meta_len+sz+sz2, out+c_meta_len+sz+5, c_rmeta_len);
1302 } else {
1303 // Uncompressed RLE meta-data as too small
1304 sz = var_put_u32(out+c_meta_len, out_end, rmeta_len*2+1);
1305 sz2 = var_put_u32(out+c_meta_len+sz, out_end, rle_len);
1306 memcpy(out+c_meta_len+sz+sz2, meta, rmeta_len);
1307 c_rmeta_len = rmeta_len;
1308 }
1309
1310 c_meta_len += sz + sz2 + c_rmeta_len;
1311
1312 in = rle;
1313 in_size = rle_len;
1314 }
1315
1316 free(meta);
1317 } else if (do_rle) {
1318 out[0] &= ~X_RLE;
1319 }
1320
1321 *out_size -= c_meta_len;
1322 if (order && in_size < 8) {
1323 out[0] &= ~1;
1324 order &= ~1;
1325 }
1326
1327 if (order == 1)
1328 rans_compress_O1_4x16(in, in_size, out+c_meta_len, out_size);
1329 else
1330 rans_compress_O0_4x16(in, in_size, out+c_meta_len, out_size);
1331
1332 if (*out_size >= in_size) {
1333 out[0] &= ~3;
1334 out[0] |= X_CAT | no_size;
1335 memcpy(out+c_meta_len, in, in_size);
1336 *out_size = in_size;
1337 }
1338
1339 free(rle);
1340 free(packed);
1341
1342 *out_size += c_meta_len;
1343
1344 return out;
1345 }
1346
rans_compress_4x16(unsigned char * in,unsigned int in_size,unsigned int * out_size,int order)1347 unsigned char *rans_compress_4x16(unsigned char *in, unsigned int in_size,
1348 unsigned int *out_size, int order) {
1349 return rans_compress_to_4x16(in, in_size, NULL, out_size, order);
1350 }
1351
rans_uncompress_to_4x16(unsigned char * in,unsigned int in_size,unsigned char * out,unsigned int * out_size)1352 unsigned char *rans_uncompress_to_4x16(unsigned char *in, unsigned int in_size,
1353 unsigned char *out, unsigned int *out_size) {
1354 unsigned char *in_end = in + in_size;
1355 unsigned char *out_free = NULL, *tmp_free = NULL, *meta_free = NULL;
1356
1357 if (in_size == 0)
1358 return NULL;
1359
1360 if (*in & X_STRIPE) {
1361 unsigned int ulen, olen, c_meta_len = 1;
1362 int i;
1363 uint64_t clen_tot = 0;
1364
1365 // Decode lengths
1366 c_meta_len += var_get_u32(in+c_meta_len, in_end, &ulen);
1367 if (c_meta_len >= in_size)
1368 return NULL;
1369 unsigned int N = in[c_meta_len++];
1370 unsigned int clenN[256], ulenN[256], idxN[256];
1371 if (!out) {
1372 if (ulen >= INT_MAX)
1373 return NULL;
1374 if (!(out_free = out = malloc(ulen))) {
1375 return NULL;
1376 }
1377 *out_size = ulen;
1378 }
1379 if (ulen != *out_size) {
1380 free(out_free);
1381 return NULL;
1382 }
1383
1384 for (i = 0; i < N; i++) {
1385 ulenN[i] = ulen / N + ((ulen % N) > i);
1386 idxN[i] = i ? idxN[i-1] + ulenN[i-1] : 0;
1387 c_meta_len += var_get_u32(in+c_meta_len, in_end, &clenN[i]);
1388 clen_tot += clenN[i];
1389 if (c_meta_len > in_size || clenN[i] > in_size || clenN[i] < 1) {
1390 free(out_free);
1391 return NULL;
1392 }
1393 }
1394
1395 // We can call this with a larger buffer, but once we've determined
1396 // how much we really use we limit it so the recursion becomes easier
1397 // to limit.
1398 if (c_meta_len + clen_tot > in_size) {
1399 free(out_free);
1400 return NULL;
1401 }
1402 in_size = c_meta_len + clen_tot;
1403
1404 //fprintf(stderr, " stripe meta %d\n", c_meta_len); //c-size
1405
1406 // Uncompress the N streams
1407 unsigned char *outN = malloc(ulen);
1408 if (!outN) {
1409 free(out_free);
1410 return NULL;
1411 }
1412 for (i = 0; i < N; i++) {
1413 olen = ulenN[i];
1414 if (in_size < c_meta_len) {
1415 free(out_free);
1416 free(outN);
1417 return NULL;
1418 }
1419 if (!rans_uncompress_to_4x16(in+c_meta_len, in_size-c_meta_len, outN + idxN[i], &olen)
1420 || olen != ulenN[i]) {
1421 free(out_free);
1422 free(outN);
1423 return NULL;
1424 }
1425 c_meta_len += clenN[i];
1426 }
1427
1428 unstripe(out, outN, ulen, N, idxN);
1429
1430 free(outN);
1431 *out_size = ulen;
1432 return out;
1433 }
1434
1435 int order = *in++; in_size--;
1436 int do_pack = order & X_PACK;
1437 int do_rle = order & X_RLE;
1438 int do_cat = order & X_CAT;
1439 int no_size = order & X_NOSZ;
1440 order &= 1;
1441
1442 int sz = 0;
1443 unsigned int osz;
1444 if (!no_size)
1445 sz = var_get_u32(in, in_end, &osz);
1446 else
1447 sz = 0, osz = *out_size;
1448 in += sz;
1449 in_size -= sz;
1450
1451 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1452 if (osz > 100000)
1453 return NULL;
1454 #endif
1455
1456 if (no_size && !out)
1457 goto err; // Need one or the other
1458
1459 if (!out) {
1460 *out_size = osz;
1461 if (!(out = out_free = malloc(*out_size)))
1462 return NULL;
1463 } else {
1464 if (*out_size < osz)
1465 goto err;
1466 *out_size = osz;
1467 }
1468
1469 // if (do_pack || do_rle) {
1470 // in += sz; // size field not needed when pure rANS
1471 // in_size -= sz;
1472 // }
1473
1474 uint32_t c_meta_size = 0;
1475 unsigned int tmp1_size = *out_size;
1476 unsigned int tmp2_size = *out_size;
1477 unsigned int tmp3_size = *out_size;
1478 unsigned char *tmp1 = NULL, *tmp2 = NULL, *tmp3 = NULL, *tmp = NULL;
1479
1480 // Need In, Out and Tmp buffers with temporary buffer of the same size
1481 // as output. All use rANS, but with optional transforms (none, RLE,
1482 // Pack, or both).
1483 //
1484 // rans unrle unpack
1485 // If none: in -> out
1486 // If RLE: in -> tmp -> out
1487 // If Pack: in -> tmp -> out
1488 // If RLE+Pack: in -> out -> tmp -> out
1489 // tmp1 tmp2 tmp3
1490 //
1491 // So rans is in -> tmp1
1492 // RLE is tmp1 -> tmp2
1493 // Unpack is tmp2 -> tmp3
1494
1495 // Format is meta data (Pack and RLE in that order if present),
1496 // followed by rANS compressed data.
1497
1498 if (do_pack || do_rle) {
1499 if (!(tmp = tmp_free = malloc(*out_size)))
1500 goto err;
1501 if (do_pack && do_rle) {
1502 tmp1 = out;
1503 tmp2 = tmp;
1504 tmp3 = out;
1505 } else if (do_pack) {
1506 tmp1 = tmp;
1507 tmp2 = tmp1;
1508 tmp3 = out;
1509 } else if (do_rle) {
1510 tmp1 = tmp;
1511 tmp2 = out;
1512 tmp3 = out;
1513 }
1514 } else {
1515 // neither
1516 tmp = NULL;
1517 tmp1 = out;
1518 tmp2 = out;
1519 tmp3 = out;
1520 }
1521
1522
1523 // Decode the bit-packing map.
1524 uint8_t map[16] = {0};
1525 int npacked_sym = 0;
1526 uint64_t unpacked_sz = 0; // FIXME: rename to packed_per_byte
1527 if (do_pack) {
1528 c_meta_size = hts_unpack_meta(in, in_size, *out_size, map, &npacked_sym);
1529 if (c_meta_size == 0)
1530 goto err;
1531
1532 unpacked_sz = osz;
1533 in += c_meta_size;
1534 in_size -= c_meta_size;
1535
1536 // New unpacked size. We could derive this bit from *out_size
1537 // and npacked_sym.
1538 unsigned int osz;
1539 sz = var_get_u32(in, in_end, &osz);
1540 in += sz;
1541 in_size -= sz;
1542 if (osz > tmp1_size)
1543 goto err;
1544 tmp1_size = osz;
1545 }
1546
1547 uint8_t *meta = NULL;
1548 uint32_t u_meta_size = 0;
1549 if (do_rle) {
1550 // Uncompress meta data
1551 uint32_t c_meta_size, rle_len, sz;
1552 sz = var_get_u32(in, in_end, &u_meta_size);
1553 sz += var_get_u32(in+sz, in_end, &rle_len);
1554 if (rle_len > tmp1_size) // should never grow
1555 goto err;
1556 if (u_meta_size & 1) {
1557 meta = in + sz;
1558 u_meta_size = u_meta_size/2 > (in_end-meta) ? (in_end-meta) : u_meta_size/2;
1559 c_meta_size = u_meta_size;
1560 } else {
1561 sz += var_get_u32(in+sz, in_end, &c_meta_size);
1562 u_meta_size /= 2;
1563 meta_free = meta = rans_uncompress_O0_4x16(in+sz, in_size-sz, NULL, u_meta_size);
1564 if (!meta)
1565 goto err;
1566 }
1567 if (c_meta_size+sz > in_size)
1568 goto err;
1569 in += c_meta_size+sz;
1570 in_size -= c_meta_size+sz;
1571 tmp1_size = rle_len;
1572 }
1573
1574 //fprintf(stderr, " meta_size %d bytes\n", (int)(in - orig_in)); //c-size
1575
1576 // uncompress RLE data. in -> tmp1
1577 if (in_size) {
1578 if (do_cat) {
1579 //fprintf(stderr, " CAT %d\n", tmp1_size); //c-size
1580 if (tmp1_size > in_size)
1581 goto err;
1582 if (tmp1_size > *out_size)
1583 goto err;
1584 memcpy(tmp1, in, tmp1_size);
1585 } else {
1586 tmp1 = order
1587 ? rans_uncompress_O1_4x16(in, in_size, tmp1, tmp1_size)
1588 : rans_uncompress_O0_4x16(in, in_size, tmp1, tmp1_size);
1589 if (!tmp1)
1590 goto err;
1591 }
1592 } else {
1593 tmp1 = NULL;
1594 tmp1_size = 0;
1595 }
1596 tmp2_size = tmp3_size = tmp1_size;
1597
1598 if (do_rle) {
1599 // Unpack RLE. tmp1 -> tmp2.
1600 if (u_meta_size == 0)
1601 goto err;
1602 uint64_t unrle_size = *out_size;
1603 int rle_nsyms = *meta ? *meta : 256;
1604 if (u_meta_size < 1+rle_nsyms)
1605 goto err;
1606 if (!rle_decode(tmp1, tmp1_size,
1607 meta+1+rle_nsyms, u_meta_size-(1+rle_nsyms),
1608 meta+1, rle_nsyms, tmp2, &unrle_size))
1609 goto err;
1610 tmp3_size = tmp2_size = unrle_size;
1611 free(meta_free);
1612 meta_free = NULL;
1613 }
1614 if (do_pack) {
1615 // Unpack bits via pack-map. tmp2 -> tmp3
1616 if (npacked_sym == 1)
1617 unpacked_sz = tmp2_size;
1618 //uint8_t *porig = unpack(tmp2, tmp2_size, unpacked_sz, npacked_sym, map);
1619 //memcpy(tmp3, porig, unpacked_sz);
1620 if (!hts_unpack(tmp2, tmp2_size, tmp3, unpacked_sz, npacked_sym, map))
1621 goto err;
1622 tmp3_size = unpacked_sz;
1623 }
1624
1625 if (tmp)
1626 free(tmp);
1627
1628 *out_size = tmp3_size;
1629 return tmp3;
1630
1631 err:
1632 free(meta_free);
1633 free(out_free);
1634 free(tmp_free);
1635 return NULL;
1636 }
1637
rans_uncompress_4x16(unsigned char * in,unsigned int in_size,unsigned int * out_size)1638 unsigned char *rans_uncompress_4x16(unsigned char *in, unsigned int in_size,
1639 unsigned int *out_size) {
1640 return rans_uncompress_to_4x16(in, in_size, NULL, out_size);
1641 }
1642