1 #pragma prototyped
2 
3 /*-------------------------------------------------------------*/
4 /*--- Compression machinery (not incl block sorting)        ---*/
5 /*---                                            compress.c ---*/
6 /*-------------------------------------------------------------*/
7 
8 /*--
9   This file is a part of bzip2 and/or libbzip2, a program and
10   library for lossless, block-sorting data compression.
11 
12   Copyright (C) 1996-1998 Julian R Seward.  All rights reserved.
13 
14   Redistribution and use in source and binary forms, with or without
15   modification, are permitted provided that the following conditions
16   are met:
17 
18   1. Redistributions of source code must retain the above copyright
19      notice, this list of conditions and the following disclaimer.
20 
21   2. The origin of this software must not be misrepresented; you must
22      not claim that you wrote the original software.  If you use this
23      software in a product, an acknowledgment in the product
24      documentation would be appreciated but is not required.
25 
26   3. Altered source versions must be plainly marked as such, and must
27      not be misrepresented as being the original software.
28 
29   4. The name of the author may not be used to endorse or promote
30      products derived from this software without specific prior written
31      permission.
32 
33   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
34   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
37   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
39   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
41   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
42   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
43   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 
45   Julian Seward, Guildford, Surrey, UK.
46   jseward@acm.org
47   bzip2/libbzip2 version 0.9.0 of 28 June 1998
48 
49   This program is based on (at least) the work of:
50      Mike Burrows
51      David Wheeler
52      Peter Fenwick
53      Alistair Moffat
54      Radford Neal
55      Ian H. Witten
56      Robert Sedgewick
57      Jon L. Bentley
58 
59   For more information on these sources, see the manual.
60 --*/
61 
62 /*--
63    CHANGES
64    ~~~~~~~
65    0.9.0 -- original version.
66 
67    0.9.0a/b -- no changes in this file.
68 
69    0.9.0c
70       * changed setting of nGroups in sendMTFValues() so as to
71         do a bit better on small files
72 --*/
73 
74 #include "bzhdr.h"
75 
76 
77 /*---------------------------------------------------*/
78 /*--- Bit stream I/O                              ---*/
79 /*---------------------------------------------------*/
80 
81 /*---------------------------------------------------*/
bsInitWrite(EState * s)82 void bsInitWrite ( EState* s )
83 {
84    s->bsLive = 0;
85    s->bsBuff = 0;
86 }
87 
88 
89 /*---------------------------------------------------*/
90 static
bsFinishWrite(EState * s)91 void bsFinishWrite ( EState* s )
92 {
93    while (s->bsLive > 0) {
94       ((UChar*)(s->quadrant))[s->numZ] = (UChar)(s->bsBuff >> 24);
95       s->numZ++;
96       s->bsBuff <<= 8;
97       s->bsLive -= 8;
98    }
99 }
100 
101 
102 /*---------------------------------------------------*/
103 #define bsNEEDW(nz)                           \
104 {                                             \
105    while (s->bsLive >= 8) {                   \
106       ((UChar*)(s->quadrant))[s->numZ]        \
107          = (UChar)(s->bsBuff >> 24);          \
108       s->numZ++;                              \
109       s->bsBuff <<= 8;                        \
110       s->bsLive -= 8;                         \
111    }                                          \
112 }
113 
114 
115 /*---------------------------------------------------*/
116 static
bsW(EState * s,Int32 n,UInt32 v)117 void bsW ( EState* s, Int32 n, UInt32 v )
118 {
119    bsNEEDW ( n );
120    s->bsBuff |= (v << (32 - s->bsLive - n));
121    s->bsLive += n;
122 }
123 
124 
125 /*---------------------------------------------------*/
126 static
bsPutUInt32(EState * s,UInt32 u)127 void bsPutUInt32 ( EState* s, UInt32 u )
128 {
129    bsW ( s, 8, (u >> 24) & 0xffL );
130    bsW ( s, 8, (u >> 16) & 0xffL );
131    bsW ( s, 8, (u >>  8) & 0xffL );
132    bsW ( s, 8,  u        & 0xffL );
133 }
134 
135 
136 /*---------------------------------------------------*/
137 static
bsPutUChar(EState * s,UChar c)138 void bsPutUChar ( EState* s, UChar c )
139 {
140    bsW( s, 8, (UInt32)c );
141 }
142 
143 
144 /*---------------------------------------------------*/
145 /*--- The back end proper                         ---*/
146 /*---------------------------------------------------*/
147 
148 /*---------------------------------------------------*/
149 static
makeMaps_e(EState * s)150 void makeMaps_e ( EState* s )
151 {
152    Int32 i;
153    s->nInUse = 0;
154    for (i = 0; i < 256; i++)
155       if (s->inUse[i]) {
156          s->unseqToSeq[i] = s->nInUse;
157          s->nInUse++;
158       }
159 }
160 
161 
162 /*---------------------------------------------------*/
163 static
generateMTFValues(EState * s)164 void generateMTFValues ( EState* s )
165 {
166    UChar  yy[256];
167    Int32  i, j;
168    UChar  tmp;
169    UChar  tmp2;
170    Int32  zPend;
171    Int32  wr;
172    Int32  EOB;
173 
174    makeMaps_e ( s );
175    EOB = s->nInUse+1;
176 
177    for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
178 
179    wr = 0;
180    zPend = 0;
181    for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
182 
183    for (i = 0; i < s->nblock; i++) {
184       UChar ll_i;
185 
186       AssertD ( wr <= i, "generateMTFValues(1)" );
187       j = s->zptr[i]-1; if (j < 0) j += s->nblock;
188       ll_i = s->unseqToSeq[s->block[j]];
189       AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
190 
191       j = 0;
192       tmp = yy[j];
193       while ( ll_i != tmp ) {
194          j++;
195          tmp2 = tmp;
196          tmp = yy[j];
197          yy[j] = tmp2;
198       };
199       yy[0] = tmp;
200 
201       if (j == 0) {
202          zPend++;
203       } else {
204          if (zPend > 0) {
205             zPend--;
206             while (True) {
207                switch (zPend % 2) {
208                   case 0: s->szptr[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; break;
209                   case 1: s->szptr[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; break;
210                };
211                if (zPend < 2) break;
212                zPend = (zPend - 2) / 2;
213             };
214             zPend = 0;
215          }
216          s->szptr[wr] = j+1; wr++; s->mtfFreq[j+1]++;
217       }
218    }
219 
220    if (zPend > 0) {
221       zPend--;
222       while (True) {
223          switch (zPend % 2) {
224             case 0:  s->szptr[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; break;
225             case 1:  s->szptr[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; break;
226          };
227          if (zPend < 2) break;
228          zPend = (zPend - 2) / 2;
229       };
230    }
231 
232    s->szptr[wr] = EOB; wr++; s->mtfFreq[EOB]++;
233 
234    s->nMTF = wr;
235 }
236 
237 
238 /*---------------------------------------------------*/
239 #define BZ_LESSER_ICOST  0
240 #define BZ_GREATER_ICOST 15
241 
242 static
sendMTFValues(EState * s)243 void sendMTFValues ( EState* s )
244 {
245    Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
246    Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
247    Int32 nGroups, nBytes;
248 
249    /*--
250    UChar  len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
251    is a global since the decoder also needs it.
252 
253    Int32  code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
254    Int32  rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
255    are also globals only used in this proc.
256    Made global to keep stack frame size small.
257    --*/
258 
259 
260    UInt16 cost[BZ_N_GROUPS];
261    Int32  fave[BZ_N_GROUPS];
262 
263    if (s->verbosity >= 3)
264       VPrintf3( "      %d in block, %d after MTF & 1-2 coding, "
265                 "%d+2 syms in use\n",
266                 s->nblock, s->nMTF, s->nInUse );
267 
268    alphaSize = s->nInUse+2;
269    for (t = 0; t < BZ_N_GROUPS; t++)
270       for (v = 0; v < alphaSize; v++)
271          s->len[t][v] = BZ_GREATER_ICOST;
272 
273    /*--- Decide how many coding tables to use ---*/
274    AssertH ( s->nMTF > 0, 3001 );
275    if (s->nMTF < 200)  nGroups = 2; else
276    if (s->nMTF < 600)  nGroups = 3; else
277    if (s->nMTF < 1200) nGroups = 4; else
278    if (s->nMTF < 2400) nGroups = 5; else
279                        nGroups = 6;
280 
281    /*--- Generate an initial set of coding tables ---*/
282    {
283       Int32 nPart, remF, tFreq, aFreq;
284 
285       nPart = nGroups;
286       remF  = s->nMTF;
287       gs = 0;
288       while (nPart > 0) {
289          tFreq = remF / nPart;
290          ge = gs-1;
291          aFreq = 0;
292          while (aFreq < tFreq && ge < alphaSize-1) {
293             ge++;
294             aFreq += s->mtfFreq[ge];
295          }
296 
297          if (ge > gs
298              && nPart != nGroups && nPart != 1
299              && ((nGroups-nPart) % 2 == 1)) {
300             aFreq -= s->mtfFreq[ge];
301             ge--;
302          }
303 
304          if (s->verbosity >= 3)
305             VPrintf5( "      initial group %d, [%d .. %d], "
306                       "has %d syms (%4.1f%%)\n",
307                       nPart, gs, ge, aFreq,
308                       (100.0 * (float)aFreq) / (float)(s->nMTF) );
309 
310          for (v = 0; v < alphaSize; v++)
311             if (v >= gs && v <= ge)
312                s->len[nPart-1][v] = BZ_LESSER_ICOST; else
313                s->len[nPart-1][v] = BZ_GREATER_ICOST;
314 
315          nPart--;
316          gs = ge+1;
317          remF -= aFreq;
318       }
319    }
320 
321    /*---
322       Iterate up to BZ_N_ITERS times to improve the tables.
323    ---*/
324    for (iter = 0; iter < BZ_N_ITERS; iter++) {
325 
326       for (t = 0; t < nGroups; t++) fave[t] = 0;
327 
328       for (t = 0; t < nGroups; t++)
329          for (v = 0; v < alphaSize; v++)
330             s->rfreq[t][v] = 0;
331 
332       nSelectors = 0;
333       totc = 0;
334       gs = 0;
335       while (True) {
336 
337          /*--- Set group start & end marks. --*/
338          if (gs >= s->nMTF) break;
339          ge = gs + BZ_G_SIZE - 1;
340          if (ge >= s->nMTF) ge = s->nMTF-1;
341 
342          /*--
343             Calculate the cost of this group as coded
344             by each of the coding tables.
345          --*/
346          for (t = 0; t < nGroups; t++) cost[t] = 0;
347 
348          if (nGroups == 6) {
349             register UInt16 cost0, cost1, cost2, cost3, cost4, cost5;
350             cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0;
351             for (i = gs; i <= ge; i++) {
352                UInt16 icv = s->szptr[i];
353                cost0 += s->len[0][icv];
354                cost1 += s->len[1][icv];
355                cost2 += s->len[2][icv];
356                cost3 += s->len[3][icv];
357                cost4 += s->len[4][icv];
358                cost5 += s->len[5][icv];
359             }
360             cost[0] = cost0; cost[1] = cost1; cost[2] = cost2;
361             cost[3] = cost3; cost[4] = cost4; cost[5] = cost5;
362          } else {
363             for (i = gs; i <= ge; i++) {
364                UInt16 icv = s->szptr[i];
365                for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
366             }
367          }
368 
369          /*--
370             Find the coding table which is best for this group,
371             and record its identity in the selector table.
372          --*/
373          bc = 999999999; bt = -1;
374          for (t = 0; t < nGroups; t++)
375             if (cost[t] < bc) { bc = cost[t]; bt = t; };
376          totc += bc;
377          fave[bt]++;
378          s->selector[nSelectors] = bt;
379          nSelectors++;
380 
381          /*--
382             Increment the symbol frequencies for the selected table.
383           --*/
384          for (i = gs; i <= ge; i++)
385             s->rfreq[bt][ s->szptr[i] ]++;
386 
387          gs = ge+1;
388       }
389       if (s->verbosity >= 3) {
390          VPrintf2 ( "      pass %d: size is %d, grp uses are ",
391                    iter+1, totc/8 );
392          for (t = 0; t < nGroups; t++)
393             VPrintf1 ( "%d ", fave[t] );
394          VPrintf0 ( "\n" );
395       }
396 
397       /*--
398         Recompute the tables based on the accumulated frequencies.
399       --*/
400       for (t = 0; t < nGroups; t++)
401          hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]),
402                              alphaSize, 20 );
403    }
404 
405 
406    AssertH( nGroups < 8, 3002 );
407    AssertH( nSelectors < 32768 &&
408             nSelectors <= (2 + (900000 / BZ_G_SIZE)),
409             3003 );
410 
411 
412    /*--- Compute MTF values for the selectors. ---*/
413    {
414       UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
415       for (i = 0; i < nGroups; i++) pos[i] = i;
416       for (i = 0; i < nSelectors; i++) {
417          ll_i = s->selector[i];
418          j = 0;
419          tmp = pos[j];
420          while ( ll_i != tmp ) {
421             j++;
422             tmp2 = tmp;
423             tmp = pos[j];
424             pos[j] = tmp2;
425          };
426          pos[0] = tmp;
427          s->selectorMtf[i] = j;
428       }
429    };
430 
431    /*--- Assign actual codes for the tables. --*/
432    for (t = 0; t < nGroups; t++) {
433       minLen = 32;
434       maxLen = 0;
435       for (i = 0; i < alphaSize; i++) {
436          if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
437          if (s->len[t][i] < minLen) minLen = s->len[t][i];
438       }
439       AssertH ( !(maxLen > 20), 3004 );
440       AssertH ( !(minLen < 1),  3005 );
441       hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]),
442                       minLen, maxLen, alphaSize );
443    }
444 
445    /*--- Transmit the mapping table. ---*/
446    {
447       Bool inUse16[16];
448       for (i = 0; i < 16; i++) {
449           inUse16[i] = False;
450           for (j = 0; j < 16; j++)
451              if (s->inUse[i * 16 + j]) inUse16[i] = True;
452       }
453 
454       nBytes = s->numZ;
455       for (i = 0; i < 16; i++)
456          if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
457 
458       for (i = 0; i < 16; i++)
459          if (inUse16[i])
460             for (j = 0; j < 16; j++) {
461                if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
462             }
463 
464       if (s->verbosity >= 3)
465          VPrintf1( "      bytes: mapping %d, ", s->numZ-nBytes );
466    }
467 
468    /*--- Now the selectors. ---*/
469    nBytes = s->numZ;
470    bsW ( s, 3, nGroups );
471    bsW ( s, 15, nSelectors );
472    for (i = 0; i < nSelectors; i++) {
473       for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
474       bsW(s,1,0);
475    }
476    if (s->verbosity >= 3)
477       VPrintf1( "selectors %d, ", s->numZ-nBytes );
478 
479    /*--- Now the coding tables. ---*/
480    nBytes = s->numZ;
481 
482    for (t = 0; t < nGroups; t++) {
483       Int32 curr = s->len[t][0];
484       bsW ( s, 5, curr );
485       for (i = 0; i < alphaSize; i++) {
486          while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
487          while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
488          bsW ( s, 1, 0 );
489       }
490    }
491 
492    if (s->verbosity >= 3)
493       VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
494 
495    /*--- And finally, the block data proper ---*/
496    nBytes = s->numZ;
497    selCtr = 0;
498    gs = 0;
499    while (True) {
500       if (gs >= s->nMTF) break;
501       ge = gs + BZ_G_SIZE - 1;
502       if (ge >= s->nMTF) ge = s->nMTF-1;
503       for (i = gs; i <= ge; i++) {
504          AssertH ( s->selector[selCtr] < nGroups, 3006 );
505          bsW ( s,
506                s->len  [s->selector[selCtr]] [s->szptr[i]],
507                s->code [s->selector[selCtr]] [s->szptr[i]] );
508       }
509 
510       gs = ge+1;
511       selCtr++;
512    }
513    AssertH( selCtr == nSelectors, 3007 );
514 
515    if (s->verbosity >= 3)
516       VPrintf1( "codes %d\n", s->numZ-nBytes );
517 }
518 
519 
520 /*---------------------------------------------------*/
compressBlock(EState * s,Bool is_last_block)521 void compressBlock ( EState* s, Bool is_last_block )
522 {
523    if (s->nblock > 0) {
524 
525       BZ_FINALISE_CRC ( s->blockCRC );
526       s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
527       s->combinedCRC ^= s->blockCRC;
528       if (s->blockNo > 1) s->numZ = 0;
529 
530       if (s->verbosity >= 2)
531          VPrintf4( "    block %d: crc = 0x%8x, "
532                    "combined CRC = 0x%8x, size = %d\n",
533                    s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
534 
535       blockSort ( s );
536    }
537 
538    /*-- If this is the first block, create the stream header. --*/
539    if (s->blockNo == 1) {
540       bsInitWrite ( s );
541       bsPutUChar ( s, 'B' );
542       bsPutUChar ( s, 'Z' );
543       bsPutUChar ( s, 'h' );
544       bsPutUChar ( s, '0' + s->blockSize100k );
545    }
546 
547    if (s->nblock > 0) {
548 
549       bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
550       bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
551       bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
552 
553       /*-- Now the block's CRC, so it is in a known place. --*/
554       bsPutUInt32 ( s, s->blockCRC );
555 
556       /*-- Now a single bit indicating randomisation. --*/
557       if (s->blockRandomised) {
558          bsW(s,1,1); s->nBlocksRandomised++;
559       } else
560          bsW(s,1,0);
561 
562       bsW ( s, 24, s->origPtr );
563       generateMTFValues ( s );
564       sendMTFValues ( s );
565    }
566 
567 
568    /*-- If this is the last block, add the stream trailer. --*/
569    if (is_last_block) {
570 
571       if (s->verbosity >= 2 && s->nBlocksRandomised > 0)
572          VPrintf2 ( "    %d block%s needed randomisation\n",
573                     s->nBlocksRandomised,
574                     s->nBlocksRandomised == 1 ? "" : "s" );
575 
576       bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
577       bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
578       bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
579       bsPutUInt32 ( s, s->combinedCRC );
580       if (s->verbosity >= 2)
581          VPrintf1( "    final combined CRC = 0x%x\n   ", s->combinedCRC );
582       bsFinishWrite ( s );
583    }
584 }
585 
586 
587 /*-------------------------------------------------------------*/
588 /*--- end                                        compress.c ---*/
589 /*-------------------------------------------------------------*/
590