1 
2 /*-------------------------------------------------------------*/
3 /*--- Compression machinery (not incl block sorting)        ---*/
4 /*---                                            compress.c ---*/
5 /*-------------------------------------------------------------*/
6 
7 /*--
8   This file is a part of bzip2 and/or libbzip2, a program and
9   library for lossless, block-sorting data compression.
10 
11   Copyright (C) 1996-1999 Julian R Seward.  All rights reserved.
12 
13   Redistribution and use in source and binary forms, with or without
14   modification, are permitted provided that the following conditions
15   are met:
16 
17   1. Redistributions of source code must retain the above copyright
18      notice, this list of conditions and the following disclaimer.
19 
20   2. The origin of this software must not be misrepresented; you must
21      not claim that you wrote the original software.  If you use this
22      software in a product, an acknowledgment in the product
23      documentation would be appreciated but is not required.
24 
25   3. Altered source versions must be plainly marked as such, and must
26      not be misrepresented as being the original software.
27 
28   4. The name of the author may not be used to endorse or promote
29      products derived from this software without specific prior written
30      permission.
31 
32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 
44   Julian Seward, Cambridge, UK.
45   jseward@acm.org
46   bzip2/libbzip2 version 0.9.5 of 24 May 1999
47 
48   This program is based on (at least) the work of:
49      Mike Burrows
50      David Wheeler
51      Peter Fenwick
52      Alistair Moffat
53      Radford Neal
54      Ian H. Witten
55      Robert Sedgewick
56      Jon L. Bentley
57 
58   For more information on these sources, see the manual.
59 --*/
60 
61 /*--
62    CHANGES
63    ~~~~~~~
64    0.9.0 -- original version.
65 
66    0.9.0a/b -- no changes in this file.
67 
68    0.9.0c
69       * changed setting of nGroups in sendMTFValues() so as to
70         do a bit better on small files
71 --*/
72 
73 #include "stdafx.h"
74 
75 
76 /*---------------------------------------------------*/
77 /*--- Bit stream I/O                              ---*/
78 /*---------------------------------------------------*/
79 
80 /*---------------------------------------------------*/
bsInitWrite(EState * s)81 void bsInitWrite ( EState* s )
82 {
83    s->bsLive = 0;
84    s->bsBuff = 0;
85 }
86 
87 
88 /*---------------------------------------------------*/
89 static
bsFinishWrite(EState * s)90 void bsFinishWrite ( EState* s )
91 {
92    while (s->bsLive > 0) {
93       s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24);
94       s->numZ++;
95       s->bsBuff <<= 8;
96       s->bsLive -= 8;
97    }
98 }
99 
100 
101 /*---------------------------------------------------*/
102 #define bsNEEDW(nz)                           \
103 {                                             \
104    while (s->bsLive >= 8) {                   \
105       s->zbits[s->numZ]                       \
106          = (UChar)(s->bsBuff >> 24);          \
107       s->numZ++;                              \
108       s->bsBuff <<= 8;                        \
109       s->bsLive -= 8;                         \
110    }                                          \
111 }
112 
113 
114 /*---------------------------------------------------*/
115 static
bsW(EState * s,Int32 n,UInt32 v)116 void bsW ( EState* s, Int32 n, UInt32 v )
117 {
118    bsNEEDW ( n );
119    s->bsBuff |= (v << (32 - s->bsLive - n));
120    s->bsLive += n;
121 }
122 
123 
124 /*---------------------------------------------------*/
125 static
bsPutUInt32(EState * s,UInt32 u)126 void bsPutUInt32 ( EState* s, UInt32 u )
127 {
128    bsW ( s, 8, (u >> 24) & 0xffL );
129    bsW ( s, 8, (u >> 16) & 0xffL );
130    bsW ( s, 8, (u >>  8) & 0xffL );
131    bsW ( s, 8,  u        & 0xffL );
132 }
133 
134 
135 /*---------------------------------------------------*/
136 static
bsPutUChar(EState * s,UChar c)137 void bsPutUChar ( EState* s, UChar c )
138 {
139    bsW( s, 8, (UInt32)c );
140 }
141 
142 
143 /*---------------------------------------------------*/
144 /*--- The back end proper                         ---*/
145 /*---------------------------------------------------*/
146 
147 /*---------------------------------------------------*/
148 static
makeMaps_e(EState * s)149 void makeMaps_e ( EState* s )
150 {
151    Int32 i;
152    s->nInUse = 0;
153    for (i = 0; i < 256; i++)
154       if (s->inUse[i]) {
155          s->unseqToSeq[i] = s->nInUse;
156          s->nInUse++;
157       }
158 }
159 
160 
161 /*---------------------------------------------------*/
162 static
generateMTFValues(EState * s)163 void generateMTFValues ( EState* s )
164 {
165    UChar   yy[256];
166    Int32   i, j;
167    UChar   tmp;
168    UChar   tmp2;
169    Int32   zPend;
170    Int32   wr;
171    Int32   EOB;
172 
173    /*
174       After sorting (eg, here),
175          s->arr1 [ 0 .. s->nblock-1 ] holds sorted order,
176          and
177          ((UInt16*)s->arr2) [ 0 .. s->nblock-1 ] [15:8]
178          holds the original block data.
179 
180       The first thing to do is generate the MTF values,
181       and put them in
182          ((UInt16*)s->arr1) [ 0 .. s->nblock-1 ].
183       Because there are strictly fewer or equal MTF values
184       than block values, ptr values in this area are overwritten
185       with MTF values only when they are no longer needed.
186 
187       The final compressed bitstream is generated into the
188       area starting at
189          (UChar*) (&((UInt16)s->arr2)[s->nblock])
190 
191       These storage aliases are set up in bzCompressInit(),
192       except for the last one, which is arranged in
193       compressBlock().
194    */
195    UInt32* ptr   = s->ptr;
196    UInt16* block = s->block;
197    UInt16* mtfv  = s->mtfv;
198 
199    makeMaps_e ( s );
200    EOB = s->nInUse+1;
201 
202    for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
203 
204    wr = 0;
205    zPend = 0;
206    for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
207 
208    for (i = 0; i < s->nblock; i++) {
209       UChar ll_i;
210 
211       AssertD ( wr <= i, "generateMTFValues(1)" );
212       j = ptr[i]-1; if (j < 0) j += s->nblock;
213       ll_i = s->unseqToSeq[block[j] >> 8];
214       AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
215 
216       tmp = yy[0];
217       if (tmp == ll_i) {
218          zPend++;
219       } else {
220          tmp2 = tmp;
221          tmp = yy[1];
222          yy[1] = tmp2;
223          j = 1;
224          while ( ll_i != tmp ) {
225             j++;
226             tmp2 = tmp;
227             tmp = yy[j];
228             yy[j] = tmp2;
229          };
230          yy[0] = tmp;
231 
232          if (zPend > 0) {
233             zPend--;
234             while (True) {
235                if (zPend & 1) {
236                   mtfv[wr] = BZ_RUNB; wr++;
237                   s->mtfFreq[BZ_RUNB]++;
238                } else {
239                   mtfv[wr] = BZ_RUNA; wr++;
240                   s->mtfFreq[BZ_RUNA]++;
241                }
242                if (zPend < 2) break;
243                zPend = (zPend - 2) / 2;
244             };
245             zPend = 0;
246          }
247          mtfv[wr] = j+1; wr++; s->mtfFreq[j+1]++;
248       }
249    }
250 
251    if (zPend > 0) {
252       zPend--;
253       while (True) {
254          if (zPend & 1) {
255             mtfv[wr] = BZ_RUNB; wr++;
256             s->mtfFreq[BZ_RUNB]++;
257          } else {
258             mtfv[wr] = BZ_RUNA; wr++;
259             s->mtfFreq[BZ_RUNA]++;
260          }
261          if (zPend < 2) break;
262          zPend = (zPend - 2) / 2;
263       };
264    }
265 
266    mtfv[wr] = EOB; wr++; s->mtfFreq[EOB]++;
267 
268    s->nMTF = wr;
269 }
270 
271 
272 /*---------------------------------------------------*/
273 #define BZ_LESSER_ICOST  0
274 #define BZ_GREATER_ICOST 15
275 
276 static
sendMTFValues(EState * s)277 void sendMTFValues ( EState* s )
278 {
279    Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
280    Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
281    Int32 nGroups, nBytes;
282 
283    /*--
284    UChar  len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
285    is a global since the decoder also needs it.
286 
287    Int32  code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
288    Int32  rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
289    are also globals only used in this proc.
290    Made global to keep stack frame size small.
291    --*/
292 
293 
294    UInt16 cost[BZ_N_GROUPS];
295    Int32  fave[BZ_N_GROUPS];
296 
297    UInt16* mtfv = s->mtfv;
298 
299    if (s->verbosity >= 3)
300       VPrintf3( "      %d in block, %d after MTF & 1-2 coding, "
301                 "%d+2 syms in use\n",
302                 s->nblock, s->nMTF, s->nInUse );
303 
304    alphaSize = s->nInUse+2;
305    for (t = 0; t < BZ_N_GROUPS; t++)
306       for (v = 0; v < alphaSize; v++)
307          s->len[t][v] = BZ_GREATER_ICOST;
308 
309    /*--- Decide how many coding tables to use ---*/
310    AssertH ( s->nMTF > 0, 3001 );
311    if (s->nMTF < 200)  nGroups = 2; else
312    if (s->nMTF < 600)  nGroups = 3; else
313    if (s->nMTF < 1200) nGroups = 4; else
314    if (s->nMTF < 2400) nGroups = 5; else
315                        nGroups = 6;
316 
317    /*--- Generate an initial set of coding tables ---*/
318    {
319       Int32 nPart, remF, tFreq, aFreq;
320 
321       nPart = nGroups;
322       remF  = s->nMTF;
323       gs = 0;
324       while (nPart > 0) {
325          tFreq = remF / nPart;
326          ge = gs-1;
327          aFreq = 0;
328          while (aFreq < tFreq && ge < alphaSize-1) {
329             ge++;
330             aFreq += s->mtfFreq[ge];
331          }
332 
333          if (ge > gs
334              && nPart != nGroups && nPart != 1
335              && ((nGroups-nPart) % 2 == 1)) {
336             aFreq -= s->mtfFreq[ge];
337             ge--;
338          }
339 
340          if (s->verbosity >= 3)
341             VPrintf5( "      initial group %d, [%d .. %d], "
342                       "has %d syms (%4.1f%%)\n",
343                       nPart, gs, ge, aFreq,
344                       (100.0 * (float)aFreq) / (float)(s->nMTF) );
345 
346          for (v = 0; v < alphaSize; v++)
347             if (v >= gs && v <= ge)
348                s->len[nPart-1][v] = BZ_LESSER_ICOST; else
349                s->len[nPart-1][v] = BZ_GREATER_ICOST;
350 
351          nPart--;
352          gs = ge+1;
353          remF -= aFreq;
354       }
355    }
356 
357    /*---
358       Iterate up to BZ_N_ITERS times to improve the tables.
359    ---*/
360    for (iter = 0; iter < BZ_N_ITERS; iter++) {
361 
362       for (t = 0; t < nGroups; t++) fave[t] = 0;
363 
364       for (t = 0; t < nGroups; t++)
365          for (v = 0; v < alphaSize; v++)
366             s->rfreq[t][v] = 0;
367 
368       nSelectors = 0;
369       totc = 0;
370       gs = 0;
371       while (True) {
372 
373          /*--- Set group start & end marks. --*/
374          if (gs >= s->nMTF) break;
375          ge = gs + BZ_G_SIZE - 1;
376          if (ge >= s->nMTF) ge = s->nMTF-1;
377 
378          /*--
379             Calculate the cost of this group as coded
380             by each of the coding tables.
381          --*/
382          for (t = 0; t < nGroups; t++) cost[t] = 0;
383 
384          if (nGroups == 6) {
385             register UInt16 cost0, cost1, cost2, cost3, cost4, cost5;
386             cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0;
387             for (i = gs; i <= ge; i++) {
388                UInt16 icv = mtfv[i];
389                cost0 += s->len[0][icv];
390                cost1 += s->len[1][icv];
391                cost2 += s->len[2][icv];
392                cost3 += s->len[3][icv];
393                cost4 += s->len[4][icv];
394                cost5 += s->len[5][icv];
395             }
396             cost[0] = cost0; cost[1] = cost1; cost[2] = cost2;
397             cost[3] = cost3; cost[4] = cost4; cost[5] = cost5;
398          } else {
399             for (i = gs; i <= ge; i++) {
400                UInt16 icv = mtfv[i];
401                for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
402             }
403          }
404 
405          /*--
406             Find the coding table which is best for this group,
407             and record its identity in the selector table.
408          --*/
409          bc = 999999999; bt = -1;
410          for (t = 0; t < nGroups; t++)
411             if (cost[t] < bc) { bc = cost[t]; bt = t; };
412          totc += bc;
413          fave[bt]++;
414          s->selector[nSelectors] = bt;
415          nSelectors++;
416 
417          /*--
418             Increment the symbol frequencies for the selected table.
419           --*/
420          for (i = gs; i <= ge; i++)
421             s->rfreq[bt][ mtfv[i] ]++;
422 
423          gs = ge+1;
424       }
425       if (s->verbosity >= 3) {
426          VPrintf2 ( "      pass %d: size is %d, grp uses are ",
427                    iter+1, totc/8 );
428          for (t = 0; t < nGroups; t++)
429             VPrintf1 ( "%d ", fave[t] );
430          VPrintf0 ( "\n" );
431       }
432 
433       /*--
434         Recompute the tables based on the accumulated frequencies.
435       --*/
436       for (t = 0; t < nGroups; t++)
437          hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]),
438                              alphaSize, 20 );
439    }
440 
441 
442    AssertH( nGroups < 8, 3002 );
443    AssertH( nSelectors < 32768 &&
444             nSelectors <= (2 + (900000 / BZ_G_SIZE)),
445             3003 );
446 
447 
448    /*--- Compute MTF values for the selectors. ---*/
449    {
450       UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
451       for (i = 0; i < nGroups; i++) pos[i] = i;
452       for (i = 0; i < nSelectors; i++) {
453          ll_i = s->selector[i];
454          j = 0;
455          tmp = pos[j];
456          while ( ll_i != tmp ) {
457             j++;
458             tmp2 = tmp;
459             tmp = pos[j];
460             pos[j] = tmp2;
461          };
462          pos[0] = tmp;
463          s->selectorMtf[i] = j;
464       }
465    };
466 
467    /*--- Assign actual codes for the tables. --*/
468    for (t = 0; t < nGroups; t++) {
469       minLen = 32;
470       maxLen = 0;
471       for (i = 0; i < alphaSize; i++) {
472          if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
473          if (s->len[t][i] < minLen) minLen = s->len[t][i];
474       }
475       AssertH ( !(maxLen > 20), 3004 );
476       AssertH ( !(minLen < 1),  3005 );
477       hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]),
478                       minLen, maxLen, alphaSize );
479    }
480 
481    /*--- Transmit the mapping table. ---*/
482    {
483       Bool inUse16[16];
484       for (i = 0; i < 16; i++) {
485           inUse16[i] = False;
486           for (j = 0; j < 16; j++)
487              if (s->inUse[i * 16 + j]) inUse16[i] = True;
488       }
489 
490       nBytes = s->numZ;
491       for (i = 0; i < 16; i++)
492          if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
493 
494       for (i = 0; i < 16; i++)
495          if (inUse16[i])
496             for (j = 0; j < 16; j++) {
497                if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
498             }
499 
500       if (s->verbosity >= 3)
501          VPrintf1( "      bytes: mapping %d, ", s->numZ-nBytes );
502    }
503 
504    /*--- Now the selectors. ---*/
505    nBytes = s->numZ;
506    bsW ( s, 3, nGroups );
507    bsW ( s, 15, nSelectors );
508    for (i = 0; i < nSelectors; i++) {
509       for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
510       bsW(s,1,0);
511    }
512    if (s->verbosity >= 3)
513       VPrintf1( "selectors %d, ", s->numZ-nBytes );
514 
515    /*--- Now the coding tables. ---*/
516    nBytes = s->numZ;
517 
518    for (t = 0; t < nGroups; t++) {
519       Int32 curr = s->len[t][0];
520       bsW ( s, 5, curr );
521       for (i = 0; i < alphaSize; i++) {
522          while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
523          while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
524          bsW ( s, 1, 0 );
525       }
526    }
527 
528    if (s->verbosity >= 3)
529       VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
530 
531    /*--- And finally, the block data proper ---*/
532    nBytes = s->numZ;
533    selCtr = 0;
534    gs = 0;
535    while (True) {
536       if (gs >= s->nMTF) break;
537       ge = gs + BZ_G_SIZE - 1;
538       if (ge >= s->nMTF) ge = s->nMTF-1;
539       for (i = gs; i <= ge; i++) {
540          AssertH ( s->selector[selCtr] < nGroups, 3006 );
541          bsW ( s,
542                s->len  [s->selector[selCtr]] [mtfv[i]],
543                s->code [s->selector[selCtr]] [mtfv[i]] );
544       }
545 
546       gs = ge+1;
547       selCtr++;
548    }
549    AssertH( selCtr == nSelectors, 3007 );
550 
551    if (s->verbosity >= 3)
552       VPrintf1( "codes %d\n", s->numZ-nBytes );
553 }
554 
555 
556 /*---------------------------------------------------*/
compressBlock(EState * s,Bool is_last_block)557 void compressBlock ( EState* s, Bool is_last_block )
558 {
559    if (s->nblock > 0) {
560 
561       BZ_FINALISE_CRC ( s->blockCRC );
562       s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
563       s->combinedCRC ^= s->blockCRC;
564       if (s->blockNo > 1) s->numZ = 0;
565 
566       if (s->verbosity >= 2)
567          VPrintf4( "    block %d: crc = 0x%8x, "
568                    "combined CRC = 0x%8x, size = %d\n",
569                    s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
570 
571       blockSort ( s );
572    }
573 
574    s->zbits = (UChar*) (&((UInt16*)s->arr2)[s->nblock]);
575 
576    /*-- If this is the first block, create the stream header. --*/
577    if (s->blockNo == 1) {
578       bsInitWrite ( s );
579       bsPutUChar ( s, 'B' );
580       bsPutUChar ( s, 'Z' );
581       bsPutUChar ( s, 'h' );
582       bsPutUChar ( s, '0' + s->blockSize100k );
583    }
584 
585    if (s->nblock > 0) {
586 
587       bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
588       bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
589       bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
590 
591       /*-- Now the block's CRC, so it is in a known place. --*/
592       bsPutUInt32 ( s, s->blockCRC );
593 
594       /*--
595          Now a single bit indicating (non-)randomisation.
596          As of version 0.9.5, we use a better sorting algorithm
597          which makes randomisation unnecessary.  So always set
598          the randomised bit to 'no'.  Of course, the decoder
599          still needs to be able to handle randomised blocks
600          so as to maintain backwards compatibility with
601          older versions of bzip2.
602       --*/
603       bsW(s,1,0);
604 
605       bsW ( s, 24, s->origPtr );
606       generateMTFValues ( s );
607       sendMTFValues ( s );
608    }
609 
610 
611    /*-- If this is the last block, add the stream trailer. --*/
612    if (is_last_block) {
613 
614       bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
615       bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
616       bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
617       bsPutUInt32 ( s, s->combinedCRC );
618       if (s->verbosity >= 2)
619          VPrintf1( "    final combined CRC = 0x%x\n   ", s->combinedCRC );
620       bsFinishWrite ( s );
621    }
622 }
623 
624 
625 /*-------------------------------------------------------------*/
626 /*--- end                                        compress.c ---*/
627 /*-------------------------------------------------------------*/
628