1 #pragma prototyped
2
3 /*-------------------------------------------------------------*/
4 /*--- Compression machinery (not incl block sorting) ---*/
5 /*--- compress.c ---*/
6 /*-------------------------------------------------------------*/
7
8 /*--
9 This file is a part of bzip2 and/or libbzip2, a program and
10 library for lossless, block-sorting data compression.
11
12 Copyright (C) 1996-1998 Julian R Seward. All rights reserved.
13
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions
16 are met:
17
18 1. Redistributions of source code must retain the above copyright
19 notice, this list of conditions and the following disclaimer.
20
21 2. The origin of this software must not be misrepresented; you must
22 not claim that you wrote the original software. If you use this
23 software in a product, an acknowledgment in the product
24 documentation would be appreciated but is not required.
25
26 3. Altered source versions must be plainly marked as such, and must
27 not be misrepresented as being the original software.
28
29 4. The name of the author may not be used to endorse or promote
30 products derived from this software without specific prior written
31 permission.
32
33 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
34 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
37 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
39 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
41 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
42 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
43 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
45 Julian Seward, Guildford, Surrey, UK.
46 jseward@acm.org
47 bzip2/libbzip2 version 0.9.0 of 28 June 1998
48
49 This program is based on (at least) the work of:
50 Mike Burrows
51 David Wheeler
52 Peter Fenwick
53 Alistair Moffat
54 Radford Neal
55 Ian H. Witten
56 Robert Sedgewick
57 Jon L. Bentley
58
59 For more information on these sources, see the manual.
60 --*/
61
62 /*--
63 CHANGES
64 ~~~~~~~
65 0.9.0 -- original version.
66
67 0.9.0a/b -- no changes in this file.
68
69 0.9.0c
70 * changed setting of nGroups in sendMTFValues() so as to
71 do a bit better on small files
72 --*/
73
74 #include "bzhdr.h"
75
76
77 /*---------------------------------------------------*/
78 /*--- Bit stream I/O ---*/
79 /*---------------------------------------------------*/
80
81 /*---------------------------------------------------*/
bsInitWrite(EState * s)82 void bsInitWrite ( EState* s )
83 {
84 s->bsLive = 0;
85 s->bsBuff = 0;
86 }
87
88
89 /*---------------------------------------------------*/
90 static
bsFinishWrite(EState * s)91 void bsFinishWrite ( EState* s )
92 {
93 while (s->bsLive > 0) {
94 ((UChar*)(s->quadrant))[s->numZ] = (UChar)(s->bsBuff >> 24);
95 s->numZ++;
96 s->bsBuff <<= 8;
97 s->bsLive -= 8;
98 }
99 }
100
101
102 /*---------------------------------------------------*/
103 #define bsNEEDW(nz) \
104 { \
105 while (s->bsLive >= 8) { \
106 ((UChar*)(s->quadrant))[s->numZ] \
107 = (UChar)(s->bsBuff >> 24); \
108 s->numZ++; \
109 s->bsBuff <<= 8; \
110 s->bsLive -= 8; \
111 } \
112 }
113
114
115 /*---------------------------------------------------*/
116 static
bsW(EState * s,Int32 n,UInt32 v)117 void bsW ( EState* s, Int32 n, UInt32 v )
118 {
119 bsNEEDW ( n );
120 s->bsBuff |= (v << (32 - s->bsLive - n));
121 s->bsLive += n;
122 }
123
124
125 /*---------------------------------------------------*/
126 static
bsPutUInt32(EState * s,UInt32 u)127 void bsPutUInt32 ( EState* s, UInt32 u )
128 {
129 bsW ( s, 8, (u >> 24) & 0xffL );
130 bsW ( s, 8, (u >> 16) & 0xffL );
131 bsW ( s, 8, (u >> 8) & 0xffL );
132 bsW ( s, 8, u & 0xffL );
133 }
134
135
136 /*---------------------------------------------------*/
137 static
bsPutUChar(EState * s,UChar c)138 void bsPutUChar ( EState* s, UChar c )
139 {
140 bsW( s, 8, (UInt32)c );
141 }
142
143
144 /*---------------------------------------------------*/
145 /*--- The back end proper ---*/
146 /*---------------------------------------------------*/
147
148 /*---------------------------------------------------*/
149 static
makeMaps_e(EState * s)150 void makeMaps_e ( EState* s )
151 {
152 Int32 i;
153 s->nInUse = 0;
154 for (i = 0; i < 256; i++)
155 if (s->inUse[i]) {
156 s->unseqToSeq[i] = s->nInUse;
157 s->nInUse++;
158 }
159 }
160
161
162 /*---------------------------------------------------*/
163 static
generateMTFValues(EState * s)164 void generateMTFValues ( EState* s )
165 {
166 UChar yy[256];
167 Int32 i, j;
168 UChar tmp;
169 UChar tmp2;
170 Int32 zPend;
171 Int32 wr;
172 Int32 EOB;
173
174 makeMaps_e ( s );
175 EOB = s->nInUse+1;
176
177 for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0;
178
179 wr = 0;
180 zPend = 0;
181 for (i = 0; i < s->nInUse; i++) yy[i] = (UChar) i;
182
183 for (i = 0; i < s->nblock; i++) {
184 UChar ll_i;
185
186 AssertD ( wr <= i, "generateMTFValues(1)" );
187 j = s->zptr[i]-1; if (j < 0) j += s->nblock;
188 ll_i = s->unseqToSeq[s->block[j]];
189 AssertD ( ll_i < s->nInUse, "generateMTFValues(2a)" );
190
191 j = 0;
192 tmp = yy[j];
193 while ( ll_i != tmp ) {
194 j++;
195 tmp2 = tmp;
196 tmp = yy[j];
197 yy[j] = tmp2;
198 };
199 yy[0] = tmp;
200
201 if (j == 0) {
202 zPend++;
203 } else {
204 if (zPend > 0) {
205 zPend--;
206 while (True) {
207 switch (zPend % 2) {
208 case 0: s->szptr[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; break;
209 case 1: s->szptr[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; break;
210 };
211 if (zPend < 2) break;
212 zPend = (zPend - 2) / 2;
213 };
214 zPend = 0;
215 }
216 s->szptr[wr] = j+1; wr++; s->mtfFreq[j+1]++;
217 }
218 }
219
220 if (zPend > 0) {
221 zPend--;
222 while (True) {
223 switch (zPend % 2) {
224 case 0: s->szptr[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; break;
225 case 1: s->szptr[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; break;
226 };
227 if (zPend < 2) break;
228 zPend = (zPend - 2) / 2;
229 };
230 }
231
232 s->szptr[wr] = EOB; wr++; s->mtfFreq[EOB]++;
233
234 s->nMTF = wr;
235 }
236
237
238 /*---------------------------------------------------*/
239 #define BZ_LESSER_ICOST 0
240 #define BZ_GREATER_ICOST 15
241
242 static
sendMTFValues(EState * s)243 void sendMTFValues ( EState* s )
244 {
245 Int32 v, t, i, j, gs, ge, totc, bt, bc, iter;
246 Int32 nSelectors, alphaSize, minLen, maxLen, selCtr;
247 Int32 nGroups, nBytes;
248
249 /*--
250 UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
251 is a global since the decoder also needs it.
252
253 Int32 code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
254 Int32 rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
255 are also globals only used in this proc.
256 Made global to keep stack frame size small.
257 --*/
258
259
260 UInt16 cost[BZ_N_GROUPS];
261 Int32 fave[BZ_N_GROUPS];
262
263 if (s->verbosity >= 3)
264 VPrintf3( " %d in block, %d after MTF & 1-2 coding, "
265 "%d+2 syms in use\n",
266 s->nblock, s->nMTF, s->nInUse );
267
268 alphaSize = s->nInUse+2;
269 for (t = 0; t < BZ_N_GROUPS; t++)
270 for (v = 0; v < alphaSize; v++)
271 s->len[t][v] = BZ_GREATER_ICOST;
272
273 /*--- Decide how many coding tables to use ---*/
274 AssertH ( s->nMTF > 0, 3001 );
275 if (s->nMTF < 200) nGroups = 2; else
276 if (s->nMTF < 600) nGroups = 3; else
277 if (s->nMTF < 1200) nGroups = 4; else
278 if (s->nMTF < 2400) nGroups = 5; else
279 nGroups = 6;
280
281 /*--- Generate an initial set of coding tables ---*/
282 {
283 Int32 nPart, remF, tFreq, aFreq;
284
285 nPart = nGroups;
286 remF = s->nMTF;
287 gs = 0;
288 while (nPart > 0) {
289 tFreq = remF / nPart;
290 ge = gs-1;
291 aFreq = 0;
292 while (aFreq < tFreq && ge < alphaSize-1) {
293 ge++;
294 aFreq += s->mtfFreq[ge];
295 }
296
297 if (ge > gs
298 && nPart != nGroups && nPart != 1
299 && ((nGroups-nPart) % 2 == 1)) {
300 aFreq -= s->mtfFreq[ge];
301 ge--;
302 }
303
304 if (s->verbosity >= 3)
305 VPrintf5( " initial group %d, [%d .. %d], "
306 "has %d syms (%4.1f%%)\n",
307 nPart, gs, ge, aFreq,
308 (100.0 * (float)aFreq) / (float)(s->nMTF) );
309
310 for (v = 0; v < alphaSize; v++)
311 if (v >= gs && v <= ge)
312 s->len[nPart-1][v] = BZ_LESSER_ICOST; else
313 s->len[nPart-1][v] = BZ_GREATER_ICOST;
314
315 nPart--;
316 gs = ge+1;
317 remF -= aFreq;
318 }
319 }
320
321 /*---
322 Iterate up to BZ_N_ITERS times to improve the tables.
323 ---*/
324 for (iter = 0; iter < BZ_N_ITERS; iter++) {
325
326 for (t = 0; t < nGroups; t++) fave[t] = 0;
327
328 for (t = 0; t < nGroups; t++)
329 for (v = 0; v < alphaSize; v++)
330 s->rfreq[t][v] = 0;
331
332 nSelectors = 0;
333 totc = 0;
334 gs = 0;
335 while (True) {
336
337 /*--- Set group start & end marks. --*/
338 if (gs >= s->nMTF) break;
339 ge = gs + BZ_G_SIZE - 1;
340 if (ge >= s->nMTF) ge = s->nMTF-1;
341
342 /*--
343 Calculate the cost of this group as coded
344 by each of the coding tables.
345 --*/
346 for (t = 0; t < nGroups; t++) cost[t] = 0;
347
348 if (nGroups == 6) {
349 register UInt16 cost0, cost1, cost2, cost3, cost4, cost5;
350 cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0;
351 for (i = gs; i <= ge; i++) {
352 UInt16 icv = s->szptr[i];
353 cost0 += s->len[0][icv];
354 cost1 += s->len[1][icv];
355 cost2 += s->len[2][icv];
356 cost3 += s->len[3][icv];
357 cost4 += s->len[4][icv];
358 cost5 += s->len[5][icv];
359 }
360 cost[0] = cost0; cost[1] = cost1; cost[2] = cost2;
361 cost[3] = cost3; cost[4] = cost4; cost[5] = cost5;
362 } else {
363 for (i = gs; i <= ge; i++) {
364 UInt16 icv = s->szptr[i];
365 for (t = 0; t < nGroups; t++) cost[t] += s->len[t][icv];
366 }
367 }
368
369 /*--
370 Find the coding table which is best for this group,
371 and record its identity in the selector table.
372 --*/
373 bc = 999999999; bt = -1;
374 for (t = 0; t < nGroups; t++)
375 if (cost[t] < bc) { bc = cost[t]; bt = t; };
376 totc += bc;
377 fave[bt]++;
378 s->selector[nSelectors] = bt;
379 nSelectors++;
380
381 /*--
382 Increment the symbol frequencies for the selected table.
383 --*/
384 for (i = gs; i <= ge; i++)
385 s->rfreq[bt][ s->szptr[i] ]++;
386
387 gs = ge+1;
388 }
389 if (s->verbosity >= 3) {
390 VPrintf2 ( " pass %d: size is %d, grp uses are ",
391 iter+1, totc/8 );
392 for (t = 0; t < nGroups; t++)
393 VPrintf1 ( "%d ", fave[t] );
394 VPrintf0 ( "\n" );
395 }
396
397 /*--
398 Recompute the tables based on the accumulated frequencies.
399 --*/
400 for (t = 0; t < nGroups; t++)
401 hbMakeCodeLengths ( &(s->len[t][0]), &(s->rfreq[t][0]),
402 alphaSize, 20 );
403 }
404
405
406 AssertH( nGroups < 8, 3002 );
407 AssertH( nSelectors < 32768 &&
408 nSelectors <= (2 + (900000 / BZ_G_SIZE)),
409 3003 );
410
411
412 /*--- Compute MTF values for the selectors. ---*/
413 {
414 UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp;
415 for (i = 0; i < nGroups; i++) pos[i] = i;
416 for (i = 0; i < nSelectors; i++) {
417 ll_i = s->selector[i];
418 j = 0;
419 tmp = pos[j];
420 while ( ll_i != tmp ) {
421 j++;
422 tmp2 = tmp;
423 tmp = pos[j];
424 pos[j] = tmp2;
425 };
426 pos[0] = tmp;
427 s->selectorMtf[i] = j;
428 }
429 };
430
431 /*--- Assign actual codes for the tables. --*/
432 for (t = 0; t < nGroups; t++) {
433 minLen = 32;
434 maxLen = 0;
435 for (i = 0; i < alphaSize; i++) {
436 if (s->len[t][i] > maxLen) maxLen = s->len[t][i];
437 if (s->len[t][i] < minLen) minLen = s->len[t][i];
438 }
439 AssertH ( !(maxLen > 20), 3004 );
440 AssertH ( !(minLen < 1), 3005 );
441 hbAssignCodes ( &(s->code[t][0]), &(s->len[t][0]),
442 minLen, maxLen, alphaSize );
443 }
444
445 /*--- Transmit the mapping table. ---*/
446 {
447 Bool inUse16[16];
448 for (i = 0; i < 16; i++) {
449 inUse16[i] = False;
450 for (j = 0; j < 16; j++)
451 if (s->inUse[i * 16 + j]) inUse16[i] = True;
452 }
453
454 nBytes = s->numZ;
455 for (i = 0; i < 16; i++)
456 if (inUse16[i]) bsW(s,1,1); else bsW(s,1,0);
457
458 for (i = 0; i < 16; i++)
459 if (inUse16[i])
460 for (j = 0; j < 16; j++) {
461 if (s->inUse[i * 16 + j]) bsW(s,1,1); else bsW(s,1,0);
462 }
463
464 if (s->verbosity >= 3)
465 VPrintf1( " bytes: mapping %d, ", s->numZ-nBytes );
466 }
467
468 /*--- Now the selectors. ---*/
469 nBytes = s->numZ;
470 bsW ( s, 3, nGroups );
471 bsW ( s, 15, nSelectors );
472 for (i = 0; i < nSelectors; i++) {
473 for (j = 0; j < s->selectorMtf[i]; j++) bsW(s,1,1);
474 bsW(s,1,0);
475 }
476 if (s->verbosity >= 3)
477 VPrintf1( "selectors %d, ", s->numZ-nBytes );
478
479 /*--- Now the coding tables. ---*/
480 nBytes = s->numZ;
481
482 for (t = 0; t < nGroups; t++) {
483 Int32 curr = s->len[t][0];
484 bsW ( s, 5, curr );
485 for (i = 0; i < alphaSize; i++) {
486 while (curr < s->len[t][i]) { bsW(s,2,2); curr++; /* 10 */ };
487 while (curr > s->len[t][i]) { bsW(s,2,3); curr--; /* 11 */ };
488 bsW ( s, 1, 0 );
489 }
490 }
491
492 if (s->verbosity >= 3)
493 VPrintf1 ( "code lengths %d, ", s->numZ-nBytes );
494
495 /*--- And finally, the block data proper ---*/
496 nBytes = s->numZ;
497 selCtr = 0;
498 gs = 0;
499 while (True) {
500 if (gs >= s->nMTF) break;
501 ge = gs + BZ_G_SIZE - 1;
502 if (ge >= s->nMTF) ge = s->nMTF-1;
503 for (i = gs; i <= ge; i++) {
504 AssertH ( s->selector[selCtr] < nGroups, 3006 );
505 bsW ( s,
506 s->len [s->selector[selCtr]] [s->szptr[i]],
507 s->code [s->selector[selCtr]] [s->szptr[i]] );
508 }
509
510 gs = ge+1;
511 selCtr++;
512 }
513 AssertH( selCtr == nSelectors, 3007 );
514
515 if (s->verbosity >= 3)
516 VPrintf1( "codes %d\n", s->numZ-nBytes );
517 }
518
519
520 /*---------------------------------------------------*/
compressBlock(EState * s,Bool is_last_block)521 void compressBlock ( EState* s, Bool is_last_block )
522 {
523 if (s->nblock > 0) {
524
525 BZ_FINALISE_CRC ( s->blockCRC );
526 s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
527 s->combinedCRC ^= s->blockCRC;
528 if (s->blockNo > 1) s->numZ = 0;
529
530 if (s->verbosity >= 2)
531 VPrintf4( " block %d: crc = 0x%8x, "
532 "combined CRC = 0x%8x, size = %d\n",
533 s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
534
535 blockSort ( s );
536 }
537
538 /*-- If this is the first block, create the stream header. --*/
539 if (s->blockNo == 1) {
540 bsInitWrite ( s );
541 bsPutUChar ( s, 'B' );
542 bsPutUChar ( s, 'Z' );
543 bsPutUChar ( s, 'h' );
544 bsPutUChar ( s, '0' + s->blockSize100k );
545 }
546
547 if (s->nblock > 0) {
548
549 bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
550 bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
551 bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
552
553 /*-- Now the block's CRC, so it is in a known place. --*/
554 bsPutUInt32 ( s, s->blockCRC );
555
556 /*-- Now a single bit indicating randomisation. --*/
557 if (s->blockRandomised) {
558 bsW(s,1,1); s->nBlocksRandomised++;
559 } else
560 bsW(s,1,0);
561
562 bsW ( s, 24, s->origPtr );
563 generateMTFValues ( s );
564 sendMTFValues ( s );
565 }
566
567
568 /*-- If this is the last block, add the stream trailer. --*/
569 if (is_last_block) {
570
571 if (s->verbosity >= 2 && s->nBlocksRandomised > 0)
572 VPrintf2 ( " %d block%s needed randomisation\n",
573 s->nBlocksRandomised,
574 s->nBlocksRandomised == 1 ? "" : "s" );
575
576 bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
577 bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
578 bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
579 bsPutUInt32 ( s, s->combinedCRC );
580 if (s->verbosity >= 2)
581 VPrintf1( " final combined CRC = 0x%x\n ", s->combinedCRC );
582 bsFinishWrite ( s );
583 }
584 }
585
586
587 /*-------------------------------------------------------------*/
588 /*--- end compress.c ---*/
589 /*-------------------------------------------------------------*/
590