1
2 /*-----------------------------------------------------------*/
3 /*--- A block-sorting, lossless compressor bzip.c ---*/
4 /*-----------------------------------------------------------*/
5
6 /*--
7 This program is BZIP, a lossless, block-sorting data compressor,
8 version 0.21, dated 25-August-1996.
9
10 Copyright (C) 1996 by Julian Seward.
11 Department of Computer Science, University of Manchester,
12 Oxford Road, Manchester M13 9PL, UK.
13 email: sewardj@cs.man.ac.uk
14
15 This program is free software; you can redistribute it and/or modify
16 it under the terms of the GNU General Public License as published by
17 the Free Software Foundation; either version 2 of the License, or
18 (at your option) any later version.
19
20 This program is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 GNU General Public License for more details.
24
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, write to the Free Software
27 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28
29 The GNU General Public License is contained in the file LICENSE.
30
31 This program is based on (at least) the work of:
32 Mike Burrows
33 David Wheeler
34 Peter Fenwick
35 Alistair Moffat
36 Radford Neal
37 Ian H. Witten
38
39 For more information on these sources, see the file ALGORITHMS.
40 --*/
41
42 /*----------------------------------------------------*/
43 /*--- IMPORTANT ---*/
44 /*----------------------------------------------------*/
45
46 /*--
47 WARNING:
48 This program (attempts to) compress data by performing several
49 non-trivial transformations on it. Unless you are 100% familiar
50 with *all* the algorithms contained herein, and with the
51 consequences of modifying them, you should NOT meddle with the
52 compression or decompression machinery. Incorrect changes can
53 and very likely *will* lead to disasterous loss of data.
54
55 DISCLAIMER:
56 I TAKE NO RESPONSIBILITY FOR ANY LOSS OF DATA ARISING FROM THE
57 USE OF THIS PROGRAM, HOWSOEVER CAUSED.
58
59 Every compression of a file implies an assumption that the
60 compressed file can be decompressed to reproduce the original.
61 Great efforts in design, coding and testing have been made to
62 ensure that this program works correctly. However, the
63 complexity of the algorithms, and, in particular, the presence
64 of various special cases in the code which occur with very low
65 but non-zero probability make it impossible to rule out the
66 possibility of bugs remaining in the program. DO NOT COMPRESS
67 ANY DATA WITH THIS PROGRAM UNLESS YOU ARE PREPARED TO ACCEPT THE
68 POSSIBILITY, HOWEVER SMALL, THAT THE DATA WILL NOT BE RECOVERABLE.
69
70 That is not to say this program is inherently unreliable.
71 Indeed, I very much hope the opposite is true. BZIP has been
72 carefully constructed and extensively tested.
73 --*/
74
75
76
77 /*----------------------------------------------------*/
78 /*--- and now for something much more pleasant :-) ---*/
79 /*----------------------------------------------------*/
80
81 /*---------------------------------------------*/
82 /*--
83 Place a 1 beside your platform, and 0 elsewhere.
84 --*/
85
86 #define BZ_UNIX_32 1 /*-- generic 32-bit Unix --*/
87 #define BZ_UNIX_64 0 /*-- generic 64-bit Unix --*/
88 #define BZ_WIN32_BORLANDC50 0 /*-- Win32/95/NT, Borland C++ 5.0 --*/
89 #define BZ_WIN32_MSVC20 0 /*-- Win32/95/NT, MS VC++ 2.0 --*/
90
91 #define BZ_UNIX (BZ_UNIX_32 | BZ_UNIX_64)
92
93
94 /*---------------------------------------------*/
95 /*--
96 Some stuff for all platforms.
97 --*/
98
99 #include <stdio.h>
100 #include <stdlib.h>
101 #include <assert.h>
102 #include <string.h>
103 #include <signal.h>
104 #include <errno.h>
105
106 #define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); }
107 #define ERROR_IF_NOT_ZERO(i) { if ((i) != 0) ioError(); }
108 #define ERROR_IF_MINUS_ONE(i) { if ((i) == (-1)) ioError(); }
109
110
111 /*---------------------------------------------*/
112 /*--
113 Platform-specific stuff.
114 --*/
115
116 #if BZ_UNIX_32
117 #include <sys/types.h>
118 #include <utime.h>
119 #include <unistd.h>
120 #if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) \
121 && !defined(__DragonFly__) /* stdlib already included above */
122 #include <malloc.h>
123 #endif
124 #include <sys/stat.h>
125 #include <sys/times.h>
126
127 #define Int32 int
128 #define UInt32 unsigned int
129 #define Char char
130 #define UChar unsigned char
131
132 #define PATH_SEP '/'
133 #define MY_LSTAT lstat
134 #define MY_S_IFREG S_ISREG
135 #define MY_STAT stat
136
137 #define APPEND_FILESPEC(root, name) \
138 root=snocString((root), (name))
139
140 #define SET_BINARY_MODE(fd) /**/
141
142 /*--
143 You should try very hard to persuade your C compiler
144 to inline the bits marked INLINE. Otherwise bzip will
145 run rather slowly. gcc version 2.x is recommended.
146 --*/
147 #ifdef __GNUC__
148 #define INLINE inline
149 #define NORETURN __attribute__ ((noreturn))
150 #else
151 #define INLINE /**/
152 #define NORETURN /**/
153 #endif
154 #endif
155
156
157 #if BZ_UNIX_64
158 Any volunteers? If so, please mail me the relevant bits.
159 #endif
160
161
162 #if BZ_WIN32_BORLANDC50
163 #include <dir.h>
164 #include <io.h>
165 #include <fcntl.h>
166 #include <sys\stat.h>
167 #include <utime.h>
168
169 #define Int32 int
170 #define UInt32 unsigned int
171 #define Char char
172 #define UChar unsigned char
173
174 #define INLINE /**/
175 #define NORETURN /**/
176 #define PATH_SEP '\\'
177 #define MY_S_IFREG(x) ((x) & S_IFREG)
178 #define MY_LSTAT stat
179 #define MY_STAT stat
180
181 #define APPEND_FILESPEC(root, spec) \
182 do { \
183 if ((spec)[0] == '-') { \
184 root = snocString((root), (spec)); \
185 } else { \
186 struct ffblk ffblk; \
187 int done; \
188 done = findfirst((spec), &ffblk, 0); \
189 if ( done ) { \
190 root = snocString ((root), (spec)); \
191 } else { \
192 while ( !done ) { \
193 root = snocString((root), \
194 &ffblk.ff_name[0]); \
195 done = findnext(&ffblk); \
196 } \
197 } \
198 } \
199 } while ( 0 )
200
201 #define SET_BINARY_MODE(fd) \
202 do { \
203 int retVal = setmode ( fileno ( fd ), \
204 O_BINARY ); \
205 ERROR_IF_MINUS_ONE ( retVal ); \
206 } while ( 0 )
207
208 #endif
209
210
211 #if BZ_WIN32_MSVC20
212 #include <io.h>
213 #include <fcntl.h>
214 #include <sys\stat.h>
215 #include <sys\utime.h>
216
217 #define Int32 int
218 #define UInt32 unsigned int
219 #define Char char
220 #define UChar unsigned char
221
222 #define INLINE /**/
223 #define NORETURN /**/
224 #define PATH_SEP '\\'
225 #define MY_LSTAT _stat
226 #define MY_STAT _stat
227 #define MY_S_IFREG(x) ((x) & _S_IFREG)
228
229 #define APPEND_FILESPEC(root, spec) \
230 do { \
231 if ((spec)[0] == '-') { \
232 root = snocString((root), (spec)); \
233 } else { \
234 struct _finddata_t c_file; \
235 long hFile; \
236 hFile = _findfirst((spec), &c_file); \
237 if ( hFile == -1L ) { \
238 root = snocString ((root), (spec)); \
239 } else { \
240 int anInt = 0; \
241 while ( anInt == 0 ) { \
242 root = snocString((root), \
243 &c_file.name[0]); \
244 anInt = _findnext(hFile, &c_file); \
245 } \
246 } \
247 } \
248 } while ( 0 )
249
250 #define SET_BINARY_MODE(fd) \
251 do { \
252 int retVal = setmode ( fileno ( fd ), \
253 O_BINARY ); \
254 ERROR_IF_MINUS_ONE ( retVal ); \
255 } while ( 0 )
256
257 #endif
258
259
260 /*---------------------------------------------*/
261 /*--
262 Some more stuff for all platforms :-)
263 --*/
264
265 #define Bool int
266 #define True 1
267 #define False 0
268
269 /*--
270 IntNative is your platform's `native' int size.
271 Only here to avoid probs with 64-bit platforms.
272 --*/
273 #define IntNative int
274
275
276 /*--
277 change to 1, or compile with -DDEBUG=1 to debug
278 --*/
279 #ifndef DEBUG
280 #define DEBUG 0
281 #endif
282
283
284 /*---------------------------------------------------*/
285 /*--- ---*/
286 /*---------------------------------------------------*/
287
288 /*--
289 Implementation notes, July 1996
290 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
291
292 Memory allocation
293 ~~~~~~~~~~~~~~~~~
294 All large data structures are allocated on the C heap,
295 for better or for worse. That includes the various
296 arrays of pointers, striped words, bytes and frequency
297 tables for compression and decompression. The only non-heap
298 allocated structures are the coding models and the
299 BitStream structures, but these are both small.
300
301 BZIP can operate at various block-sizes, ranging from
302 100k to 900k in 100k steps, and it allocates only as
303 much as it needs to. When compressing, we know from the
304 command-line options what the block-size is going to be,
305 so all allocation can be done at start-up; if that
306 succeeds, there can be no further allocation problems.
307
308 Decompression is more complicated. Each compressed file
309 contains, in its header, a byte indicating the block
310 size used for compression. This means BZIP potentially
311 needs to reallocate memory for each file it deals with,
312 which in turn opens the possibility for a memory allocation
313 failure part way through a run of files, by encountering
314 a file requiring a much larger block size than all the
315 ones preceding it.
316
317 The policy is to simply give up if a memory allocation
318 failure occurs. During decompression, it would be
319 possible to move on to subsequent files in the hope that
320 some might ask for a smaller block size, but the
321 complications for doing this seem more trouble than they
322 are worth.
323
324
325 Compressed file formats
326 ~~~~~~~~~~~~~~~~~~~~~~~
327 Perhaps the most important point is that compressed
328 files start with a 4-byte preamble:
329
330 'B' 'Z' -- a crude `magic number'
331
332 '0' -- file format version
333
334 '1' to '9' -- block size indicator
335
336 The third byte gives a trapdoor mechanism so that we can
337 change file formats and/or algorithm later, without
338 losing backwards compatibility. The fourth byte
339 indicates the compression block size; '1' stands for
340 100k, '2' for 200k, &c.
341
342 In the present file format (call it version '0') *all*
343 material after this 4-byte preamble is written via the
344 arithmetic coder, using various different models,
345 including the `bogus' non-adaptive 256-entry model used
346 to send bytes through the arithmetic coder. The overall
347 structure of this part is a sequence of blocks, each of
348 the format
349
350 origPtr run-length-coded MTF values EOB
351
352 origPtr is a 32-bit number (sent via bogusModel)
353 indicating the position in the sorted block of the
354 un-rotated original string. A negative value of
355 origPtr indicates that this is the last block.
356
357 Finally, after all the blocks, there is a 32-bit
358 CRC value, again sent using the `bogus' model.
359 The CRC applies to the entirety of the uncompressed
360 data stream.
361
362 The MTF values are coded exactly as with Fenwick's
363 `structured model', augmented with Wheeler's run-length
364 coding scheme for zeros. The basis model has an extra
365 symbol, EOB, denoting the end of the block; if that is
366 not encountered within the block size indicated by the
367 preamble, something is wrong.
368
369
370 Error conditions
371 ~~~~~~~~~~~~~~~~
372 Dealing with error conditions is the least satisfactory
373 aspect of BZIP. The policy is to try and leave the
374 filesystem in a consistent state, then quit, even if it
375 means not processing some of the files mentioned in the
376 command line. `A consistent state' means that a file
377 exists either in its compressed or uncompressed form,
378 but not both. This boils down to the rule `delete the
379 output file if an error condition occurs, leaving the
380 input intact'. Input files are only deleted when we can
381 be pretty sure the output file has been written and
382 closed successfully.
383
384 Errors are a dog because there's so many things to
385 deal with. The following can happen mid-file, and
386 require cleaning up.
387
388 internal `panics' -- indicating a bug
389 corrupted compressed file -- block overrun in MTF decode
390 bad magic number or version on compressed file
391 can't allocate enough memory to decompress this file
392 I/O error reading/writing/opening/closing
393 signal catches -- Control-C, SIGTERM, SIGHUP.
394
395 Other conditions, primarily pertaining to file names,
396 can be checked in-between files, which makes dealing
397 with them easier.
398 --*/
399
400
401
402
403 /*---------------------------------------------*/
404
405 Int32 bytesIn, bytesOut;
406 Bool verbose, veryVerbose;
407 Bool compressing, keepInputFiles;
408 UInt32 globalCrc;
409
410 #define OM_FILES_TO_FILES 1
411 #define OM_FILE_TO_STDOUT 2
412 #define OM_STDIN_TO_STDOUT 3
413 Int32 opMode;
414
415 Int32 longestFileName;
416 Char inName[1024];
417 Char outName[1024];
418 Char *progName;
419 Char progNameReally[1024];
420 FILE *outputHandleJustInCase;
421
422 void panic ( Char* ) NORETURN;
423 void ioError ( void ) NORETURN;
424 void compressOutOfMemory ( Int32, Int32 ) NORETURN;
425 void uncompressOutOfMemory ( Int32, Int32 ) NORETURN;
426 void blockOverrun ( void ) NORETURN;
427 void unblockError ( void ) NORETURN;
428 void crcError ( UInt32, UInt32 ) NORETURN;
429 void bitStreamEOF ( void ) NORETURN;
430 void cleanUpAndFail ( void ) NORETURN;
431 void compressedStreamEOF ( void ) NORETURN;
432
433
434 /*---------------------------------------------------*/
435 /*--- 32-bit CRC grunge ---*/
436 /*---------------------------------------------------*/
437
438 /*--
439 I think this is an implementation of the AUTODIN-II,
440 Ethernet & FDDI 32-bit CRC standard. Vaguely derived
441 from code by Rob Warnock, in Section 51 of the
442 comp.compression FAQ.
443 --*/
444
445 UInt32 crc32Table[256] = {
446
447 /*-- Ugly, innit? --*/
448
449 0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
450 0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
451 0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
452 0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
453 0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
454 0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
455 0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
456 0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
457 0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
458 0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
459 0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
460 0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
461 0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
462 0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
463 0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
464 0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
465 0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
466 0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
467 0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
468 0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
469 0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
470 0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
471 0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
472 0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
473 0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
474 0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
475 0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
476 0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
477 0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
478 0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
479 0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
480 0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
481 0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
482 0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
483 0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
484 0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
485 0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
486 0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
487 0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
488 0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
489 0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
490 0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
491 0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
492 0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
493 0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
494 0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
495 0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
496 0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
497 0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
498 0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
499 0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
500 0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
501 0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
502 0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
503 0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
504 0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
505 0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
506 0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
507 0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
508 0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
509 0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
510 0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
511 0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
512 0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
513 };
514
515
516 /*---------------------------------------------*/
initialiseCRC(void)517 void initialiseCRC ( void )
518 {
519 globalCrc = 0xffffffffL;
520 }
521
522
523 /*---------------------------------------------*/
getFinalCRC(void)524 UInt32 getFinalCRC ( void )
525 {
526 return ~globalCrc;
527 }
528
529
530 /*---------------------------------------------*/
getGlobalCRC(void)531 UInt32 getGlobalCRC ( void )
532 {
533 return globalCrc;
534 }
535
536
537 /*---------------------------------------------*/
setGlobalCRC(UInt32 newCrc)538 void setGlobalCRC ( UInt32 newCrc )
539 {
540 globalCrc = newCrc;
541 }
542
543
544 /*---------------------------------------------*/
545 #define UPDATE_CRC(crcVar,cha) \
546 { \
547 crcVar = (crcVar << 8) ^ \
548 crc32Table[(crcVar >> 24) ^ \
549 ((UChar)cha)]; \
550 }
551
552
553 /*---------------------------------------------------*/
554 /*--- Bit stream I/O ---*/
555 /*---------------------------------------------------*/
556
557 /*--
558 Although it seems a bit silly, we restrict
559 ourselves to one bitstream at a time, so as to
560 avoid malloc-ing the structure. This avoids any
561 possible fragmentation-style interactions with
562 the repeated malloc/free cycles of large areas
563 which happen during decompression.
564
565 bsInUse is a consistency-check feature.
566 --*/
567
568 typedef
569 struct {
570 FILE* handle;
571 Int32 buffer;
572 Int32 buffLive;
573 Char mode;
574 }
575 BitStream;
576
577 BitStream aBitStreamBuffer;
578 Bool bsInUse;
579
580
581 /*---------------------------------------------*/
bsOpenReadStream(FILE * stream)582 BitStream* bsOpenReadStream ( FILE* stream )
583 {
584 BitStream *bs;
585
586 if (bsInUse) panic ( "bsOpenReadStream" );
587 bsInUse = True;
588 bs = &aBitStreamBuffer;
589
590 bs->handle = stream;
591 bs->buffer = 0;
592 bs->buffLive = 0;
593 bs->mode = 'r';
594
595 return bs;
596 }
597
598
599 /*---------------------------------------------*/
bsOpenWriteStream(FILE * stream)600 BitStream* bsOpenWriteStream ( FILE* stream )
601 {
602 BitStream *bs;
603
604 if (bsInUse) panic ( "bsOpenWriteStream" );
605 bsInUse = True;
606 bs = &aBitStreamBuffer;
607
608 bs->handle = stream;
609 bs->buffer = 0;
610 bs->buffLive = 0;
611 bs->mode = 'w';
612
613 return bs;
614 }
615
616
617 /*---------------------------------------------*/
bsPutBit(BitStream * bs,Int32 bit)618 INLINE void bsPutBit ( BitStream* bs, Int32 bit )
619 {
620 if (bs->buffLive == 8) {
621 IntNative retVal = putc ( (UChar) bs->buffer, bs->handle );
622 ERROR_IF_EOF ( retVal );
623 bytesOut++;
624 bs->buffLive = 1;
625 bs->buffer = bit & 0x1;
626 } else {
627 bs->buffer = ( (bs->buffer << 1) | (bit & 0x1) );
628 bs->buffLive++;
629 };
630 }
631
632
633 /*---------------------------------------------*/
634 /*--
635 We abort if an EOF appears, regardless of whether
636 an I/O error has happened, or we've really hit
637 the end of the file. The pseudo-justification for
638 this is that the caller should interpret the bit
639 stream and decide for itself when the stream has
640 ended.
641 --*/
642
bsGetBit(BitStream * bs)643 INLINE Int32 bsGetBit ( BitStream* bs )
644 {
645 if (bs->buffLive > 0) {
646 bs->buffLive --;
647 return ( ((bs->buffer) >> (bs->buffLive)) & 0x1 );
648 } else {
649 IntNative retVal = getc ( bs->handle );
650 if ( retVal == EOF ) compressedStreamEOF();
651 bs->buffLive = 7;
652 bs->buffer = retVal;
653 if (bs->buffer == EOF) bitStreamEOF();
654 return ( ((bs->buffer) >> 7) & 0x1 );
655 }
656 }
657
658
659 /*---------------------------------------------*/
bsGetUChar(BitStream * bs)660 UChar bsGetUChar ( BitStream* bs )
661 {
662 Int32 i;
663 UInt32 c;
664
665 c = 0;
666 for (i = 0; i <= 7; i++)
667 c = (c << 1) | bsGetBit ( bs );
668
669 return (UChar)c;
670 }
671
672
673 /*---------------------------------------------*/
bsPutUChar(BitStream * bs,UChar c)674 void bsPutUChar ( BitStream* bs, UChar c )
675 {
676 Int32 i;
677 for (i = 7; i >= 0; i--)
678 bsPutBit ( bs, (((UInt32) c) >> i) & 0x1 );
679 }
680
681
682 /*---------------------------------------------*/
bsClose(BitStream * bs)683 void bsClose ( BitStream* bs )
684 {
685 IntNative retVal;
686 if (!bsInUse) panic ( "bsClose" );
687 bsInUse = False;
688
689 if ( bs->mode == 'w' ) {
690 while ( bs->buffLive < 8 ) {
691 bs->buffLive++;
692 bs->buffer <<= 1;
693 };
694 retVal = putc ( (UChar) (bs->buffer), bs->handle );
695 ERROR_IF_EOF ( retVal );
696 bytesOut++;
697 retVal = fflush ( bs->handle );
698 ERROR_IF_EOF ( retVal );
699 }
700 ERROR_IF_NOT_ZERO( ferror(bs->handle) );
701 retVal = fclose ( bs->handle );
702 ERROR_IF_EOF ( retVal );
703 }
704
705
706 /*---------------------------------------------------*/
707 /*--- Generic frequency-table stuff [data defn] ---*/
708 /*---------------------------------------------------*/
709
710 #define MAX_SYMBOLS 256
711
712 /*-- freq[0] is unused, and is kept at zero.
713 freq[MAX_SYMBOLS+1] is also unused and kept at zero.
714 This is for historical reasons, and is no longer
715 necessary.
716
717 The counts for symbols 1..numSymbols are
718 stored at freq[1] .. freq[numSymbols].
719
720 Presumably one should make sure that
721 ((incVal + noExceed) / 2) < noExceed
722 so that scaling always produces sensible results.
723
724 We take incValue == 0 to mean that the counts shouldn't
725 be incremented or scaled.
726
727 This data decl has to go before the arithmetic coding stuff.
728 --*/
729
730 typedef
731 struct {
732 UInt32 numScalings;
733 UInt32 numTraffic;
734 UInt32 totFreq;
735 UInt32 numSymbols;
736 UInt32 incValue;
737 UInt32 noExceed;
738 Char *name;
739 UInt32 freq[MAX_SYMBOLS + 2];
740 }
741 Model;
742
743
744 /*---------------------------------------------------*/
745 /*--- The DCC95 arithmetic coder. ---*/
746 /*---------------------------------------------------*/
747
748 /*--
749 This is a clean-room (ie, my own) implementation of the
750 coder described in ``Arithmetic Coding Revisited'',
751 by Alistair Moffat, Radford Neal and Ian Witten,
752 originally presented at the 1995 IEEE Data Compression
753 Conference, Snowbird, Utah, USA in March 1995.
754
755 The paper has evolved somewhat since then. This
756 implementation pertains to the June 1996 version of
757 the paper. In particular, we have an initial value
758 for R of 2^(b-1) rather than 2^(b-1) - 1, and termination
759 of coding (overly conservative here) is different.
760
761 I don't use the shift-add multiply/divide machinery;
762 I could, but it adds complexity & I'm not convinced about
763 the long-term architectural benefit of that approach.
764 I could be wrong.
765 --*/
766
767 #define TWO_TO_THE(n) (1 << (n))
768 #define MAX_BITS_OUTSTANDING 500000000
769
770 #define smallB 26
771 #define smallF 18
772
773 UInt32 bigL;
774 UInt32 bigR;
775 UInt32 bigD;
776 UInt32 bitsOutstanding;
777
778
779 /*---------------------------------------------*/
minUInt32(UInt32 a,UInt32 b)780 INLINE UInt32 minUInt32 ( UInt32 a, UInt32 b )
781 {
782 if (a < b) return a; else return b;
783 }
784
785
786 /*---------------------------------------------*/
arithCodeBitPlusFollow(BitStream * bs,UInt32 bit)787 INLINE void arithCodeBitPlusFollow ( BitStream *bs, UInt32 bit )
788 {
789 bsPutBit ( bs, bit );
790 while ( bitsOutstanding > 0 ) {
791 bsPutBit ( bs, 1 - bit );
792 bitsOutstanding --;
793 }
794 }
795
796
797 /*---------------------------------------------*/
arithCodeStartEncoding(BitStream * bs)798 void arithCodeStartEncoding ( BitStream *bs )
799 {
800 bigL = 0;
801 bigR = TWO_TO_THE ( smallB - 1 );
802 bitsOutstanding = 0;
803 }
804
805
806 /*---------------------------------------------*/
arithCodeDoneEncoding(BitStream * bs)807 void arithCodeDoneEncoding ( BitStream *bs )
808 {
809 Int32 i;
810
811 for (i = smallB; i >= 1; i--)
812 arithCodeBitPlusFollow ( bs, (bigL >> (i-1)) & 0x1 );
813 }
814
815
816 /*---------------------------------------------*/
arithCodeStartDecoding(BitStream * bs)817 void arithCodeStartDecoding ( BitStream *bs )
818 {
819 Int32 i;
820
821 bigL = 0;
822 bigR = TWO_TO_THE ( smallB - 1 );
823 bigD = 0;
824 for (i = 1; i <= smallB; i++)
825 bigD = (bigD << 1) + bsGetBit ( bs );
826 }
827
828
829 /*---------------------------------------------*/
arithCodeDoneDecoding(BitStream * bs)830 void arithCodeDoneDecoding ( BitStream *bs )
831 {
832 /*--- No action necessary. ---*/
833 }
834
835
836 /*---------------------------------------------*/
arithCodeRenormalise_Encode(BitStream * bs)837 INLINE void arithCodeRenormalise_Encode ( BitStream* bs )
838 {
839 while (bigR <= TWO_TO_THE ( smallB-2 ) ) {
840 if ( (bigL + bigR) <= TWO_TO_THE ( smallB-1 ) ) {
841 arithCodeBitPlusFollow ( bs, 0 );
842 } else
843 if ( TWO_TO_THE ( smallB-1 ) <= bigL ) {
844 arithCodeBitPlusFollow ( bs, 1 );
845 bigL = bigL - TWO_TO_THE ( smallB-1 );
846 } else {
847 bitsOutstanding++;
848 bigL = bigL - TWO_TO_THE ( smallB-2 );
849 }
850 bigL = 2 * bigL;
851 bigR = 2 * bigR;
852 }
853 }
854
855
856 /*---------------------------------------------*/
arithCodeSymbol(BitStream * bs,Model * m,Int32 symbol)857 void arithCodeSymbol ( BitStream *bs, Model *m, Int32 symbol )
858 {
859 UInt32 smallL, smallH, smallT, smallR, smallR_x_smallL;
860 Int32 i;
861
862 #if DEBUG
863 assert ( TWO_TO_THE ( smallB-2 ) < bigR );
864 assert ( bigR <= TWO_TO_THE ( smallB-1 ) );
865 assert ( 0 <= bigL );
866 assert ( bigL < TWO_TO_THE ( smallB ) - TWO_TO_THE ( smallB-2 ) );
867 assert ( (bigL + bigR) <= TWO_TO_THE ( smallB ) );
868 #endif
869
870 /*--- Set smallL and smallH to the cumfreq values
871 respectively prior to and including symbol.
872 ---*/
873 smallT = m->totFreq;
874 smallL = 0;
875 for (i = 1; i < symbol; i++) smallL += m->freq[i];
876 smallH = smallL + m->freq[symbol];
877
878 smallR = bigR / smallT;
879
880 smallR_x_smallL = smallR * smallL;
881 bigL = bigL + smallR_x_smallL;
882
883 if (smallH < smallT)
884 bigR = smallR * (smallH - smallL); else
885 bigR = bigR - smallR_x_smallL;
886
887 arithCodeRenormalise_Encode ( bs );
888
889 if (bitsOutstanding > MAX_BITS_OUTSTANDING)
890 panic ( "arithCodeSymbol: too many bits outstanding" );
891 }
892
893
894 /*---------------------------------------------*/
arithDecodeSymbol(BitStream * bs,Model * m)895 Int32 arithDecodeSymbol ( BitStream *bs, Model *m )
896 {
897 UInt32 smallL, smallH, smallT, smallR;
898 UInt32 smallR_x_smallL, target, symbol;
899
900 smallT = m->totFreq;
901
902 /*--- Get target value. ---*/
903 smallR = bigR / smallT;
904 target = minUInt32 ( smallT-1, bigD / smallR );
905
906 symbol = 0;
907 smallH = 0;
908 while (smallH <= target) {
909 symbol++;
910 smallH += m->freq[symbol];
911 }
912 smallL = smallH - m->freq[symbol];
913
914 smallR_x_smallL = smallR * smallL;
915 bigD = bigD - smallR_x_smallL;
916
917 if (smallH < smallT)
918 bigR = smallR * (smallH - smallL); else
919 bigR = bigR - smallR_x_smallL;
920
921 while ( bigR <= TWO_TO_THE ( smallB-2 ) ) {
922 bigR = 2 * bigR;
923 bigD = 2 * bigD + bsGetBit ( bs );
924 }
925
926 return symbol;
927 }
928
929
930 /*---------------------------------------------------*/
931 /*--- Generic frequency-table stuff [fn defns] ---*/
932 /*---------------------------------------------------*/
933
934 /*---------------------------------------------*/
initModel(Model * m,Char * initName,Int32 initNumSymbols,Int32 initIncValue,Int32 initNoExceed)935 void initModel ( Model *m,
936 Char *initName,
937 Int32 initNumSymbols,
938 Int32 initIncValue,
939 Int32 initNoExceed
940 )
941 {
942 Int32 i;
943
944 if (initIncValue == 0) {
945 m->totFreq = initNumSymbols;
946 for (i = 1; i <= initNumSymbols; i++)
947 m->freq[i] = 1;
948 } else {
949 m->totFreq = initNumSymbols * initIncValue;
950 for (i = 1; i <= initNumSymbols; i++)
951 m->freq[i] = initIncValue;
952 };
953
954 m->numSymbols = initNumSymbols;
955 m->incValue = initIncValue;
956 m->noExceed = initNoExceed;
957 m->name = initName;
958 m->freq[0] = 0;
959 m->freq[initNumSymbols + 1] = 0;
960 m->numScalings = 0;
961 }
962
963
964 /*---------------------------------------------*/
dumpModelStats(Model * m)965 void dumpModelStats ( Model *m )
966 {
967 fprintf (
968 stderr,
969 "model %s:\t scalings %d\n",
970 m->name,
971 m->numScalings
972 );
973 }
974
975
976 /*---------------------------------------------*/
updateModel(Model * m,Int32 symbol)977 INLINE void updateModel ( Model *m, Int32 symbol )
978 {
979 UInt32 i;
980
981 m->totFreq += m->incValue;
982 m->freq[symbol] += m->incValue;
983 if (m->totFreq > m->noExceed) {
984 m->totFreq = 0;
985 m->numScalings++;
986 for (i = 1; i <= m->numSymbols; i++) {
987 m->freq[i] = (m->freq[i] + 1) >> 1;
988 m->totFreq += m->freq[i];
989 }
990 }
991 }
992
993
994 /*---------------------------------------------*/
putSymbol(Model * m,Int32 symbol,BitStream * bs)995 INLINE void putSymbol ( Model *m, Int32 symbol, BitStream *bs )
996 {
997 #if DEBUG
998 if (! (symbol >= 1 && symbol <= m->numSymbols) )
999 fprintf ( stderr,
1000 "BAD, mod = %s, sym = %d, max = %d\n",
1001 m->name, symbol, m->numSymbols );
1002 #endif
1003
1004 arithCodeSymbol ( bs, m, symbol );
1005 updateModel ( m, symbol );
1006 }
1007
1008
1009 /*---------------------------------------------*/
getSymbol(Model * m,BitStream * bs)1010 INLINE Int32 getSymbol ( Model *m, BitStream *bs )
1011 {
1012 Int32 symbol;
1013
1014 symbol = arithDecodeSymbol ( bs, m );
1015 updateModel ( m, symbol );
1016
1017 #if DEBUG
1018 assert (symbol >= 1 && symbol <= m->numSymbols);
1019 #endif
1020
1021 return symbol;
1022 }
1023
1024
1025 /*---------------------------------------------------*/
1026 /*--- For sending bytes/words thru arith coder ---*/
1027 /*---------------------------------------------------*/
1028
1029 Model bogusModel;
1030
1031
1032 /*---------------------------------------------*/
initBogusModel(void)1033 void initBogusModel ( void )
1034 {
1035 initModel ( &bogusModel, "bogus", 256, 0, 256 );
1036 }
1037
1038
1039 /*---------------------------------------------*/
putUChar(BitStream * bs,UChar c)1040 void putUChar ( BitStream *bs, UChar c )
1041 {
1042 putSymbol ( &bogusModel, 1 + (UInt32)c, bs );
1043 }
1044
1045
1046 /*---------------------------------------------*/
putInt32(BitStream * bs,Int32 i)1047 void putInt32 ( BitStream *bs, Int32 i )
1048 {
1049 putUChar ( bs, (UChar) (((UInt32)i >> 24) & 0xFF) );
1050 putUChar ( bs, (UChar) (((UInt32)i >> 16) & 0xFF) );
1051 putUChar ( bs, (UChar) (((UInt32)i >> 8) & 0xFF) );
1052 putUChar ( bs, (UChar) ( (UInt32)i & 0xFF) );
1053 }
1054
1055
1056 /*---------------------------------------------*/
putUInt32(BitStream * bs,UInt32 i)1057 void putUInt32 ( BitStream *bs, UInt32 i )
1058 {
1059 putUChar ( bs, (UChar) ((i >> 24) & 0xFF) );
1060 putUChar ( bs, (UChar) ((i >> 16) & 0xFF) );
1061 putUChar ( bs, (UChar) ((i >> 8) & 0xFF) );
1062 putUChar ( bs, (UChar) ( i & 0xFF) );
1063 }
1064
1065
1066 /*---------------------------------------------*/
getUChar(BitStream * bs)1067 UChar getUChar ( BitStream *bs )
1068 {
1069 return (UChar) (getSymbol ( &bogusModel, bs ) - 1);
1070 }
1071
1072
1073 /*---------------------------------------------*/
getInt32(BitStream * bs)1074 Int32 getInt32 ( BitStream *bs )
1075 {
1076 UInt32 res = 0;
1077
1078 res |= (getUChar ( bs ) << 24);
1079 res |= (getUChar ( bs ) << 16);
1080 res |= (getUChar ( bs ) << 8);
1081 res |= (getUChar ( bs ) );
1082 return (Int32)res;
1083 }
1084
1085
1086 /*---------------------------------------------*/
getUInt32(BitStream * bs)1087 UInt32 getUInt32 ( BitStream *bs )
1088 {
1089 UInt32 res = 0;
1090
1091 res |= (getUChar ( bs ) << 24);
1092 res |= (getUChar ( bs ) << 16);
1093 res |= (getUChar ( bs ) << 8);
1094 res |= (getUChar ( bs ) );
1095 return res;
1096 }
1097
1098
1099 /*---------------------------------------------------*/
1100 /*--- The structured model proper ---*/
1101 /*---------------------------------------------------*/
1102
1103 #define BASIS 0
1104 #define MODEL_2_3 1
1105 #define MODEL_4_7 2
1106 #define MODEL_8_15 3
1107 #define MODEL_16_31 4
1108 #define MODEL_32_63 5
1109 #define MODEL_64_127 6
1110 #define MODEL_128_255 7
1111
1112 Model models[8];
1113
1114
1115 /*---------------------------------------------*/
1116 /*--
1117 The parameters in these models and bogusModel
1118 -- specifically, the value of 1000 for
1119 max-total-frequency -- determine the lowest
1120 acceptable values for smallF and indirectly smallB
1121 in the arithmetic coder above.
1122 --*/
initModels(void)1123 void initModels ( void )
1124 {
1125 initModel ( &models[BASIS], "basis", 11, 12, 1000 );
1126 initModel ( &models[MODEL_2_3], "2-3", 2, 4, 1000 );
1127 initModel ( &models[MODEL_4_7], "4-7", 4, 3, 1000 );
1128 initModel ( &models[MODEL_8_15], "8-15", 8, 3, 1000 );
1129 initModel ( &models[MODEL_16_31], "16-31", 16, 3, 1000 );
1130 initModel ( &models[MODEL_32_63], "32-63", 32, 3, 1000 );
1131 initModel ( &models[MODEL_64_127], "64-127", 64, 2, 1000 );
1132 initModel ( &models[MODEL_128_255], "128-255", 128, 1, 1000 );
1133 }
1134
1135
1136 /*---------------------------------------------*/
dumpAllModelStats(void)1137 void dumpAllModelStats ( void )
1138 {
1139 dumpModelStats ( &bogusModel );
1140 dumpModelStats ( &models[BASIS] );
1141 dumpModelStats ( &models[MODEL_2_3] );
1142 dumpModelStats ( &models[MODEL_4_7] );
1143 dumpModelStats ( &models[MODEL_8_15] );
1144 dumpModelStats ( &models[MODEL_16_31] );
1145 dumpModelStats ( &models[MODEL_32_63] );
1146 dumpModelStats ( &models[MODEL_64_127] );
1147 dumpModelStats ( &models[MODEL_128_255] );
1148 }
1149
1150
1151
1152 #define VAL_RUNA 1
1153 #define VAL_RUNB 2
1154 #define VAL_ONE 3
1155 #define VAL_2_3 4
1156 #define VAL_4_7 5
1157 #define VAL_8_15 6
1158 #define VAL_16_31 7
1159 #define VAL_32_63 8
1160 #define VAL_64_127 9
1161 #define VAL_128_255 10
1162 #define VAL_EOB 11
1163
1164 #define RUNA 257
1165 #define RUNB 258
1166 #define EOB 259
1167 #define INVALID 260
1168
1169
1170 /*---------------------------------------------*/
getMTFVal(BitStream * bs)1171 Int32 getMTFVal ( BitStream *bs )
1172 {
1173 Int32 retVal;
1174
1175 switch ( getSymbol ( &models[BASIS], bs ) ) {
1176 case VAL_EOB:
1177 retVal = EOB; break;
1178 case VAL_RUNA:
1179 retVal = RUNA; break;
1180 case VAL_RUNB:
1181 retVal = RUNB; break;
1182 case VAL_ONE:
1183 retVal = 1; break;
1184 case VAL_2_3:
1185 retVal = getSymbol ( &models[MODEL_2_3], bs ) + 2 - 1; break;
1186 case VAL_4_7:
1187 retVal = getSymbol ( &models[MODEL_4_7], bs ) + 4 - 1; break;
1188 case VAL_8_15:
1189 retVal = getSymbol ( &models[MODEL_8_15], bs ) + 8 - 1; break;
1190 case VAL_16_31:
1191 retVal = getSymbol ( &models[MODEL_16_31], bs ) + 16 - 1; break;
1192 case VAL_32_63:
1193 retVal = getSymbol ( &models[MODEL_32_63], bs ) + 32 - 1; break;
1194 case VAL_64_127:
1195 retVal = getSymbol ( &models[MODEL_64_127], bs ) + 64 - 1; break;
1196 default:
1197 retVal = getSymbol ( &models[MODEL_128_255], bs ) + 128 - 1; break;
1198 }
1199 return retVal;
1200 }
1201
1202
1203 /*---------------------------------------------*/
sendMTFVal(BitStream * bs,Int32 n)1204 void sendMTFVal ( BitStream *bs, Int32 n )
1205 {
1206 if (n == RUNA) putSymbol ( &models[BASIS], VAL_RUNA, bs ); else
1207 if (n == RUNB) putSymbol ( &models[BASIS], VAL_RUNB, bs ); else
1208 if (n == EOB ) putSymbol ( &models[BASIS], VAL_EOB, bs ); else
1209
1210 if (n == 1) putSymbol ( &models[BASIS], VAL_ONE, bs ); else
1211
1212 if (n >= 2 && n <= 3) {
1213 putSymbol ( &models[BASIS], VAL_2_3, bs );
1214 putSymbol ( &models[MODEL_2_3], n - 2 + 1, bs );
1215 } else
1216
1217 if (n >= 4 && n <= 7) {
1218 putSymbol ( &models[BASIS], VAL_4_7, bs );
1219 putSymbol ( &models[MODEL_4_7], n - 4 + 1, bs );
1220 } else
1221
1222 if (n >= 8 && n <= 15) {
1223 putSymbol ( &models[BASIS], VAL_8_15, bs );
1224 putSymbol ( &models[MODEL_8_15], n - 8 + 1, bs );
1225 } else
1226
1227 if (n >= 16 && n <= 31) {
1228 putSymbol ( &models[BASIS], VAL_16_31, bs );
1229 putSymbol ( &models[MODEL_16_31], n - 16 + 1, bs );
1230 } else
1231
1232 if (n >= 32 && n <= 63) {
1233 putSymbol ( &models[BASIS], VAL_32_63, bs );
1234 putSymbol ( &models[MODEL_32_63], n - 32 + 1, bs );
1235 } else
1236
1237 if (n >= 64 && n <= 127) {
1238 putSymbol ( &models[BASIS], VAL_64_127, bs );
1239 putSymbol ( &models[MODEL_64_127], n - 64 + 1, bs );
1240 } else
1241
1242 if (n >= 128 && n <= 255) {
1243 putSymbol ( &models[BASIS], VAL_128_255, bs );
1244 putSymbol ( &models[MODEL_128_255], n - 128 + 1, bs );
1245 } else {
1246
1247 panic ( "sendMTFVal: bad value!" );
1248 }
1249 }
1250
1251
1252 /*---------------------------------------------------*/
1253 /*--- Move-to-front encoding/decoding ---*/
1254 /*---------------------------------------------------*/
1255
1256 /*--
1257 These are the main data structures for
1258 the Burrows-Wheeler transform.
1259 --*/
1260
1261 /*--
1262 For good performance, fullGt() allows pointers
1263 to get partially denormalised. As a consequence,
1264 we have to copy some small quantity of data
1265 from the beginning of a block to the end of it
1266 so things still work right. These constants control
1267 that.
1268 --*/
1269 #define NUM_FULLGT_UNROLLINGS 4
1270 #define MAX_DENORM_OFFSET (4 * NUM_FULLGT_UNROLLINGS)
1271
1272
1273 /*--
1274 Pointers to compression and decompression
1275 structures. Set by
1276 allocateCompressStructures and
1277 setDecompressStructureSizes
1278
1279 The structures are always set to be suitable
1280 for a block of size 100000 * blockSize100k.
1281 --*/
1282 UInt32 *words; /*-- compress --*/
1283 Int32 *zptr; /*-- compress & uncompress --*/
1284 Int32 *ftab; /*-- compress --*/
1285
1286 UChar *block; /*-- uncompress --*/
1287 UChar *ll; /*-- uncompress --*/
1288
1289
1290
1291 /*--
1292 always: lastPP == last+1.
1293 See discussion in sortIt().
1294 --*/
1295 Int32 last;
1296 Int32 lastPP;
1297
1298
1299 /*--
1300 index in ptr[] of original string after sorting.
1301 --*/
1302 Int32 origPtr;
1303
1304
1305 /*--
1306 always: in the range 0 .. 9.
1307 The current block size is 100000 * this number.
1308 --*/
1309 Int32 blockSize100k;
1310
1311
1312 /*---------------------------------------------*/
1313 /*--
1314 Manage memory for compression/decompression.
1315 When compressing, a single block size applies to
1316 all files processed, and that's set when the
1317 program starts. But when decompressing, each file
1318 processed could have been compressed with a
1319 different block size, so we may have to free
1320 and reallocate on a per-file basis.
1321
1322 A call with argument of zero means
1323 `free up everything.' And a value of zero for
1324 blockSize100k means no memory is currently allocated.
1325 --*/
1326
1327
1328 /*---------------------------------------------*/
allocateCompressStructures(void)1329 void allocateCompressStructures ( void )
1330 {
1331 Int32 n = 100000 * blockSize100k;
1332 words = malloc ( (n + MAX_DENORM_OFFSET) * sizeof(Int32) );
1333 zptr = malloc ( n * sizeof(Int32) );
1334 ftab = malloc ( 65537 * sizeof(Int32) );
1335
1336 if (words == NULL || zptr == NULL || ftab == NULL) {
1337 Int32 totalDraw = (n + MAX_DENORM_OFFSET) * sizeof(Int32) +
1338 n * sizeof(Int32) +
1339 65537 * sizeof(Int32);
1340
1341 compressOutOfMemory ( totalDraw, n );
1342 }
1343 }
1344
1345
1346 /*---------------------------------------------*/
setDecompressStructureSizes(Int32 newSize100k)1347 void setDecompressStructureSizes ( Int32 newSize100k )
1348 {
1349 assert (0 <= newSize100k && newSize100k <= 9);
1350 assert (0 <= blockSize100k && blockSize100k <= 9);
1351
1352 if (newSize100k == blockSize100k)
1353 return;
1354
1355 blockSize100k = newSize100k;
1356
1357 if (block != NULL) free ( block );
1358 if (ll != NULL) free ( ll );
1359 if (zptr != NULL) free ( zptr );
1360
1361 if (newSize100k == 0) {
1362 block = NULL;
1363 ll = NULL;
1364 zptr = NULL;
1365 } else {
1366 Int32 n = 100000 * newSize100k;
1367 block = malloc ( n * sizeof(UChar) );
1368 ll = malloc ( n * sizeof(UChar) );
1369 zptr = malloc ( n * sizeof(Int32) );
1370
1371 if (block == NULL || ll == NULL || zptr == NULL) {
1372 Int32 totalDraw = 6 * n * sizeof(UChar);
1373 uncompressOutOfMemory ( totalDraw, n );
1374 }
1375 }
1376 }
1377
1378
1379 /*---------------------------------------------*/
1380 #define IF_THEN_ELSE(c,t,e) ((c) ? (t) : (e))
1381
1382 #define GETFIRST(a) ((UChar)(words[a] >> 24))
1383 #define GETREST(a) (words[a] & 0x00ffffff)
1384 #define SETALL(a,w) words[a] = (w)
1385 #define GETFIRST16(a) ((UInt32)(words[a] >> 16))
1386 #define GETREST16(a) (words[a] & 0x0000ffff)
1387
GETALL(Int32 a)1388 INLINE UInt32 GETALL ( Int32 a )
1389 {
1390 #if DEBUG
1391 assert (a >= 0 && a < lastPP + 4 * NUM_FULLGT_UNROLLINGS);
1392 if (a >= lastPP) assert (words[a] == words[a-lastPP]);
1393 #endif
1394 return words[a];
1395 }
1396
SETREST16(Int32 a,UInt32 w)1397 INLINE void SETREST16 ( Int32 a, UInt32 w )
1398 {
1399 words[a] = (words[a] & 0xffff0000) | (((UInt32)(w)) & 0x0000ffff);
1400 }
1401
SETFIRST16(Int32 a,UInt32 w)1402 INLINE void SETFIRST16 ( Int32 a, UInt32 w )
1403 {
1404 words[a] = (words[a] & 0x0000ffff) | (((UInt32)(w)) << 16);
1405 }
1406
SETREST(Int32 a,UInt32 w)1407 INLINE void SETREST ( Int32 a, UInt32 w )
1408 {
1409 words[a] = (words[a] & 0xff000000) | (((UInt32)(w)) & 0x00ffffff);
1410 }
1411
SETFIRST(Int32 a,UChar c)1412 INLINE void SETFIRST ( Int32 a, UChar c )
1413 {
1414 words[a] = (words[a] & 0x00ffffff) | (((UInt32)(c)) << 24);
1415 }
1416
SETSECOND(Int32 a,UChar c)1417 INLINE void SETSECOND ( Int32 a, UChar c )
1418 {
1419 words[a] = (words[a] & 0xff00ffff) | (((UInt32)(c)) << 16);
1420 }
1421
SETTHIRD(Int32 a,UChar c)1422 INLINE void SETTHIRD ( Int32 a, UChar c )
1423 {
1424 words[a] = (words[a] & 0xffff00ff) | (((UInt32)(c)) << 8);
1425 }
1426
SETFOURTH(Int32 a,UChar c)1427 INLINE void SETFOURTH ( Int32 a, UChar c )
1428 {
1429 words[a] = (words[a] & 0xffffff00) | (((UInt32)(c)));
1430 }
1431
1432
1433 /*---------------------------------------------*/
NORMALISE(Int32 p)1434 INLINE Int32 NORMALISE ( Int32 p )
1435 {
1436 return
1437 IF_THEN_ELSE ( ((p) < 0),
1438 ((p)+lastPP),
1439 IF_THEN_ELSE ( ((p)>=lastPP),
1440 ((p)-lastPP),
1441 (p)
1442 )
1443 );
1444 }
1445
1446
1447 /*---------------------------------------------*/
NORMALISEHI(Int32 p)1448 INLINE Int32 NORMALISEHI ( Int32 p )
1449 {
1450 return
1451 IF_THEN_ELSE ( ((p)>=lastPP),
1452 ((p)-lastPP),
1453 (p)
1454 );
1455 }
1456
1457
1458 /*---------------------------------------------*/
NORMALISELO(Int32 p)1459 INLINE Int32 NORMALISELO ( Int32 p )
1460 {
1461 return
1462 IF_THEN_ELSE ( ((p) < 0),
1463 ((p)+lastPP),
1464 (p)
1465 );
1466 }
1467
1468
1469 /*---------------------------------------------*/
1470 /* The above normalisers are quick but only work when
1471 * p exceeds the block by less than lastPP, since
1472 * they renormalise merely by adding or subtracting
1473 * lastPP. This one always works, although slowly.
1474 */
STRONG_NORMALISE(Int32 p)1475 INLINE Int32 STRONG_NORMALISE ( Int32 p )
1476 {
1477 /* -ve number MOD +ve number always
1478 * was one of life's little mysteries ...
1479 */
1480 while (p < 0) { p += lastPP; };
1481 return
1482 p % lastPP;
1483 }
1484
1485
1486 /*---------------------------------------------*/
sendZeroes(BitStream * outStream,Int32 zeroesPending)1487 void sendZeroes ( BitStream *outStream, Int32 zeroesPending )
1488 {
1489 UInt32 bitsToSend;
1490 Int32 numBits;
1491
1492 if (zeroesPending == 0)
1493 return;
1494
1495 bitsToSend = 0;
1496 numBits = 0;
1497 while (zeroesPending != 0) {
1498 numBits++;
1499 bitsToSend <<= 1;
1500 zeroesPending--;
1501 if ((zeroesPending & 0x1) == 1) bitsToSend |= 1;
1502 zeroesPending >>= 1;
1503 }
1504 while (numBits > 0) {
1505 if ((bitsToSend & 0x1) == 1)
1506 sendMTFVal ( outStream, RUNA ); else
1507 sendMTFVal ( outStream, RUNB );
1508 bitsToSend >>= 1;
1509 numBits--;
1510 }
1511 }
1512
1513
1514 /*---------------------------------------------*/
moveToFrontCodeAndSend(BitStream * outStream,Bool thisIsTheLastBlock)1515 void moveToFrontCodeAndSend ( BitStream *outStream,
1516 Bool thisIsTheLastBlock
1517 )
1518 {
1519 UChar yy[256];
1520 Int32 i, j;
1521 UChar tmp;
1522 UChar tmp2;
1523 Int32 zeroesPending;
1524
1525 zeroesPending = 0;
1526 if (thisIsTheLastBlock)
1527 putInt32 ( outStream, - ( origPtr+1 ) ); else
1528 putInt32 ( outStream, ( origPtr+1 ) );
1529
1530 initModels ();
1531
1532 for (i = 0; i <= 255; i++)
1533 yy[i] = (UChar) i;
1534
1535 for (i = 0; i <= last; i++) {
1536 UChar ll_i;
1537
1538 ll_i = GETFIRST ( NORMALISELO ( zptr[i] - 1 ) );
1539
1540 j = 0;
1541 tmp = yy[j];
1542 while ( ll_i != tmp ) {
1543 j++;
1544 tmp2 = tmp;
1545 tmp = yy[j];
1546 yy[j] = tmp2;
1547 };
1548 yy[0] = tmp;
1549
1550 if (j == 0) {
1551 zeroesPending++;
1552 } else {
1553 sendZeroes ( outStream, zeroesPending );
1554 zeroesPending = 0;
1555 sendMTFVal ( outStream, j );
1556 }
1557
1558 }
1559 sendZeroes ( outStream, zeroesPending );
1560 sendMTFVal ( outStream, EOB );
1561 }
1562
1563
1564 /*---------------------------------------------*/
getAndMoveToFrontDecode(BitStream * inStream)1565 Bool getAndMoveToFrontDecode ( BitStream *inStream )
1566 {
1567 UChar yy[256];
1568 Int32 i, j, tmpOrigPtr, nextSym, limit;
1569
1570 limit = 100000 * blockSize100k;
1571
1572 tmpOrigPtr = getInt32 ( inStream );
1573 if (tmpOrigPtr < 0)
1574 origPtr = ( -tmpOrigPtr ) - 1; else
1575 origPtr = tmpOrigPtr - 1;
1576
1577 initModels ();
1578
1579 for (i = 0; i <= 255; i++)
1580 yy[i] = (UChar) i;
1581
1582 last = -1;
1583
1584 nextSym = getMTFVal ( inStream );
1585
1586 LOOPSTART:
1587
1588 if (nextSym == EOB)
1589 return (tmpOrigPtr < 0);
1590
1591 /*--- acquire run-length bits, most significant first ---*/
1592 if (nextSym == RUNA || nextSym == RUNB) {
1593 Int32 n = 0;
1594 do {
1595 n <<= 1;
1596 if (nextSym == RUNA) n |= 1;
1597 n++;
1598 nextSym = getMTFVal ( inStream );
1599 }
1600 while (nextSym == RUNA || nextSym == RUNB);
1601 while (n > 0) {
1602 last++; if (last >= limit) blockOverrun();
1603 ll[last] = yy[0];
1604 n--;
1605 }
1606 goto LOOPSTART;
1607 }
1608
1609 if (nextSym >= 1 && nextSym <= 255) {
1610 last++; if (last >= limit) blockOverrun();
1611 ll[last] = yy[nextSym];
1612
1613 /*--
1614 This loop is hammered during decompression,
1615 hence the unrolling.
1616
1617 for (j = nextSym; j > 0; j--) yy[j] = yy[j-1];
1618 --*/
1619
1620 j = nextSym;
1621 for (; j > 3; j -= 4) {
1622 yy[j] = yy[j-1];
1623 yy[j-1] = yy[j-2];
1624 yy[j-2] = yy[j-3];
1625 yy[j-3] = yy[j-4];
1626 }
1627 for (; j > 0; j--) yy[j] = yy[j-1];
1628
1629 yy[0] = ll[last];
1630 nextSym = getMTFVal ( inStream );
1631 goto LOOPSTART;
1632 }
1633
1634 fprintf ( stderr, "bad MTF value %d\n", nextSym );
1635 panic ( "getAndMoveToFrontDecode\n" );
1636 /*--- panic never returns ---*/
1637 return True;
1638 }
1639
1640
1641 /*---------------------------------------------------*/
1642 /*--- Block-sorting machinery ---*/
1643 /*---------------------------------------------------*/
1644
1645 /*---------------------------------------------*/
1646 /* Doesn't work when lastPP < 4.
1647 */
stripe(void)1648 void stripe ( void )
1649 {
1650 Int32 i;
1651
1652 for (i = 0; i < lastPP; i++) {
1653 UChar c = GETFIRST(i);
1654 SETSECOND ( NORMALISELO(i-1), c );
1655 SETTHIRD ( NORMALISELO(i-2), c );
1656 SETFOURTH ( NORMALISELO(i-3), c );
1657 }
1658 }
1659
1660
1661 /*---------------------------------------------*/
1662 /* Doesn't work when
1663 * lastPP < 4 * NUM_FULLGT_UNROLLINGS.
1664 */
copyOffsetWords(void)1665 void copyOffsetWords ( void )
1666 {
1667 Int32 i;
1668
1669 for (i = 0; i < 4 * NUM_FULLGT_UNROLLINGS; i++)
1670 words[lastPP+i] = words[i];
1671 }
1672
1673
1674
1675 /*---------------------------------------------*/
1676 /* Doesn't work when
1677 * lastPP < 4 * NUM_FULLGT_UNROLLINGS.
1678 */
fullGt(Int32 i1,Int32 i2)1679 INLINE Bool fullGt ( Int32 i1, Int32 i2 )
1680 {
1681 Int32 i1orig = i1;
1682
1683 if (i1 == i2) return False;
1684
1685 do {
1686 UInt32 w1;
1687 UInt32 w2;
1688
1689 #if DEBUG
1690 assert (i1 >= 0);
1691 assert (i2 >= 0);
1692 assert (i1 != i2);
1693 assert (i1 < lastPP);
1694 assert (i2 < lastPP);
1695 #endif
1696
1697 w1 = GETALL(i1);
1698 w2 = GETALL(i2);
1699 if (w1 != w2) return (w1 > w2);
1700 i1 += 4;
1701 i2 += 4;
1702
1703 #if DEBUG
1704 assert (i1 < lastPP + 1 * NUM_FULLGT_UNROLLINGS);
1705 assert (i2 < lastPP + 1 * NUM_FULLGT_UNROLLINGS);
1706 #endif
1707
1708 w1 = GETALL(i1);
1709 w2 = GETALL(i2);
1710 if (w1 != w2) return (w1 > w2);
1711 i1 += 4;
1712 i2 += 4;
1713
1714 #if DEBUG
1715 assert (i1 < lastPP + 2 * NUM_FULLGT_UNROLLINGS);
1716 assert (i2 < lastPP + 2 * NUM_FULLGT_UNROLLINGS);
1717 #endif
1718
1719 w1 = GETALL(i1);
1720 w2 = GETALL(i2);
1721 if (w1 != w2) return (w1 > w2);
1722 i1 += 4;
1723 i2 += 4;
1724
1725 #if DEBUG
1726 assert (i1 < lastPP + 3 * NUM_FULLGT_UNROLLINGS);
1727 assert (i2 < lastPP + 3 * NUM_FULLGT_UNROLLINGS);
1728 #endif
1729
1730 w1 = GETALL(i1);
1731 w2 = GETALL(i2);
1732 if (w1 != w2) return (w1 > w2);
1733 i1 += 4;
1734 i2 += 4;
1735
1736 #if DEBUG
1737 assert (i1 < lastPP + 4 * NUM_FULLGT_UNROLLINGS);
1738 assert (i2 < lastPP + 4 * NUM_FULLGT_UNROLLINGS);
1739 #endif
1740
1741 i1 = NORMALISEHI(i1);
1742 i2 = NORMALISEHI(i2);
1743
1744 #if DEBUG
1745 assert (i1 >= 0);
1746 assert (i2 >= 0);
1747 assert (i1 != i2);
1748 assert (i1 < lastPP);
1749 assert (i2 < lastPP);
1750 #endif
1751
1752 }
1753 while (i1 != i1orig);
1754 return False;
1755 }
1756
1757
1758 /*---------------------------------------------*/
1759 /* Requires striping, and therefore doesn't
1760 work when lastPP < 4. This qsort is
1761 derived from Weiss' book "Data Structures and
1762 Algorithm Analysis in C", Section 7.7.
1763 */
1764
1765 #define ISORT_BELOW 10
1766
1767 #if DEBUG
rangeErr(Int32 x,Int32 wuL,Int32 wuR)1768 Int32 rangeErr ( Int32 x, Int32 wuL, Int32 wuR )
1769 {
1770 fprintf ( stderr, "\nRANGE ERROR: %d (%d, %d)\n",
1771 x, wuL, wuR);
1772 exit(2);
1773 }
1774 #define RC(x) \
1775 ( ((x) >= wuL && (x) <= wuR) ? (x) : rangeErr((x),wuL, wuR) )
1776 #else
1777 #define RC(x) (x)
1778 #endif
1779
1780 #define SWAP(za,zb) \
1781 { Int32 zl = (za); Int32 zr = (zb); \
1782 Int32 zt = zptr[RC(zl)]; zptr[RC(zl)] = zptr[RC(zr)]; \
1783 zptr[RC(zr)] = zt; \
1784 }
1785
1786
qsortFull(Int32 left,Int32 right)1787 void qsortFull ( Int32 left, Int32 right )
1788 {
1789 Int32 pivot, v;
1790 Int32 i, j;
1791 Int32 wuC;
1792
1793 Int32 stackL[40];
1794 Int32 stackR[40];
1795 Int32 sp = 0;
1796
1797 Int32 wuL = left;
1798 Int32 wuR = right;
1799
1800 while (True) {
1801
1802 /*--
1803 At the beginning of this loop, wuL and wuR hold the
1804 bounds of the next work-unit.
1805 --*/
1806 if (wuR - wuL > ISORT_BELOW) {
1807
1808 /*--
1809 a large Work Unit; partition-exchange
1810 --*/
1811 wuC = (wuL + wuR) >> 1;
1812 if (fullGt ( zptr[RC(wuL)], zptr[RC(wuC)] )) SWAP ( wuL, wuC );
1813 if (fullGt ( zptr[RC(wuL)], zptr[RC(wuR)] )) SWAP ( wuL, wuR );
1814 if (fullGt ( zptr[RC(wuC)], zptr[RC(wuR)] )) SWAP ( wuC, wuR );
1815
1816 SWAP ( wuC, wuR-1 );
1817 pivot = zptr[RC(wuR-1)];
1818
1819 i = wuL;
1820 j = wuR - 1;
1821 for (;;) {
1822 do i++; while (fullGt ( pivot, zptr[RC(i)] ));
1823 do j--; while (fullGt ( zptr[RC(j)], pivot ));
1824 if (i < j) SWAP ( i, j ) else break;
1825 }
1826 SWAP ( i, wuR-1 );
1827
1828 if ((i - wuL) > (wuR - i)) {
1829 stackL[sp] = wuL; stackR[sp] = i-1; sp++; wuL = i+1;
1830 } else {
1831 stackL[sp] = i+1; stackR[sp] = wuR; sp++; wuR = i-1;
1832 }
1833 #if DEBUG
1834 assert (sp <= 14);
1835 #endif
1836
1837 } else {
1838
1839 /*--
1840 a small Work-Unit; insertion-sort it
1841 --*/
1842 for (i = wuL + 1; i <= wuR; i++) {
1843 v = zptr[RC(i)];
1844 j = i;
1845 while ( fullGt ( zptr[RC(j-1)], v )) {
1846 zptr[RC(j)] = zptr[RC(j-1)];
1847 j = j - 1;
1848 if (j <= wuL) goto zero;
1849 }
1850 zero:
1851 zptr[RC(j)] = v;
1852 }
1853 if (sp == 0) return;
1854 sp--; wuL = stackL[sp]; wuR = stackR[sp];
1855 #if DEBUG
1856 assert (sp >= 0);
1857 #endif
1858
1859 } /* if this is a small work-unit */
1860 }
1861 }
1862
1863 #undef RC
1864 #undef SWAP
1865 #undef ISORT_BELOW
1866
1867
1868 /*---------------------------------------------*/
1869 /* Use of NORMALISEHI is safe here, for any
1870 * lastPP >= 1 (which is guaranteed), since
1871 * the max denormalisation is 1.
1872 */
trivialGt(Int32 i1,Int32 i2)1873 INLINE Bool trivialGt ( Int32 i1, Int32 i2 )
1874 {
1875 Int32 k;
1876
1877 for (k = 0; k <= last; k++) {
1878 UChar c1 = GETFIRST(i1);
1879 UChar c2 = GETFIRST(i2);
1880 if (c1 == c2) {
1881 i1++; i1 = NORMALISEHI(i1);
1882 i2++; i2 = NORMALISEHI(i2);
1883 } else
1884 if (c1 > c2) return True; else return False;
1885 };
1886 return False;
1887 }
1888
1889
1890 /*---------------------------------------------*/
1891 /* Always works.
1892 */
shellTrivial(void)1893 void shellTrivial ( void )
1894 {
1895 Int32 i, j, h, bigN;
1896 Int32 v;
1897
1898 Int32 ptrLo = 0;
1899 Int32 ptrHi = last;
1900 bigN = ptrHi - ptrLo + 1;
1901 h = 1;
1902 do { h = 3 * h + 1; } while ( ! (h > bigN) );
1903 do {
1904 h = h / 3;
1905 for (i = ptrLo + h; i <= ptrHi; i++) {
1906 v = zptr[i];
1907 j = i;
1908 while ( trivialGt ( zptr[j-h], v ) ) {
1909 zptr[j] = zptr[j-h];
1910 j = j - h;
1911 if (j <= (ptrLo + h - 1)) goto zero;
1912 }
1913 zero:
1914 zptr[j] = v;
1915 }
1916 } while (h != 1);
1917 }
1918
1919
1920 /*---------------------------------------------*/
1921 /* We have to be pretty careful for small block
1922 * sizes; the usual mechanism won't work properly,
1923 * all, ultimately, because the pointer normalisation
1924 * machinery doesn't work right whenever the amount
1925 * of denormalisation exceeds lastPP. And the
1926 * greatest possible amount of denormalisation here
1927 * is generated in fullGt, as
1928 * 4 * NUM_FULLGT_UNROLLINGS.
1929 * To make blocks smaller than 4 * NUM_... sort
1930 * correctly, it seems easiest simply
1931 * to forget about striping, &c, and just do a simple
1932 * shellsort on the un-striped block. The performance
1933 * loss has to be inconsequential, since 4 * NUM_... is
1934 * tiny (16 at present).
1935 */
sortIt(void)1936 void sortIt ( void )
1937 {
1938 /*--
1939 lastPP is `last++', ie, is always == last+1.
1940 The two (lastPP and last+1) should be interchangeable.
1941 lastPP is more convenient for fast renormalisation,
1942 that's all.
1943
1944 In the various block-sized structures, live data runs
1945 from 0 to last inclusive, so lastPP is the number
1946 of live data items.
1947 --*/
1948 lastPP = last + 1;
1949
1950 if (lastPP <= 1024) {
1951
1952 /*--
1953 shellTrivial *must* be used for
1954 lastPP <= 4 * NUM_FULLGT_UNROLLINGS.
1955 The 1024 limit is much higher; the purpose is to avoid
1956 lumbering sorting of small blocks with the
1957 fixed overhead of the full counting-sort mechanism.
1958 --*/
1959
1960 Int32 i;
1961
1962 if (veryVerbose) fprintf ( stderr, "trivialSort ...\n" );
1963 for (i = 0; i <= last; i++) zptr[i] = i;
1964 shellTrivial();
1965 if (veryVerbose) fprintf ( stderr, "trivialSort done.\n" );
1966
1967 } else {
1968 Int32 i;
1969 Int32 grade;
1970 Int32 notDone;
1971
1972 stripe ();
1973
1974 if (veryVerbose) fprintf ( stderr, "bucket sorting ...\n" );
1975
1976 for (i = 0; i <= 65536; i++)
1977 ftab[i] = 0;
1978 for (i = 0; i <= last; i++)
1979 ftab[GETFIRST16(i)]++;
1980 for (i = 1; i <= 65536; i++)
1981 ftab[i] += ftab[i-1];
1982
1983 for (i = 0; i <= last; i++) {
1984 UInt32 j = GETFIRST16(i);
1985 ftab[j]--;
1986 zptr[ftab[j]] = i;
1987 }
1988
1989 copyOffsetWords();
1990
1991 notDone = lastPP;
1992 for (grade = 1; grade <= 5; grade++) {
1993 Int32 candNo;
1994 Int32 loBound;
1995 Int32 hiBound;
1996
1997 switch (grade) {
1998 case 1: loBound = 2; hiBound = 15; break;
1999 case 2: loBound = 16; hiBound = 255; break;
2000 case 3: loBound = 256; hiBound = 4095; break;
2001 case 4: loBound = 4096; hiBound = 65535; break;
2002 case 5: loBound = 65536; hiBound = 900000; break;
2003 default: panic ( "gradedSort" ); break;
2004 }
2005 if (loBound > lastPP) continue;
2006
2007 candNo = 0;
2008 for (i = 0; i <= 65535; i++) {
2009
2010 Int32 freqHere = ftab[i+1] - ftab[i];
2011
2012 if (freqHere >= loBound && freqHere <= hiBound) {
2013 Int32 j, k;
2014 Int32 lower = ftab[i];
2015 Int32 upper = ftab[i+1] - 1;
2016
2017 candNo++;
2018 notDone -= freqHere;
2019
2020 if (veryVerbose) {
2021 fprintf ( stderr,
2022 " %d -> %d: cand %5d, freq = %6d, notdone = %6d",
2023 loBound, hiBound, candNo, freqHere, notDone );
2024 fflush ( stderr );
2025 }
2026
2027 qsortFull ( lower, upper );
2028
2029 if (freqHere < 65535) {
2030 for (j = lower, k = 0; j <= upper; j++, k++) {
2031 Int32 a2update = zptr[j];
2032 SETREST16(a2update, k);
2033 if (a2update < (4 * NUM_FULLGT_UNROLLINGS))
2034 SETREST16(a2update + lastPP, k);
2035 }
2036 }
2037 if (veryVerbose)
2038 fprintf ( stderr, "\n" );
2039
2040 }
2041 }
2042 }
2043 }
2044 }
2045
2046
2047 /*---------------------------------------------------*/
2048 /*--- The Reversible Transformation (tm) ---*/
2049 /*---------------------------------------------------*/
2050
2051 /*---------------------------------------------*/
2052 /*--- Use: block [0 .. last]
2053 Def: origPtr. ll [0 .. last] is synthesised later.
2054 ---*/
doReversibleTransformation(void)2055 void doReversibleTransformation ( void )
2056 {
2057 Int32 i;
2058
2059 if (veryVerbose) fprintf ( stderr, "\n" );
2060
2061 sortIt ();
2062
2063 origPtr = -1;
2064 for (i = 0; i <= last; i++)
2065 if (zptr[i] == 0)
2066 { origPtr = i; break; };
2067
2068 if (origPtr == -1) panic ( "doReversibleTransformation" );
2069
2070 }
2071
2072
2073 /*---------------------------------------------*/
2074 /*--- Use: ll[0 .. last] and origPtr
2075 Def: block[0 .. last]
2076 ---*/
undoReversibleTransformation(void)2077 void undoReversibleTransformation ( void )
2078 {
2079 Int32 cc[256];
2080 Int32 i, j, ch, sum;
2081
2082 for (i = 0; i <= 255; i++) cc[i] = 0;
2083
2084 for (i = 0; i <= last; i++) {
2085 UChar ll_i = ll[i];
2086 zptr[i] = cc[ll_i];
2087 cc[ll_i] ++;
2088 };
2089
2090 sum = 0;
2091 for (ch = 0; ch <= 255; ch++) {
2092 sum = sum + cc[ch];
2093 cc[ch] = sum - cc[ch];
2094 };
2095
2096 i = origPtr;
2097 for (j = last; j >= 0; j--) {
2098 UChar ll_i = ll[i];
2099 block[j] = ll_i;
2100 i = zptr[i] + cc[ll_i];
2101 };
2102 }
2103
2104
2105 /*---------------------------------------------------*/
2106 /*--- The block loader and RLEr ---*/
2107 /*---------------------------------------------------*/
2108
2109 #define SPOT_BASIS_STEP 8000
2110
2111 /*---------------------------------------------*/
spotBlock(Bool weAreCompressing)2112 void spotBlock ( Bool weAreCompressing )
2113 {
2114 Int32 pos, delta, newdelta;
2115
2116 pos = SPOT_BASIS_STEP;
2117 delta = 1;
2118
2119 while (pos < last) {
2120
2121 Int32 n;
2122
2123 if (weAreCompressing)
2124 n = (Int32)GETFIRST(pos) + 1; else
2125 n = (Int32)block[pos] - 1;
2126
2127 if (n == 256) n = 0; else if (n == -1) n = 255;
2128
2129 if (! (n >= 0 && n <= 255) ) panic ( "spotBlock" );
2130
2131 if (weAreCompressing)
2132 SETFIRST(pos, (UChar)n); else
2133 block[pos] = (UChar)n;
2134
2135 switch (delta) {
2136 case 3: newdelta = 1; break;
2137 case 1: newdelta = 4; break;
2138 case 4: newdelta = 5; break;
2139 case 5: newdelta = 9; break;
2140 case 9: newdelta = 2; break;
2141 case 2: newdelta = 6; break;
2142 case 6: newdelta = 7; break;
2143 case 8: newdelta = 8; break;
2144 case 7: newdelta = 3; break;
2145 default: newdelta = 1; break;
2146 }
2147 delta = newdelta;
2148
2149 pos = pos + SPOT_BASIS_STEP + 17 * (newdelta - 5);
2150 }
2151 }
2152
2153
2154
2155 /*---------------------------------------------*/
2156 /* Top 16: run length, 1 to 255.
2157 * Lower 16: the char, or MY_EOF for EOF.
2158 */
2159
2160 #define MY_EOF 257
2161
getRLEpair(FILE * src)2162 INLINE Int32 getRLEpair ( FILE* src )
2163 {
2164 Int32 runLength;
2165 IntNative ch, chLatest;
2166
2167 ch = getc ( src );
2168
2169 /*--- Because I have no idea what kind of a value EOF is. ---*/
2170 if (ch == EOF) {
2171 ERROR_IF_NOT_ZERO ( errno );
2172 return (1 << 16) | MY_EOF;
2173 }
2174
2175 runLength = 0;
2176 do {
2177 chLatest = getc ( src );
2178 runLength++;
2179 bytesIn++;
2180 }
2181 while (ch == chLatest && runLength < 255);
2182
2183 if ( chLatest != EOF ) {
2184 if ( ungetc ( chLatest, src ) == EOF )
2185 panic ( "getRLEpair: ungetc failed" );
2186 } else {
2187 ERROR_IF_NOT_ZERO ( errno );
2188 }
2189
2190 /*--- Conditional is just a speedup hack. ---*/
2191 if (runLength == 1) {
2192 UPDATE_CRC ( globalCrc, (UChar)ch );
2193 return (1 << 16) | ch;
2194 } else {
2195 Int32 i;
2196 for (i = 1; i <= runLength; i++)
2197 UPDATE_CRC ( globalCrc, (UChar)ch );
2198 return (runLength << 16) | ch;
2199 }
2200 }
2201
2202
2203 /*---------------------------------------------*/
loadAndRLEsource(FILE * src)2204 Bool loadAndRLEsource ( FILE* src )
2205 {
2206 Int32 ch, allowableBlockSize;
2207
2208 last = -1;
2209 ch = 0;
2210
2211 /*--- 20 is just a paranoia constant ---*/
2212 allowableBlockSize = 100000 * blockSize100k - 20;
2213
2214 while (last < allowableBlockSize && ch != MY_EOF) {
2215 Int32 rlePair, runLen;
2216 rlePair = getRLEpair ( src );
2217 ch = rlePair & 0xFFFF;
2218 runLen = (UInt32)rlePair >> 16;
2219
2220 #if DEBUG
2221 assert (runLen >= 1 && runLen <= 255);
2222 #endif
2223
2224 if (ch == MY_EOF)
2225 { last++; SETFIRST(last, ((UChar)42)); }
2226 else
2227 switch (runLen) {
2228 case 1:
2229 last++; SETFIRST(last, ((UChar)ch)); break;
2230 case 2:
2231 last++; SETFIRST(last, ((UChar)ch));
2232 last++; SETFIRST(last, ((UChar)ch)); break;
2233 case 3:
2234 last++; SETFIRST(last, ((UChar)ch));
2235 last++; SETFIRST(last, ((UChar)ch));
2236 last++; SETFIRST(last, ((UChar)ch)); break;
2237 default:
2238 last++; SETFIRST(last, ((UChar)ch));
2239 last++; SETFIRST(last, ((UChar)ch));
2240 last++; SETFIRST(last, ((UChar)ch));
2241 last++; SETFIRST(last, ((UChar)ch));
2242 last++; SETFIRST(last, ((UChar)(runLen-4))); break;
2243 }
2244 }
2245 return (ch == MY_EOF);
2246 }
2247
2248
2249 /*---------------------------------------------*/
2250 /*--
2251 This new version is derived from some code
2252 sent to me Christian von Roques.
2253 --*/
unRLEandDump(FILE * dst,Bool thisIsTheLastBlock)2254 void unRLEandDump ( FILE* dst, Bool thisIsTheLastBlock )
2255 {
2256 IntNative retVal;
2257 Int32 lastCharToSpew, i, count, chPrev, ch;
2258 UInt32 localCrc;
2259
2260 if (thisIsTheLastBlock)
2261 lastCharToSpew = last - 1; else
2262 lastCharToSpew = last;
2263
2264 count = 0;
2265 i = 0;
2266 ch = 256; /*-- not a char and not EOF --*/
2267 localCrc = getGlobalCRC();
2268
2269 while ( i <= lastCharToSpew ) {
2270 chPrev = ch;
2271 ch = block[i];
2272 i++;
2273
2274 retVal = putc ( ch, dst );
2275 ERROR_IF_EOF ( retVal );
2276 UPDATE_CRC ( localCrc, (UChar)ch );
2277
2278 if (ch != chPrev) {
2279 count = 1;
2280 } else {
2281 count++;
2282 if (count >= 4) {
2283 Int32 j;
2284 for (j = 0; j < (Int32)block[i]; j++) {
2285 retVal = putc (ch, dst);
2286 ERROR_IF_EOF ( retVal );
2287 UPDATE_CRC ( localCrc, (UChar)ch );
2288 }
2289 i++;
2290 count = 0;
2291 }
2292 }
2293 }
2294
2295 setGlobalCRC ( localCrc );
2296
2297 if (thisIsTheLastBlock && block[last] != 42) unblockError ();
2298 }
2299
2300
2301 /*---------------------------------------------------*/
2302 /*--- Processing of complete files and streams ---*/
2303 /*---------------------------------------------------*/
2304
2305 /*---------------------------------------------*/
compressStream(FILE * stream,FILE * zStream)2306 void compressStream ( FILE *stream, FILE *zStream )
2307 {
2308 IntNative retVal;
2309 Bool thisIsTheLastBlock;
2310 BitStream *zbs;
2311 UInt32 crcToSend;
2312 Int32 blockNo = 1;
2313
2314 bytesIn = 0;
2315 bytesOut = 0;
2316
2317 SET_BINARY_MODE(stream);
2318 SET_BINARY_MODE(zStream);
2319
2320 zbs = bsOpenWriteStream ( zStream );
2321
2322 /*--- Write `magic' bytes B and Z,
2323 then 0 indicating file-format revision zero,
2324 followed by a digit indicating blockSize100k.
2325 ---*/
2326 bsPutUChar ( zbs, 'B' );
2327 bsPutUChar ( zbs, 'Z' );
2328 bsPutUChar ( zbs, '0' );
2329 bsPutUChar ( zbs, '0' + blockSize100k );
2330
2331 initialiseCRC ();
2332 initBogusModel ();
2333 arithCodeStartEncoding ( zbs );
2334
2335 do {
2336 if (veryVerbose)
2337 fprintf ( stderr, "\nBEGIN block %d\n", blockNo );
2338 blockNo++;
2339 thisIsTheLastBlock = loadAndRLEsource ( stream );
2340 spotBlock ( True );
2341 doReversibleTransformation ();
2342 moveToFrontCodeAndSend ( zbs, thisIsTheLastBlock );
2343 }
2344 while ( ! thisIsTheLastBlock );
2345
2346 crcToSend = getFinalCRC ();
2347 putUInt32 ( zbs, crcToSend );
2348 if (veryVerbose)
2349 fprintf ( stderr, "\nCRC = 0x%x\n", crcToSend );
2350
2351 arithCodeDoneEncoding ( zbs );
2352 bsClose ( zbs );
2353 ERROR_IF_NOT_ZERO ( ferror(stream) );
2354 retVal = fclose ( stream );
2355 ERROR_IF_EOF ( retVal );
2356
2357 if (veryVerbose) {
2358 fprintf ( stderr, "\n" );
2359 dumpAllModelStats ();
2360 fprintf ( stderr, "\n" );
2361 }
2362
2363 if (bytesIn == 0) bytesIn = 1;
2364 if (bytesOut == 0) bytesOut = 1;
2365
2366 if (verbose)
2367 fprintf ( stderr, "%6.3f:1, %6.3f bits/byte, "
2368 "%5.2f%% saved, %d in, %d out.\n",
2369 (float)bytesIn / (float)bytesOut,
2370 (8.0 * (float)bytesOut) / (float)bytesIn,
2371 100.0 * (1.0 - (float)bytesOut / (float)bytesIn),
2372 bytesIn,
2373 bytesOut
2374 );
2375
2376 if (veryVerbose)
2377 fprintf ( stderr, "\n" );
2378 }
2379
2380
2381 /*---------------------------------------------*/
uncompressStream(FILE * zStream,FILE * stream)2382 Bool uncompressStream ( FILE *zStream, FILE *stream )
2383 {
2384 Bool thisIsTheLastBlock;
2385 BitStream *zbs;
2386 Int32 magic1, magic2, magic3, magic4;
2387 UInt32 crcStored, crcComputed;
2388 Int32 currBlockNo;
2389 IntNative retVal;
2390
2391 SET_BINARY_MODE(stream);
2392 SET_BINARY_MODE(zStream);
2393
2394 zbs = ( bsOpenReadStream ( zStream ) );
2395
2396 /*--
2397 A bad magic number is `recoverable from';
2398 return with False so the caller skips the file.
2399 --*/
2400 magic1 = (Int32)bsGetUChar ( zbs );
2401 magic2 = (Int32)bsGetUChar ( zbs );
2402 magic3 = (Int32)bsGetUChar ( zbs );
2403 magic4 = (Int32)bsGetUChar ( zbs );
2404 if (magic1 != 'B' ||
2405 magic2 != 'Z' ||
2406 magic3 != '0' ||
2407 magic4 < '1' ||
2408 magic4 > '9') {
2409 bsClose ( zbs );
2410 retVal = fclose ( stream );
2411 ERROR_IF_EOF ( retVal );
2412 return False;
2413 }
2414
2415 setDecompressStructureSizes ( magic4 - '0' );
2416 initialiseCRC ();
2417 initBogusModel ();
2418 arithCodeStartDecoding ( zbs );
2419
2420 if (veryVerbose) fprintf ( stderr, "\n " );
2421 currBlockNo = 0;
2422 do {
2423 currBlockNo++;
2424 if (veryVerbose)
2425 fprintf ( stderr, "[%d: ac+mtf ", currBlockNo );
2426 thisIsTheLastBlock = getAndMoveToFrontDecode ( zbs );
2427 if (veryVerbose) fprintf ( stderr, "rt " );
2428 undoReversibleTransformation ();
2429 spotBlock ( False );
2430 if (veryVerbose) fprintf ( stderr, "rld" );
2431 unRLEandDump ( stream, thisIsTheLastBlock );
2432 if (veryVerbose) fprintf ( stderr, "] " );
2433 }
2434 while ( ! thisIsTheLastBlock );
2435
2436 if (veryVerbose) fprintf ( stderr, "\n " );
2437
2438 /*-- A bad CRC is considered a fatal error. --*/
2439 crcStored = getUInt32 ( zbs );
2440 crcComputed = getFinalCRC ();
2441 if (veryVerbose)
2442 fprintf ( stderr,
2443 "CRCs: stored = 0x%x, computed = 0x%x\n ",
2444 crcStored, crcComputed );
2445 if (crcStored != crcComputed)
2446 crcError ( crcStored, crcComputed );
2447
2448 arithCodeDoneDecoding ( zbs );
2449 bsClose ( zbs );
2450 ERROR_IF_NOT_ZERO ( ferror(stream) );
2451 retVal = fclose ( stream );
2452 ERROR_IF_EOF ( retVal );
2453 return True;
2454 }
2455
2456
2457
2458 /*---------------------------------------------------*/
2459 /*--- Error [non-] handling grunge ---*/
2460 /*---------------------------------------------------*/
2461
2462 /*---------------------------------------------*/
showFileNames(void)2463 void showFileNames ( void )
2464 {
2465 fprintf (
2466 stderr,
2467 "\tInput file = %s, output file = %s\n",
2468 (opMode == OM_STDIN_TO_STDOUT) ? "(stdin)" : inName,
2469 (opMode != OM_FILES_TO_FILES) ? "(stdout)" : outName
2470 );
2471 }
2472
2473
2474 /*---------------------------------------------*/
cleanUpAndFail(void)2475 void cleanUpAndFail ( void )
2476 {
2477 IntNative retVal;
2478
2479 if ( opMode == OM_FILES_TO_FILES ) {
2480 fprintf ( stderr, "%s: Deleting output file %s, if it exists.\n",
2481 progName, outName );
2482 if (outputHandleJustInCase != NULL)
2483 fclose ( outputHandleJustInCase );
2484 retVal = remove ( outName );
2485 if (retVal != 0)
2486 fprintf ( stderr,
2487 "%s: WARNING: deletion of output file (apparently) failed.\n",
2488 progName );
2489 }
2490 exit ( 1 );
2491 }
2492
2493
2494 /*---------------------------------------------*/
panic(Char * s)2495 void panic ( Char* s )
2496 {
2497 fprintf ( stderr,
2498 "\n%s: PANIC -- internal consistency error:\n"
2499 "\t%s\n"
2500 "\tThis is a BUG. Please report it to me at:\n"
2501 "\tsewardj@cs.man.ac.uk.\n",
2502 progName, s );
2503 showFileNames();
2504 cleanUpAndFail();
2505 }
2506
2507
2508 /*---------------------------------------------*/
crcError(UInt32 crcStored,UInt32 crcComputed)2509 void crcError ( UInt32 crcStored, UInt32 crcComputed )
2510 {
2511 fprintf ( stderr,
2512 "\n%s: Data integrity error when decompressing.\n"
2513 "\tStored CRC = 0x%x, computed CRC = 0x%x\n"
2514 "\tThis could be a bug -- please report it to me at:\n"
2515 "\tsewardj@cs.man.ac.uk.\n",
2516 progName, crcStored, crcComputed );
2517 showFileNames();
2518 cleanUpAndFail();
2519 }
2520
2521
2522 /*---------------------------------------------*/
compressedStreamEOF(void)2523 void compressedStreamEOF ( void )
2524 {
2525 fprintf ( stderr,
2526 "\n%s: Compressed file ends unexpectedly;\n\t"
2527 "perhaps it is corrupted? *Possible* reason follows.\n",
2528 progName );
2529 perror ( progName );
2530 showFileNames();
2531 cleanUpAndFail();
2532 }
2533
2534
2535 /*---------------------------------------------*/
ioError()2536 void ioError ( )
2537 {
2538 fprintf ( stderr,
2539 "\n%s: I/O or other error, bailing out. Possible reason follows.\n",
2540 progName );
2541 perror ( progName );
2542 showFileNames();
2543 cleanUpAndFail();
2544 }
2545
2546
2547 /*---------------------------------------------*/
blockOverrun()2548 void blockOverrun ()
2549 {
2550 fprintf ( stderr,
2551 "\n%s: block overrun during decompression,\n"
2552 "\twhich probably means the compressed file\n"
2553 "\tis corrupted.\n",
2554 progName );
2555 showFileNames();
2556 cleanUpAndFail();
2557 }
2558
2559
2560 /*---------------------------------------------*/
unblockError()2561 void unblockError ()
2562 {
2563 fprintf ( stderr,
2564 "\n%s: compressed file didn't unblock correctly,\n"
2565 "\twhich probably means it is corrupted.\n",
2566 progName );
2567 showFileNames();
2568 cleanUpAndFail();
2569 }
2570
2571
2572 /*---------------------------------------------*/
bitStreamEOF()2573 void bitStreamEOF ()
2574 {
2575 fprintf ( stderr,
2576 "\n%s: read past the end of compressed data,\n"
2577 "\twhich probably means it is corrupted.\n",
2578 progName );
2579 showFileNames();
2580 cleanUpAndFail();
2581 }
2582
2583
2584 /*---------------------------------------------*/
mySignalCatcher(int n __unused)2585 void mySignalCatcher (int n __unused)
2586 {
2587 fprintf ( stderr,
2588 "\n%s: Control-C (or similar) caught, quitting.\n",
2589 progName );
2590 cleanUpAndFail();
2591 }
2592
2593
2594 /*---------------------------------------------*/
mySIGSEGVorSIGBUScatcher(int n __unused)2595 void mySIGSEGVorSIGBUScatcher (int n __unused)
2596 {
2597 if (compressing)
2598 fprintf ( stderr,
2599 "\n%s: Caught a SIGSEGV or SIGBUS whilst compressing,\n"
2600 "\twhich probably indicates a bug in BZIP. Please\n"
2601 "\treport it to me at: sewardj@cs.man.ac.uk\n",
2602 progName );
2603 else
2604 fprintf ( stderr,
2605 "\n%s: Caught a SIGSEGV or SIGBUS whilst decompressing,\n"
2606 "\twhich probably indicates that the compressed data\n"
2607 "\tis corrupted.\n",
2608 progName );
2609
2610 showFileNames();
2611 cleanUpAndFail();
2612 }
2613
2614
2615 /*---------------------------------------------*/
uncompressOutOfMemory(Int32 draw,Int32 blockSize)2616 void uncompressOutOfMemory ( Int32 draw, Int32 blockSize )
2617 {
2618 fprintf ( stderr,
2619 "\n%s: Can't allocate enough memory for decompression.\n"
2620 "\tRequested %d bytes for a block size of %d.\n"
2621 "\tFind a machine with more memory, perhaps?\n",
2622 progName, draw, blockSize );
2623 showFileNames();
2624 cleanUpAndFail();
2625 }
2626
2627
2628 /*---------------------------------------------*/
compressOutOfMemory(Int32 draw,Int32 blockSize)2629 void compressOutOfMemory ( Int32 draw, Int32 blockSize )
2630 {
2631 fprintf ( stderr,
2632 "\n%s: Can't allocate enough memory for compression.\n"
2633 "\tRequested %d bytes for a block size of %d.\n"
2634 "\tReduce the block size, and/or use the -e flag.\n",
2635 progName, draw, blockSize );
2636 showFileNames();
2637 cleanUpAndFail();
2638 }
2639
2640
2641 /*---------------------------------------------------*/
2642 /*--- The main driver machinery ---*/
2643 /*---------------------------------------------------*/
2644
2645 /*---------------------------------------------*/
pad(Char * s)2646 void pad ( Char *s )
2647 {
2648 Int32 i;
2649 if ( (Int32)strlen(s) >= longestFileName ) return;
2650 for (i = 1; i <= longestFileName - (Int32)strlen(s); i++)
2651 fprintf ( stderr, " " );
2652 }
2653
2654
2655 /*---------------------------------------------*/
fileExists(Char * name)2656 Bool fileExists ( Char* name )
2657 {
2658 FILE *tmp = fopen ( name, "rb" );
2659 Bool exists = (tmp != NULL);
2660 if (tmp != NULL) fclose ( tmp );
2661 return exists;
2662 }
2663
2664
2665 /*---------------------------------------------*/
2666 /*--
2667 if in doubt, return True
2668 --*/
notABogStandardFile(Char * name)2669 Bool notABogStandardFile ( Char* name )
2670 {
2671 IntNative i;
2672 struct MY_STAT statBuf;
2673
2674 i = MY_LSTAT ( name, &statBuf );
2675 if (i != 0) return True;
2676 if (MY_S_IFREG(statBuf.st_mode)) return False;
2677 return True;
2678 }
2679
2680
2681 /*---------------------------------------------*/
copyDateAndPermissions(Char * srcName,Char * dstName)2682 void copyDateAndPermissions ( Char *srcName, Char *dstName )
2683 {
2684 IntNative retVal;
2685 struct MY_STAT statBuf;
2686 struct utimbuf uTimBuf;
2687
2688 retVal = MY_LSTAT ( srcName, &statBuf );
2689 ERROR_IF_NOT_ZERO ( retVal );
2690 uTimBuf.actime = statBuf.st_atime;
2691 uTimBuf.modtime = statBuf.st_mtime;
2692
2693 retVal = chmod ( dstName, statBuf.st_mode );
2694 ERROR_IF_NOT_ZERO ( retVal );
2695 retVal = utime ( dstName, &uTimBuf );
2696 ERROR_IF_NOT_ZERO ( retVal );
2697 }
2698
2699
2700 /*---------------------------------------------*/
endsInBz(Char * name)2701 Bool endsInBz ( Char* name )
2702 {
2703 Int32 n = strlen ( name );
2704 if (n <= 3) return False;
2705 return
2706 (name[n-3] == '.' &&
2707 name[n-2] == 'b' &&
2708 name[n-1] == 'z');
2709 }
2710
2711
2712 /*---------------------------------------------*/
containsDubiousChars(Char * name)2713 Bool containsDubiousChars ( Char* name )
2714 {
2715 Bool cdc = False;
2716 for (; *name != '\0'; name++)
2717 if (*name == '?' || *name == '*') cdc = True;
2718 return cdc;
2719 }
2720
2721
2722 /*---------------------------------------------*/
compress(Char * name)2723 void compress ( Char *name )
2724 {
2725 FILE *inStr;
2726 FILE *outStr;
2727
2728 strcpy ( inName, name );
2729 strcpy ( outName, name );
2730 strcat ( outName, ".bz" );
2731
2732 if ( opMode != OM_STDIN_TO_STDOUT && containsDubiousChars ( inName ) ) {
2733 fprintf ( stderr, "%s: There are no files matching `%s'.\n",
2734 progName, inName );
2735 return;
2736 }
2737 if ( opMode != OM_STDIN_TO_STDOUT && !fileExists ( inName ) ) {
2738 fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n",
2739 progName, inName );
2740 return;
2741 }
2742 if ( opMode != OM_STDIN_TO_STDOUT && endsInBz ( inName )) {
2743 fprintf ( stderr, "%s: Input file name %s ends in `.bz', skipping.\n",
2744 progName, inName );
2745 return;
2746 }
2747 if ( opMode != OM_STDIN_TO_STDOUT && notABogStandardFile ( inName )) {
2748 fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n",
2749 progName, inName );
2750 return;
2751 }
2752 if ( opMode == OM_FILES_TO_FILES && fileExists ( outName ) ) {
2753 fprintf ( stderr, "%s: Output file %s already exists, skipping.\n",
2754 progName, outName );
2755 return;
2756 }
2757
2758 switch ( opMode ) {
2759
2760 case OM_STDIN_TO_STDOUT:
2761 inStr = stdin;
2762 outStr = stdout;
2763 if ( isatty ( fileno ( stdout ) ) ) {
2764 fprintf ( stderr,
2765 "%s: I won't write compressed data to a terminal.\n",
2766 progName );
2767 fprintf ( stderr, "%s: For help, type: `%s --help'.\n",
2768 progName, progName );
2769 return;
2770 };
2771 break;
2772
2773 case OM_FILE_TO_STDOUT:
2774 inStr = fopen ( inName, "rb" );
2775 outStr = stdout;
2776 if ( isatty ( fileno ( stdout ) ) ) {
2777 fprintf ( stderr,
2778 "%s: I won't write compressed data to a terminal.\n",
2779 progName );
2780 fprintf ( stderr, "%s: For help, type: `%s --help'.\n",
2781 progName, progName );
2782 return;
2783 };
2784 if ( inStr == NULL ) {
2785 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n",
2786 progName, inName );
2787 return;
2788 };
2789 break;
2790
2791 case OM_FILES_TO_FILES:
2792 inStr = fopen ( inName, "rb" );
2793 outStr = fopen ( outName, "wb" );
2794 if ( outStr == NULL) {
2795 fprintf ( stderr, "%s: Can't create output file %s, skipping.\n",
2796 progName, outName );
2797 return;
2798 }
2799 if ( inStr == NULL ) {
2800 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n",
2801 progName, inName );
2802 return;
2803 };
2804 break;
2805
2806 default:
2807 panic ( "compress: bad opMode" );
2808 break;
2809 }
2810
2811 if (verbose) {
2812 fprintf ( stderr,
2813 " %s: ",
2814 opMode == OM_STDIN_TO_STDOUT ? "(stdin)" : inName );
2815 pad ( opMode == OM_STDIN_TO_STDOUT ? "(stdin)" : inName );
2816 fflush ( stderr );
2817 }
2818
2819 /*--- Now the input and output handles are sane. Do the Biz. ---*/
2820 errno = 0;
2821 outputHandleJustInCase = outStr;
2822 compressStream ( inStr, outStr );
2823 outputHandleJustInCase = NULL;
2824
2825 /*--- If there was an I/O error, we won't get here. ---*/
2826 if ( opMode == OM_FILES_TO_FILES ) {
2827 copyDateAndPermissions ( inName, outName );
2828 if ( !keepInputFiles ) {
2829 IntNative retVal = remove ( inName );
2830 ERROR_IF_NOT_ZERO ( retVal );
2831 }
2832 }
2833 }
2834
2835
2836 /*---------------------------------------------*/
uncompress(Char * name)2837 void uncompress ( Char *name )
2838 {
2839 FILE *inStr;
2840 FILE *outStr;
2841 Bool magicNumberOK;
2842
2843 strcpy ( inName, name );
2844 strcpy ( outName, name );
2845 if ( endsInBz ( inName ) )
2846 outName [ strlen ( outName ) - 3 ] = '\0';
2847
2848 if ( opMode != OM_STDIN_TO_STDOUT && containsDubiousChars ( inName ) ) {
2849 fprintf ( stderr, "%s: There are no files matching `%s'.\n",
2850 progName, inName );
2851 return;
2852 }
2853 if ( opMode != OM_STDIN_TO_STDOUT && !fileExists ( inName ) ) {
2854 fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n",
2855 progName, inName );
2856 return;
2857 }
2858 if ( opMode != OM_STDIN_TO_STDOUT && !endsInBz ( inName )) {
2859 fprintf ( stderr,
2860 "%s: Input file name %s doesn't end in `.bz', skipping.\n",
2861 progName, inName );
2862 return;
2863 }
2864 if ( opMode != OM_STDIN_TO_STDOUT && notABogStandardFile ( inName )) {
2865 fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n",
2866 progName, inName );
2867 return;
2868 }
2869 if ( opMode == OM_FILES_TO_FILES && fileExists ( outName ) ) {
2870 fprintf ( stderr, "%s: Output file %s already exists, skipping.\n",
2871 progName, outName );
2872 return;
2873 }
2874
2875 switch ( opMode ) {
2876
2877 case OM_STDIN_TO_STDOUT:
2878 inStr = stdin;
2879 outStr = stdout;
2880 if ( isatty ( fileno ( stdin ) ) ) {
2881 fprintf ( stderr,
2882 "%s: I won't read compressed data from a terminal.\n",
2883 progName );
2884 fprintf ( stderr, "%s: For help, type: `%s --help'.\n",
2885 progName, progName );
2886 return;
2887 };
2888 break;
2889
2890 case OM_FILE_TO_STDOUT:
2891 inStr = fopen ( inName, "rb" );
2892 outStr = stdout;
2893 if ( inStr == NULL ) {
2894 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n",
2895 progName, inName );
2896 return;
2897 };
2898 break;
2899
2900 case OM_FILES_TO_FILES:
2901 inStr = fopen ( inName, "rb" );
2902 outStr = fopen ( outName, "wb" );
2903 if ( outStr == NULL) {
2904 fprintf ( stderr, "%s: Can't create output file %s, skipping.\n",
2905 progName, outName );
2906 return;
2907 }
2908 if ( inStr == NULL ) {
2909 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n",
2910 progName, inName );
2911 return;
2912 };
2913 break;
2914
2915 default:
2916 panic ( "uncompress: bad opMode" );
2917 break;
2918 }
2919
2920 if (verbose) {
2921 fprintf ( stderr,
2922 " %s: ",
2923 opMode == OM_STDIN_TO_STDOUT ? "(stdin)" : inName );
2924 pad ( opMode == OM_STDIN_TO_STDOUT ? "(stdin)" : inName );
2925 fflush ( stderr );
2926 }
2927
2928 /*--- Now the input and output handles are sane. Do the Biz. ---*/
2929 errno = 0;
2930 outputHandleJustInCase = outStr;
2931 magicNumberOK = uncompressStream ( inStr, outStr );
2932 outputHandleJustInCase = NULL;
2933
2934 /*--- If there was an I/O error, we won't get here. ---*/
2935 if ( magicNumberOK ) {
2936 if ( opMode == OM_FILES_TO_FILES ) {
2937 copyDateAndPermissions ( inName, outName );
2938 if ( !keepInputFiles ) {
2939 IntNative retVal = remove ( inName );
2940 ERROR_IF_NOT_ZERO ( retVal );
2941 }
2942 }
2943 } else {
2944 if ( opMode == OM_FILES_TO_FILES ) {
2945 IntNative retVal = remove ( outName );
2946 ERROR_IF_NOT_ZERO ( retVal );
2947 }
2948 }
2949
2950 if ( magicNumberOK ) {
2951 if (verbose)
2952 fprintf ( stderr, "done\n" );
2953 } else {
2954 if (verbose)
2955 fprintf ( stderr, "not a BZIP file, skipping.\n" ); else
2956 fprintf ( stderr,
2957 "%s: %s is not a BZIP file, skipping.\n",
2958 progName,
2959 opMode == OM_STDIN_TO_STDOUT ? "(stdin)" : inName );
2960 }
2961
2962 }
2963
2964
2965 /*---------------------------------------------*/
license(void)2966 void license ( void )
2967 {
2968 fprintf ( stderr,
2969
2970 " \n"
2971 " Copyright (C) 1996 by Julian Seward.\n"
2972 " \n"
2973 " This program is free software; you can redistribute it and/or modify\n"
2974 " it under the terms of the GNU General Public License as published by\n"
2975 " the Free Software Foundation; either version 2 of the License, or\n"
2976 " (at your option) any later version.\n"
2977 " \n"
2978 " This program is distributed in the hope that it will be useful,\n"
2979 " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
2980 " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
2981 " GNU General Public License for more details.\n"
2982 " \n"
2983 " You should have received a copy of the GNU General Public License\n"
2984 " along with this program; if not, write to the Free Software\n"
2985 " Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n"
2986 " \n"
2987 " The GNU General Public License is contained in the file LICENSE.\n"
2988 " \n"
2989 );
2990 }
2991
2992
2993 /*---------------------------------------------*/
usage(Char * fullProgName)2994 void usage ( Char *fullProgName )
2995 {
2996 fprintf (
2997 stderr,
2998 "\nusage: %s [flags and input files in any order]\n"
2999 "\n"
3000 " Flags: -d force decompression\n"
3001 " -f force compression\n"
3002 " -c output to standard out\n"
3003 " -v, -V be verbose, or very verbose\n"
3004 " -k keep (don't delete) input files\n"
3005 " -L display software license\n"
3006 " -1 .. -9 set block size of 100k .. 900k\n"
3007 "\n"
3008 " If invoked as `bzip', the default action is to compress.\n"
3009 " as `bunzip', the default action is to decompress.\n"
3010 "\n"
3011 " If no file names are given, bzip compresses or decompresses\n"
3012 " from standard input to standard output. You can combine\n"
3013 " flags, so `-v -e -4' means the same as -ve4 or -4ev, &c.\n"
3014 "\n"
3015 " The default block size is 900k, which soaks up a lot of\n"
3016 " memory for compression (7700k) and decompression (4500k).\n"
3017 " You may want to select a smaller block size; see the manual\n"
3018 " for details. Smaller sizes give slightly less compression.\n"
3019 " -e also saves memory during compression, at some speed cost.\n"
3020 "\n",
3021
3022 fullProgName
3023 );
3024 }
3025
3026
3027 /*---------------------------------------------*/
3028 /*--
3029 All the garbage from here to main() is purely to
3030 implement a linked list of command-line arguments,
3031 into which main() copies argv[1 .. argc-1].
3032
3033 The purpose of this ridiculous exercise is to
3034 facilitate the expansion of wildcard characters
3035 * and ? in filenames for halfwitted OSs like
3036 MSDOS, Windows 95 and NT ... yawn.
3037
3038 The actual Dirty Work is done by the platform-specific
3039 macro APPEND_FILESPEC.
3040 --*/
3041
3042 typedef
3043 struct zzzz {
3044 Char *name;
3045 struct zzzz *link;
3046 }
3047 Cell;
3048
3049
3050 /*---------------------------------------------*/
myMalloc(size_t n)3051 void *myMalloc ( size_t n )
3052 {
3053 void* p;
3054
3055 p = malloc ( n );
3056 if (p == NULL) {
3057 fprintf (
3058 stderr,
3059 "%s: `malloc' failed during processing of command-line args.\n",
3060 progName
3061 );
3062 exit ( 1 );
3063 }
3064 return p;
3065 }
3066
3067
3068 /*---------------------------------------------*/
mkCell(void)3069 Cell *mkCell ( void )
3070 {
3071 Cell *c;
3072
3073 c = (Cell*) myMalloc ( sizeof ( Cell ) );
3074 c->name = NULL;
3075 c->link = NULL;
3076 return c;
3077 }
3078
3079
3080 /*---------------------------------------------*/
snocString(Cell * root,Char * name)3081 Cell *snocString ( Cell *root, Char *name )
3082 {
3083 if (root == NULL) {
3084 Cell *tmp = mkCell();
3085 tmp->name = (Char*) myMalloc ( 5 + strlen(name) );
3086 strcpy ( tmp->name, name );
3087 return tmp;
3088 } else {
3089 Cell *tmp = root;
3090 while (tmp->link != NULL) tmp = tmp->link;
3091 tmp->link = snocString ( tmp->link, name );
3092 return root;
3093 }
3094 }
3095
3096
3097
3098 /*---------------------------------------------*/
main(IntNative argc,Char * argv[])3099 IntNative main ( IntNative argc, Char *argv[] )
3100 {
3101 Int32 numFileNames;
3102 Int32 i, j;
3103 Char *tmp;
3104 Cell *argList;
3105 Cell *aa;
3106
3107 outputHandleJustInCase = NULL;
3108 ftab = NULL;
3109 block = NULL;
3110 ll = NULL;
3111 words = NULL;
3112 zptr = NULL;
3113 bsInUse = False;
3114 errno = 0;
3115
3116 strcpy ( progNameReally, argv[0] );
3117 progName = &progNameReally[0];
3118 for (tmp = &progNameReally[0]; *tmp != '\0'; tmp++)
3119 if (*tmp == PATH_SEP) progName = tmp + 1;
3120
3121 argList = NULL;
3122 for (i = 1; i <= argc-1; i++)
3123 APPEND_FILESPEC(argList, argv[i]);
3124
3125 strcpy ( inName, "-" );
3126 strcpy ( outName, "-" );
3127
3128
3129 signal (SIGINT, mySignalCatcher);
3130 signal (SIGTERM, mySignalCatcher);
3131 signal (SIGSEGV, mySIGSEGVorSIGBUScatcher);
3132 #if BZ_UNIX
3133 signal (SIGHUP, mySignalCatcher);
3134 signal (SIGBUS, mySIGSEGVorSIGBUScatcher);
3135 #endif
3136
3137 #if DEBUG
3138 if ( ! (argc > 1 && strcmp ( "-Q", argv[1] ) == 0) )
3139 fprintf ( stderr, "BZIP: *** compiled with debugging ON ***\n" );
3140 #endif
3141
3142 if (sizeof(Int32) != 4 || sizeof(UInt32) != 4 ||
3143 sizeof(Char) != 1 || sizeof(UChar) != 1) {
3144 fprintf ( stderr,
3145 "BZIP: I require sizeof(Int32) == 4 bytes and\n"
3146 "\tsizeof(Char) == 1 byte to run properly, sorry!\n"
3147 "\tProbably you can fix this by defining them correctly,\n"
3148 "\tand recompiling.\n" );
3149 exit(1);
3150 }
3151
3152 longestFileName = 7;
3153 numFileNames = 0;
3154 for (aa = argList; aa != NULL; aa = aa->link)
3155 if (aa->name[0] != '-') {
3156 numFileNames++;
3157 if (longestFileName < (Int32)strlen(aa->name) )
3158 longestFileName = (Int32)strlen(aa->name);
3159 }
3160
3161 keepInputFiles = False;
3162 compressing = True;
3163 verbose = False;
3164 veryVerbose = False;
3165
3166 if (numFileNames == 0)
3167 opMode = OM_STDIN_TO_STDOUT; else
3168 opMode = OM_FILES_TO_FILES;
3169
3170 if ( (strcmp ( "bunzip", progName ) == 0) ||
3171 (strcmp ( "BUNZIP", progName ) == 0) ||
3172 (strcmp ( "bunzip.exe", progName ) == 0) ||
3173 (strcmp ( "BUNZIP.EXE", progName ) == 0) )
3174 compressing = False;
3175
3176 if (compressing) blockSize100k = 9;
3177
3178 for (aa = argList; aa != NULL; aa = aa->link)
3179 if (aa->name[0] == '-')
3180 for (j = 1; aa->name[j] != '\0'; j++)
3181 switch (aa->name[j]) {
3182 case 'Q': break;
3183 case 'c': opMode = OM_FILE_TO_STDOUT; break;
3184 case 'd': compressing = False; break;
3185 case 'f': compressing = True; break;
3186 case 'v': verbose = True; break;
3187 case 'k': keepInputFiles = True; break;
3188 case '1': blockSize100k = 1; break;
3189 case '2': blockSize100k = 2; break;
3190 case '3': blockSize100k = 3; break;
3191 case '4': blockSize100k = 4; break;
3192 case '5': blockSize100k = 5; break;
3193 case '6': blockSize100k = 6; break;
3194 case '7': blockSize100k = 7; break;
3195 case '8': blockSize100k = 8; break;
3196 case '9': blockSize100k = 9; break;
3197 case 'L': license(); break;
3198 case 'V': verbose = True;
3199 veryVerbose = True; break;
3200 default: fprintf ( stderr, "%s: Bad flag `%s'\n",
3201 progName, aa->name );
3202 usage ( progName );
3203 exit ( 1 );
3204 break;
3205 }
3206
3207 if (verbose) {
3208 fprintf ( stderr,
3209 "BZIP, a block-sorting file compressor. "
3210 "Version 0.21, 25-August-96.\n" );
3211 }
3212
3213 if ( opMode == OM_FILE_TO_STDOUT && numFileNames != 1) {
3214 fprintf ( stderr, "%s: Option -c requires you to supply exactly one filename.\n",
3215 progName );
3216 exit ( 1 );
3217 }
3218
3219 if ( !compressing ) blockSize100k = 0;
3220
3221 if (compressing) {
3222 allocateCompressStructures();
3223 if (opMode == OM_STDIN_TO_STDOUT)
3224 compress ( "-" );
3225 else
3226 for (aa = argList; aa != NULL; aa = aa->link)
3227 if (aa->name[0] != '-') compress ( aa->name );
3228 } else {
3229 if (opMode == OM_STDIN_TO_STDOUT)
3230 uncompress ( "-" );
3231 else
3232 for (aa = argList; aa != NULL; aa = aa->link)
3233 if (aa->name[0] != '-') uncompress ( aa->name );
3234 }
3235
3236 exit ( 0 );
3237 }
3238
3239
3240 /*-----------------------------------------------------------*/
3241 /*--- end bzip.c ---*/
3242 /*-----------------------------------------------------------*/
3243