1 /*-
2 * Copyright (c) 1985, 1986, 1992, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Diomidis Spinellis and James A. Woods, derived from original
7 * work by Spencer Thomas and Joseph Orost.
8 *
9 * %sccs.include.redist.c%
10 */
11
12 #if defined(LIBC_SCCS) && !defined(lint)
13 static char sccsid[] = "@(#)zopen.c 8.2 (Berkeley) 04/28/95";
14 #endif /* LIBC_SCCS and not lint */
15
16 /*-
17 * fcompress.c - File compression ala IEEE Computer, June 1984.
18 *
19 * Compress authors:
20 * Spencer W. Thomas (decvax!utah-cs!thomas)
21 * Jim McKie (decvax!mcvax!jim)
22 * Steve Davies (decvax!vax135!petsd!peora!srd)
23 * Ken Turkowski (decvax!decwrl!turtlevax!ken)
24 * James A. Woods (decvax!ihnp4!ames!jaw)
25 * Joe Orost (decvax!vax135!petsd!joe)
26 *
27 * Cleaned up and converted to library returning I/O streams by
28 * Diomidis Spinellis <dds@doc.ic.ac.uk>.
29 *
30 * zopen(filename, mode, bits)
31 * Returns a FILE * that can be used for read or write. The modes
32 * supported are only "r" and "w". Seeking is not allowed. On
33 * reading the file is decompressed, on writing it is compressed.
34 * The output is compatible with compress(1) with 16 bit tables.
35 * Any file produced by compress(1) can be read.
36 */
37
38 #include <sys/param.h>
39 #include <sys/stat.h>
40
41 #include <ctype.h>
42 #include <errno.h>
43 #include <signal.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48
49 #define BITS 16 /* Default bits. */
50 #define HSIZE 69001 /* 95% occupancy */
51
52 /* A code_int must be able to hold 2**BITS values of type int, and also -1. */
53 typedef long code_int;
54 typedef long count_int;
55
56 typedef u_char char_type;
57 static char_type magic_header[] =
58 {'\037', '\235'}; /* 1F 9D */
59
60 #define BIT_MASK 0x1f /* Defines for third byte of header. */
61 #define BLOCK_MASK 0x80
62
63 /*
64 * Masks 0x40 and 0x20 are free. I think 0x20 should mean that there is
65 * a fourth header byte (for expansion).
66 */
67 #define INIT_BITS 9 /* Initial number of bits/code. */
68
69 #define MAXCODE(n_bits) ((1 << (n_bits)) - 1)
70
71 struct s_zstate {
72 FILE *zs_fp; /* File stream for I/O */
73 char zs_mode; /* r or w */
74 enum {
75 S_START, S_MIDDLE, S_EOF
76 } zs_state; /* State of computation */
77 int zs_n_bits; /* Number of bits/code. */
78 int zs_maxbits; /* User settable max # bits/code. */
79 code_int zs_maxcode; /* Maximum code, given n_bits. */
80 code_int zs_maxmaxcode; /* Should NEVER generate this code. */
81 count_int zs_htab [HSIZE];
82 u_short zs_codetab [HSIZE];
83 code_int zs_hsize; /* For dynamic table sizing. */
84 code_int zs_free_ent; /* First unused entry. */
85 /*
86 * Block compression parameters -- after all codes are used up,
87 * and compression rate changes, start over.
88 */
89 int zs_block_compress;
90 int zs_clear_flg;
91 long zs_ratio;
92 count_int zs_checkpoint;
93 int zs_offset;
94 long zs_in_count; /* Length of input. */
95 long zs_bytes_out; /* Length of compressed output. */
96 long zs_out_count; /* # of codes output (for debugging). */
97 char_type zs_buf[BITS];
98 union {
99 struct {
100 long zs_fcode;
101 code_int zs_ent;
102 code_int zs_hsize_reg;
103 int zs_hshift;
104 } w; /* Write paramenters */
105 struct {
106 char_type *zs_stackp;
107 int zs_finchar;
108 code_int zs_code, zs_oldcode, zs_incode;
109 int zs_roffset, zs_size;
110 char_type zs_gbuf[BITS];
111 } r; /* Read parameters */
112 } u;
113 };
114
115 /* Definitions to retain old variable names */
116 #define fp zs->zs_fp
117 #define zmode zs->zs_mode
118 #define state zs->zs_state
119 #define n_bits zs->zs_n_bits
120 #define maxbits zs->zs_maxbits
121 #define maxcode zs->zs_maxcode
122 #define maxmaxcode zs->zs_maxmaxcode
123 #define htab zs->zs_htab
124 #define codetab zs->zs_codetab
125 #define hsize zs->zs_hsize
126 #define free_ent zs->zs_free_ent
127 #define block_compress zs->zs_block_compress
128 #define clear_flg zs->zs_clear_flg
129 #define ratio zs->zs_ratio
130 #define checkpoint zs->zs_checkpoint
131 #define offset zs->zs_offset
132 #define in_count zs->zs_in_count
133 #define bytes_out zs->zs_bytes_out
134 #define out_count zs->zs_out_count
135 #define buf zs->zs_buf
136 #define fcode zs->u.w.zs_fcode
137 #define hsize_reg zs->u.w.zs_hsize_reg
138 #define ent zs->u.w.zs_ent
139 #define hshift zs->u.w.zs_hshift
140 #define stackp zs->u.r.zs_stackp
141 #define finchar zs->u.r.zs_finchar
142 #define code zs->u.r.zs_code
143 #define oldcode zs->u.r.zs_oldcode
144 #define incode zs->u.r.zs_incode
145 #define roffset zs->u.r.zs_roffset
146 #define size zs->u.r.zs_size
147 #define gbuf zs->u.r.zs_gbuf
148
149 /*
150 * To save much memory, we overlay the table used by compress() with those
151 * used by decompress(). The tab_prefix table is the same size and type as
152 * the codetab. The tab_suffix table needs 2**BITS characters. We get this
153 * from the beginning of htab. The output stack uses the rest of htab, and
154 * contains characters. There is plenty of room for any possible stack
155 * (stack used to be 8000 characters).
156 */
157
158 #define htabof(i) htab[i]
159 #define codetabof(i) codetab[i]
160
161 #define tab_prefixof(i) codetabof(i)
162 #define tab_suffixof(i) ((char_type *)(htab))[i]
163 #define de_stack ((char_type *)&tab_suffixof(1 << BITS))
164
165 #define CHECK_GAP 10000 /* Ratio check interval. */
166
167 /*
168 * the next two codes should not be changed lightly, as they must not
169 * lie within the contiguous general code space.
170 */
171 #define FIRST 257 /* First free entry. */
172 #define CLEAR 256 /* Table clear output code. */
173
174 static int cl_block __P((struct s_zstate *));
175 static void cl_hash __P((struct s_zstate *, count_int));
176 static code_int getcode __P((struct s_zstate *));
177 static int output __P((struct s_zstate *, code_int));
178 static int zclose __P((void *));
179 static int zread __P((void *, char *, int));
180 static int zwrite __P((void *, const char *, int));
181
182 /*-
183 * Algorithm from "A Technique for High Performance Data Compression",
184 * Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19.
185 *
186 * Algorithm:
187 * Modified Lempel-Ziv method (LZW). Basically finds common
188 * substrings and replaces them with a variable size code. This is
189 * deterministic, and can be done on the fly. Thus, the decompression
190 * procedure needs no input table, but tracks the way the table was built.
191 */
192
193 /*-
194 * compress write
195 *
196 * Algorithm: use open addressing double hashing (no chaining) on the
197 * prefix code / next character combination. We do a variant of Knuth's
198 * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
199 * secondary probe. Here, the modular division first probe is gives way
200 * to a faster exclusive-or manipulation. Also do block compression with
201 * an adaptive reset, whereby the code table is cleared when the compression
202 * ratio decreases, but after the table fills. The variable-length output
203 * codes are re-sized at this point, and a special CLEAR code is generated
204 * for the decompressor. Late addition: construct the table according to
205 * file size for noticeable speed improvement on small files. Please direct
206 * questions about this implementation to ames!jaw.
207 */
208 static int
zwrite(cookie,wbp,num)209 zwrite(cookie, wbp, num)
210 void *cookie;
211 const char *wbp;
212 int num;
213 {
214 register code_int i;
215 register int c, disp;
216 struct s_zstate *zs;
217 const u_char *bp;
218 u_char tmp;
219 int count;
220
221 if (num == 0)
222 return (0);
223
224 zs = cookie;
225 count = num;
226 bp = (u_char *)wbp;
227 if (state == S_MIDDLE)
228 goto middle;
229 state = S_MIDDLE;
230
231 maxmaxcode = 1L << maxbits;
232 if (fwrite(magic_header,
233 sizeof(char), sizeof(magic_header), fp) != sizeof(magic_header))
234 return (-1);
235 tmp = (u_char)(maxbits | block_compress);
236 if (fwrite(&tmp, sizeof(char), sizeof(tmp), fp) != sizeof(tmp))
237 return (-1);
238
239 offset = 0;
240 bytes_out = 3; /* Includes 3-byte header mojo. */
241 out_count = 0;
242 clear_flg = 0;
243 ratio = 0;
244 in_count = 1;
245 checkpoint = CHECK_GAP;
246 maxcode = MAXCODE(n_bits = INIT_BITS);
247 free_ent = ((block_compress) ? FIRST : 256);
248
249 ent = *bp++;
250 --count;
251
252 hshift = 0;
253 for (fcode = (long)hsize; fcode < 65536L; fcode *= 2L)
254 hshift++;
255 hshift = 8 - hshift; /* Set hash code range bound. */
256
257 hsize_reg = hsize;
258 cl_hash(zs, (count_int)hsize_reg); /* Clear hash table. */
259
260 middle: for (i = 0; count--;) {
261 c = *bp++;
262 in_count++;
263 fcode = (long)(((long)c << maxbits) + ent);
264 i = ((c << hshift) ^ ent); /* Xor hashing. */
265
266 if (htabof(i) == fcode) {
267 ent = codetabof(i);
268 continue;
269 } else if ((long)htabof(i) < 0) /* Empty slot. */
270 goto nomatch;
271 disp = hsize_reg - i; /* Secondary hash (after G. Knott). */
272 if (i == 0)
273 disp = 1;
274 probe: if ((i -= disp) < 0)
275 i += hsize_reg;
276
277 if (htabof(i) == fcode) {
278 ent = codetabof(i);
279 continue;
280 }
281 if ((long)htabof(i) >= 0)
282 goto probe;
283 nomatch: if (output(zs, (code_int) ent) == -1)
284 return (-1);
285 out_count++;
286 ent = c;
287 if (free_ent < maxmaxcode) {
288 codetabof(i) = free_ent++; /* code -> hashtable */
289 htabof(i) = fcode;
290 } else if ((count_int)in_count >=
291 checkpoint && block_compress) {
292 if (cl_block(zs) == -1)
293 return (-1);
294 }
295 }
296 return (num);
297 }
298
299 static int
zclose(cookie)300 zclose(cookie)
301 void *cookie;
302 {
303 struct s_zstate *zs;
304 int rval;
305
306 zs = cookie;
307 if (zmode == 'w') { /* Put out the final code. */
308 if (output(zs, (code_int) ent) == -1) {
309 (void)fclose(fp);
310 free(zs);
311 return (-1);
312 }
313 out_count++;
314 if (output(zs, (code_int) - 1) == -1) {
315 (void)fclose(fp);
316 free(zs);
317 return (-1);
318 }
319 }
320 rval = fclose(fp) == EOF ? -1 : 0;
321 free(zs);
322 return (rval);
323 }
324
325 /*-
326 * Output the given code.
327 * Inputs:
328 * code: A n_bits-bit integer. If == -1, then EOF. This assumes
329 * that n_bits =< (long)wordsize - 1.
330 * Outputs:
331 * Outputs code to the file.
332 * Assumptions:
333 * Chars are 8 bits long.
334 * Algorithm:
335 * Maintain a BITS character long buffer (so that 8 codes will
336 * fit in it exactly). Use the VAX insv instruction to insert each
337 * code in turn. When the buffer fills up empty it and start over.
338 */
339
340 static char_type lmask[9] =
341 {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00};
342 static char_type rmask[9] =
343 {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
344
345 static int
output(zs,ocode)346 output(zs, ocode)
347 struct s_zstate *zs;
348 code_int ocode;
349 {
350 register int bits, r_off;
351 register char_type *bp;
352
353 r_off = offset;
354 bits = n_bits;
355 bp = buf;
356 if (ocode >= 0) {
357 /* Get to the first byte. */
358 bp += (r_off >> 3);
359 r_off &= 7;
360 /*
361 * Since ocode is always >= 8 bits, only need to mask the first
362 * hunk on the left.
363 */
364 *bp = (*bp & rmask[r_off]) | (ocode << r_off) & lmask[r_off];
365 bp++;
366 bits -= (8 - r_off);
367 ocode >>= 8 - r_off;
368 /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
369 if (bits >= 8) {
370 *bp++ = ocode;
371 ocode >>= 8;
372 bits -= 8;
373 }
374 /* Last bits. */
375 if (bits)
376 *bp = ocode;
377 offset += n_bits;
378 if (offset == (n_bits << 3)) {
379 bp = buf;
380 bits = n_bits;
381 bytes_out += bits;
382 if (fwrite(bp, sizeof(char), bits, fp) != bits)
383 return (-1);
384 bp += bits;
385 bits = 0;
386 offset = 0;
387 }
388 /*
389 * If the next entry is going to be too big for the ocode size,
390 * then increase it, if possible.
391 */
392 if (free_ent > maxcode || (clear_flg > 0)) {
393 /*
394 * Write the whole buffer, because the input side won't
395 * discover the size increase until after it has read it.
396 */
397 if (offset > 0) {
398 if (fwrite(buf, 1, n_bits, fp) != n_bits)
399 return (-1);
400 bytes_out += n_bits;
401 }
402 offset = 0;
403
404 if (clear_flg) {
405 maxcode = MAXCODE(n_bits = INIT_BITS);
406 clear_flg = 0;
407 } else {
408 n_bits++;
409 if (n_bits == maxbits)
410 maxcode = maxmaxcode;
411 else
412 maxcode = MAXCODE(n_bits);
413 }
414 }
415 } else {
416 /* At EOF, write the rest of the buffer. */
417 if (offset > 0) {
418 offset = (offset + 7) / 8;
419 if (fwrite(buf, 1, offset, fp) != offset)
420 return (-1);
421 bytes_out += offset;
422 }
423 offset = 0;
424 }
425 return (0);
426 }
427
428 /*
429 * Decompress read. This routine adapts to the codes in the file building
430 * the "string" table on-the-fly; requiring no table to be stored in the
431 * compressed file. The tables used herein are shared with those of the
432 * compress() routine. See the definitions above.
433 */
434 static int
zread(cookie,rbp,num)435 zread(cookie, rbp, num)
436 void *cookie;
437 char *rbp;
438 int num;
439 {
440 register u_int count;
441 struct s_zstate *zs;
442 u_char *bp, header[3];
443
444 if (num == 0)
445 return (0);
446
447 zs = cookie;
448 count = num;
449 bp = (u_char *)rbp;
450 switch (state) {
451 case S_START:
452 state = S_MIDDLE;
453 break;
454 case S_MIDDLE:
455 goto middle;
456 case S_EOF:
457 goto eof;
458 }
459
460 /* Check the magic number */
461 if (fread(header,
462 sizeof(char), sizeof(header), fp) != sizeof(header) ||
463 memcmp(header, magic_header, sizeof(magic_header)) != 0) {
464 errno = EFTYPE;
465 return (-1);
466 }
467 maxbits = header[2]; /* Set -b from file. */
468 block_compress = maxbits & BLOCK_MASK;
469 maxbits &= BIT_MASK;
470 maxmaxcode = 1L << maxbits;
471 if (maxbits > BITS) {
472 errno = EFTYPE;
473 return (-1);
474 }
475 /* As above, initialize the first 256 entries in the table. */
476 maxcode = MAXCODE(n_bits = INIT_BITS);
477 for (code = 255; code >= 0; code--) {
478 tab_prefixof(code) = 0;
479 tab_suffixof(code) = (char_type) code;
480 }
481 free_ent = block_compress ? FIRST : 256;
482
483 finchar = oldcode = getcode(zs);
484 if (oldcode == -1) /* EOF already? */
485 return (0); /* Get out of here */
486
487 /* First code must be 8 bits = char. */
488 *bp++ = (u_char)finchar;
489 count--;
490 stackp = de_stack;
491
492 while ((code = getcode(zs)) > -1) {
493
494 if ((code == CLEAR) && block_compress) {
495 for (code = 255; code >= 0; code--)
496 tab_prefixof(code) = 0;
497 clear_flg = 1;
498 free_ent = FIRST - 1;
499 if ((code = getcode(zs)) == -1) /* O, untimely death! */
500 break;
501 }
502 incode = code;
503
504 /* Special case for KwKwK string. */
505 if (code >= free_ent) {
506 *stackp++ = finchar;
507 code = oldcode;
508 }
509
510 /* Generate output characters in reverse order. */
511 while (code >= 256) {
512 *stackp++ = tab_suffixof(code);
513 code = tab_prefixof(code);
514 }
515 *stackp++ = finchar = tab_suffixof(code);
516
517 /* And put them out in forward order. */
518 middle: do {
519 if (count-- == 0)
520 return (num);
521 *bp++ = *--stackp;
522 } while (stackp > de_stack);
523
524 /* Generate the new entry. */
525 if ((code = free_ent) < maxmaxcode) {
526 tab_prefixof(code) = (u_short) oldcode;
527 tab_suffixof(code) = finchar;
528 free_ent = code + 1;
529 }
530
531 /* Remember previous code. */
532 oldcode = incode;
533 }
534 state = S_EOF;
535 eof: return (num - count);
536 }
537
538 /*-
539 * Read one code from the standard input. If EOF, return -1.
540 * Inputs:
541 * stdin
542 * Outputs:
543 * code or -1 is returned.
544 */
545 static code_int
getcode(zs)546 getcode(zs)
547 struct s_zstate *zs;
548 {
549 register code_int gcode;
550 register int r_off, bits;
551 register char_type *bp;
552
553 bp = gbuf;
554 if (clear_flg > 0 || roffset >= size || free_ent > maxcode) {
555 /*
556 * If the next entry will be too big for the current gcode
557 * size, then we must increase the size. This implies reading
558 * a new buffer full, too.
559 */
560 if (free_ent > maxcode) {
561 n_bits++;
562 if (n_bits == maxbits) /* Won't get any bigger now. */
563 maxcode = maxmaxcode;
564 else
565 maxcode = MAXCODE(n_bits);
566 }
567 if (clear_flg > 0) {
568 maxcode = MAXCODE(n_bits = INIT_BITS);
569 clear_flg = 0;
570 }
571 size = fread(gbuf, 1, n_bits, fp);
572 if (size <= 0) /* End of file. */
573 return (-1);
574 roffset = 0;
575 /* Round size down to integral number of codes. */
576 size = (size << 3) - (n_bits - 1);
577 }
578 r_off = roffset;
579 bits = n_bits;
580
581 /* Get to the first byte. */
582 bp += (r_off >> 3);
583 r_off &= 7;
584
585 /* Get first part (low order bits). */
586 gcode = (*bp++ >> r_off);
587 bits -= (8 - r_off);
588 r_off = 8 - r_off; /* Now, roffset into gcode word. */
589
590 /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
591 if (bits >= 8) {
592 gcode |= *bp++ << r_off;
593 r_off += 8;
594 bits -= 8;
595 }
596
597 /* High order bits. */
598 gcode |= (*bp & rmask[bits]) << r_off;
599 roffset += n_bits;
600
601 return (gcode);
602 }
603
604 static int
cl_block(zs)605 cl_block(zs) /* Table clear for block compress. */
606 struct s_zstate *zs;
607 {
608 register long rat;
609
610 checkpoint = in_count + CHECK_GAP;
611
612 if (in_count > 0x007fffff) { /* Shift will overflow. */
613 rat = bytes_out >> 8;
614 if (rat == 0) /* Don't divide by zero. */
615 rat = 0x7fffffff;
616 else
617 rat = in_count / rat;
618 } else
619 rat = (in_count << 8) / bytes_out; /* 8 fractional bits. */
620 if (rat > ratio)
621 ratio = rat;
622 else {
623 ratio = 0;
624 cl_hash(zs, (count_int) hsize);
625 free_ent = FIRST;
626 clear_flg = 1;
627 if (output(zs, (code_int) CLEAR) == -1)
628 return (-1);
629 }
630 return (0);
631 }
632
633 static void
cl_hash(zs,cl_hsize)634 cl_hash(zs, cl_hsize) /* Reset code table. */
635 struct s_zstate *zs;
636 register count_int cl_hsize;
637 {
638 register count_int *htab_p;
639 register long i, m1;
640
641 m1 = -1;
642 htab_p = htab + cl_hsize;
643 i = cl_hsize - 16;
644 do { /* Might use Sys V memset(3) here. */
645 *(htab_p - 16) = m1;
646 *(htab_p - 15) = m1;
647 *(htab_p - 14) = m1;
648 *(htab_p - 13) = m1;
649 *(htab_p - 12) = m1;
650 *(htab_p - 11) = m1;
651 *(htab_p - 10) = m1;
652 *(htab_p - 9) = m1;
653 *(htab_p - 8) = m1;
654 *(htab_p - 7) = m1;
655 *(htab_p - 6) = m1;
656 *(htab_p - 5) = m1;
657 *(htab_p - 4) = m1;
658 *(htab_p - 3) = m1;
659 *(htab_p - 2) = m1;
660 *(htab_p - 1) = m1;
661 htab_p -= 16;
662 } while ((i -= 16) >= 0);
663 for (i += 16; i > 0; i--)
664 *--htab_p = m1;
665 }
666
667 FILE *
zopen(fname,mode,bits)668 zopen(fname, mode, bits)
669 const char *fname, *mode;
670 int bits;
671 {
672 struct s_zstate *zs;
673
674 if (mode[0] != 'r' && mode[0] != 'w' || mode[1] != '\0' ||
675 bits < 0 || bits > BITS) {
676 errno = EINVAL;
677 return (NULL);
678 }
679
680 if ((zs = calloc(1, sizeof(struct s_zstate))) == NULL)
681 return (NULL);
682
683 maxbits = bits ? bits : BITS; /* User settable max # bits/code. */
684 maxmaxcode = 1 << maxbits; /* Should NEVER generate this code. */
685 hsize = HSIZE; /* For dynamic table sizing. */
686 free_ent = 0; /* First unused entry. */
687 block_compress = BLOCK_MASK;
688 clear_flg = 0;
689 ratio = 0;
690 checkpoint = CHECK_GAP;
691 in_count = 1; /* Length of input. */
692 out_count = 0; /* # of codes output (for debugging). */
693 state = S_START;
694 roffset = 0;
695 size = 0;
696
697 /*
698 * Layering compress on top of stdio in order to provide buffering,
699 * and ensure that reads and write work with the data specified.
700 */
701 if ((fp = fopen(fname, mode)) == NULL) {
702 free(zs);
703 return (NULL);
704 }
705 switch (*mode) {
706 case 'r':
707 zmode = 'r';
708 return (funopen(zs, zread, NULL, NULL, zclose));
709 case 'w':
710 zmode = 'w';
711 return (funopen(zs, NULL, zwrite, NULL, zclose));
712 }
713 /* NOTREACHED */
714 }
715