xref: /minix/common/dist/zlib/examples/gzjoin.c (revision 44bedb31)
1 /*	$NetBSD: gzjoin.c,v 1.1.1.1 2006/01/14 20:11:09 christos Exp $	*/
2 
3 /* gzjoin -- command to join gzip files into one gzip file
4 
5   Copyright (C) 2004 Mark Adler, all rights reserved
6   version 1.0, 11 Dec 2004
7 
8   This software is provided 'as-is', without any express or implied
9   warranty.  In no event will the author be held liable for any damages
10   arising from the use of this software.
11 
12   Permission is granted to anyone to use this software for any purpose,
13   including commercial applications, and to alter it and redistribute it
14   freely, subject to the following restrictions:
15 
16   1. The origin of this software must not be misrepresented; you must not
17      claim that you wrote the original software. If you use this software
18      in a product, an acknowledgment in the product documentation would be
19      appreciated but is not required.
20   2. Altered source versions must be plainly marked as such, and must not be
21      misrepresented as being the original software.
22   3. This notice may not be removed or altered from any source distribution.
23 
24   Mark Adler    madler@alumni.caltech.edu
25  */
26 
27 /*
28  * Change history:
29  *
30  * 1.0  11 Dec 2004     - First version
31  * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
32  */
33 
34 /*
35    gzjoin takes one or more gzip files on the command line and writes out a
36    single gzip file that will uncompress to the concatenation of the
37    uncompressed data from the individual gzip files.  gzjoin does this without
38    having to recompress any of the data and without having to calculate a new
39    crc32 for the concatenated uncompressed data.  gzjoin does however have to
40    decompress all of the input data in order to find the bits in the compressed
41    data that need to be modified to concatenate the streams.
42 
43    gzjoin does not do an integrity check on the input gzip files other than
44    checking the gzip header and decompressing the compressed data.  They are
45    otherwise assumed to be complete and correct.
46 
47    Each joint between gzip files removes at least 18 bytes of previous trailer
48    and subsequent header, and inserts an average of about three bytes to the
49    compressed data in order to connect the streams.  The output gzip file
50    has a minimal ten-byte gzip header with no file name or modification time.
51 
52    This program was written to illustrate the use of the Z_BLOCK option of
53    inflate() and the crc32_combine() function.  gzjoin will not compile with
54    versions of zlib earlier than 1.2.3.
55  */
56 
57 #include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
58 #include <stdlib.h>     /* exit(), malloc(), free() */
59 #include <fcntl.h>      /* open() */
60 #include <unistd.h>     /* close(), read(), lseek() */
61 #include "zlib.h"
62     /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
63 
64 #define local static
65 
66 /* exit with an error (return a value to allow use in an expression) */
bail(char * why1,char * why2)67 local int bail(char *why1, char *why2)
68 {
69     fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
70     exit(1);
71     return 0;
72 }
73 
74 /* -- simple buffered file input with access to the buffer -- */
75 
/* size in bytes of the input buffer; must be a power of two and fit in
   unsigned */
#define CHUNK 32768

/* state of one buffered input file */
typedef struct {
    char *name;             /* file name, kept for error messages */
    int fd;                 /* descriptor of the open file */
    unsigned left;          /* number of unread bytes starting at next */
    unsigned char *next;    /* next unread byte */
    unsigned char *buf;     /* allocated buffer, CHUNK bytes long */
} bin;
86 
87 /* close a buffered file and free allocated memory */
bclose(bin * in)88 local void bclose(bin *in)
89 {
90     if (in != NULL) {
91         if (in->fd != -1)
92             close(in->fd);
93         if (in->buf != NULL)
94             free(in->buf);
95         free(in);
96     }
97 }
98 
99 /* open a buffered file for input, return a pointer to type bin, or NULL on
100    failure */
bopen(char * name)101 local bin *bopen(char *name)
102 {
103     bin *in;
104 
105     in = malloc(sizeof(bin));
106     if (in == NULL)
107         return NULL;
108     in->buf = malloc(CHUNK);
109     in->fd = open(name, O_RDONLY, 0);
110     if (in->buf == NULL || in->fd == -1) {
111         bclose(in);
112         return NULL;
113     }
114     in->left = 0;
115     in->next = in->buf;
116     in->name = name;
117     return in;
118 }
119 
120 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
121    1 indicating that end-of-file was reached */
bload(bin * in)122 local int bload(bin *in)
123 {
124     long len;
125 
126     if (in == NULL)
127         return -1;
128     if (in->left != 0)
129         return 0;
130     in->next = in->buf;
131     do {
132         len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
133         if (len < 0)
134             return -1;
135         in->left += (unsigned)len;
136     } while (len != 0 && in->left < CHUNK);
137     return len == 0 ? 1 : 0;
138 }
139 
/* get a byte from the file, bail if end of file.  The argument is expanded
   several times, so it must be an expression without side effects; it is
   parenthesized so that any pointer expression can be passed. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
144 
145 /* get a four-byte little-endian unsigned integer from file */
bget4(bin * in)146 local unsigned long bget4(bin *in)
147 {
148     unsigned long val;
149 
150     val = bget(in);
151     val += (unsigned long)(bget(in)) << 8;
152     val += (unsigned long)(bget(in)) << 16;
153     val += (unsigned long)(bget(in)) << 24;
154     return val;
155 }
156 
157 /* skip bytes in file */
bskip(bin * in,unsigned skip)158 local void bskip(bin *in, unsigned skip)
159 {
160     /* check pointer */
161     if (in == NULL)
162         return;
163 
164     /* easy case -- skip bytes in buffer */
165     if (skip <= in->left) {
166         in->left -= skip;
167         in->next += skip;
168         return;
169     }
170 
171     /* skip what's in buffer, discard buffer contents */
172     skip -= in->left;
173     in->left = 0;
174 
175     /* seek past multiples of CHUNK bytes */
176     if (skip > CHUNK) {
177         unsigned left;
178 
179         left = skip & (CHUNK - 1);
180         if (left == 0) {
181             /* exact number of chunks: seek all the way minus one byte to check
182                for end-of-file with a read */
183             lseek(in->fd, skip - 1, SEEK_CUR);
184             if (read(in->fd, in->buf, 1) != 1)
185                 bail("unexpected end of file on ", in->name);
186             return;
187         }
188 
189         /* skip the integral chunks, update skip with remainder */
190         lseek(in->fd, skip - left, SEEK_CUR);
191         skip = left;
192     }
193 
194     /* read more input and skip remainder */
195     bload(in);
196     if (skip > in->left)
197         bail("unexpected end of file on ", in->name);
198     in->left -= skip;
199     in->next += skip;
200 }
201 
202 /* -- end of buffered input functions -- */
203 
204 /* skip the gzip header from file in */
gzhead(bin * in)205 local void gzhead(bin *in)
206 {
207     int flags;
208 
209     /* verify gzip magic header and compression method */
210     if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
211         bail(in->name, " is not a valid gzip file");
212 
213     /* get and verify flags */
214     flags = bget(in);
215     if ((flags & 0xe0) != 0)
216         bail("unknown reserved bits set in ", in->name);
217 
218     /* skip modification time, extra flags, and os */
219     bskip(in, 6);
220 
221     /* skip extra field if present */
222     if (flags & 4) {
223         unsigned len;
224 
225         len = bget(in);
226         len += (unsigned)(bget(in)) << 8;
227         bskip(in, len);
228     }
229 
230     /* skip file name if present */
231     if (flags & 8)
232         while (bget(in) != 0)
233             ;
234 
235     /* skip comment if present */
236     if (flags & 16)
237         while (bget(in) != 0)
238             ;
239 
240     /* skip header crc if present */
241     if (flags & 2)
242         bskip(in, 2);
243 }
244 
245 /* write a four-byte little-endian unsigned integer to out */
put4(unsigned long val,FILE * out)246 local void put4(unsigned long val, FILE *out)
247 {
248     putc(val & 0xff, out);
249     putc((val >> 8) & 0xff, out);
250     putc((val >> 16) & 0xff, out);
251     putc((val >> 24) & 0xff, out);
252 }
253 
254 /* Load up zlib stream from buffered input, bail if end of file */
zpull(z_streamp strm,bin * in)255 local void zpull(z_streamp strm, bin *in)
256 {
257     if (in->left == 0)
258         bload(in);
259     if (in->left == 0)
260         bail("unexpected end of file on ", in->name);
261     strm->avail_in = in->left;
262     strm->next_in = in->next;
263 }
264 
265 /* Write header for gzip file to out and initialize trailer. */
gzinit(unsigned long * crc,unsigned long * tot,FILE * out)266 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
267 {
268     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
269     *crc = crc32(0L, Z_NULL, 0);
270     *tot = 0;
271 }
272 
273 /* Copy the compressed data from name, zeroing the last block bit of the last
274    block if clr is true, and adding empty blocks as needed to get to a byte
275    boundary.  If clr is false, then the last block becomes the last block of
276    the output, and the gzip trailer is written.  crc and tot maintains the
277    crc and length (modulo 2^32) of the output for the trailer.  The resulting
278    gzip file is written to out.  gzinit() must be called before the first call
279    of gzcopy() to write the gzip header and to initialize crc and tot. */
gzcopy(char * name,int clr,unsigned long * crc,unsigned long * tot,FILE * out)280 local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
281                   FILE *out)
282 {
283     int ret;                /* return value from zlib functions */
284     int pos;                /* where the "last block" bit is in byte */
285     int last;               /* true if processing the last block */
286     bin *in;                /* buffered input file */
287     unsigned char *start;   /* start of compressed data in buffer */
288     unsigned char *junk;    /* buffer for uncompressed data -- discarded */
289     z_off_t len;            /* length of uncompressed data (support > 4 GB) */
290     z_stream strm;          /* zlib inflate stream */
291 
292     /* open gzip file and skip header */
293     in = bopen(name);
294     if (in == NULL)
295         bail("could not open ", name);
296     gzhead(in);
297 
298     /* allocate buffer for uncompressed data and initialize raw inflate
299        stream */
300     junk = malloc(CHUNK);
301     strm.zalloc = Z_NULL;
302     strm.zfree = Z_NULL;
303     strm.opaque = Z_NULL;
304     strm.avail_in = 0;
305     strm.next_in = Z_NULL;
306     ret = inflateInit2(&strm, -15);
307     if (junk == NULL || ret != Z_OK)
308         bail("out of memory", "");
309 
310     /* inflate and copy compressed data, clear last-block bit if requested */
311     len = 0;
312     zpull(&strm, in);
313     start = strm.next_in;
314     last = start[0] & 1;
315     if (last && clr)
316         start[0] &= ~1;
317     strm.avail_out = 0;
318     for (;;) {
319         /* if input used and output done, write used input and get more */
320         if (strm.avail_in == 0 && strm.avail_out != 0) {
321             fwrite(start, 1, strm.next_in - start, out);
322             start = in->buf;
323             in->left = 0;
324             zpull(&strm, in);
325         }
326 
327         /* decompress -- return early when end-of-block reached */
328         strm.avail_out = CHUNK;
329         strm.next_out = junk;
330         ret = inflate(&strm, Z_BLOCK);
331         switch (ret) {
332         case Z_MEM_ERROR:
333             bail("out of memory", "");
334         case Z_DATA_ERROR:
335             bail("invalid compressed data in ", in->name);
336         }
337 
338         /* update length of uncompressed data */
339         len += CHUNK - strm.avail_out;
340 
341         /* check for block boundary (only get this when block copied out) */
342         if (strm.data_type & 128) {
343             /* if that was the last block, then done */
344             if (last)
345                 break;
346 
347             /* number of unused bits in last byte */
348             pos = strm.data_type & 7;
349 
350             /* find the next last-block bit */
351             if (pos != 0) {
352                 /* next last-block bit is in last used byte */
353                 pos = 0x100 >> pos;
354                 last = strm.next_in[-1] & pos;
355                 if (last && clr)
356                     strm.next_in[-1] &= ~pos;
357             }
358             else {
359                 /* next last-block bit is in next unused byte */
360                 if (strm.avail_in == 0) {
361                     /* don't have that byte yet -- get it */
362                     fwrite(start, 1, strm.next_in - start, out);
363                     start = in->buf;
364                     in->left = 0;
365                     zpull(&strm, in);
366                 }
367                 last = strm.next_in[0] & 1;
368                 if (last && clr)
369                     strm.next_in[0] &= ~1;
370             }
371         }
372     }
373 
374     /* update buffer with unused input */
375     in->left = strm.avail_in;
376     in->next = strm.next_in;
377 
378     /* copy used input, write empty blocks to get to byte boundary */
379     pos = strm.data_type & 7;
380     fwrite(start, 1, in->next - start - 1, out);
381     last = in->next[-1];
382     if (pos == 0 || !clr)
383         /* already at byte boundary, or last file: write last byte */
384         putc(last, out);
385     else {
386         /* append empty blocks to last byte */
387         last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
388         if (pos & 1) {
389             /* odd -- append an empty stored block */
390             putc(last, out);
391             if (pos == 1)
392                 putc(0, out);               /* two more bits in block header */
393             fwrite("\0\0\xff\xff", 1, 4, out);
394         }
395         else {
396             /* even -- append 1, 2, or 3 empty fixed blocks */
397             switch (pos) {
398             case 6:
399                 putc(last | 8, out);
400                 last = 0;
401             case 4:
402                 putc(last | 0x20, out);
403                 last = 0;
404             case 2:
405                 putc(last | 0x80, out);
406                 putc(0, out);
407             }
408         }
409     }
410 
411     /* update crc and tot */
412     *crc = crc32_combine(*crc, bget4(in), len);
413     *tot += (unsigned long)len;
414 
415     /* clean up */
416     inflateEnd(&strm);
417     free(junk);
418     bclose(in);
419 
420     /* write trailer if this is the last gzip file */
421     if (!clr) {
422         put4(*crc, out);
423         put4(*tot, out);
424     }
425 }
426 
/* join the gzip files on the command line, write result to stdout.  Returns 0
   after printing the usage message or after a successful join; every failure,
   including one that only shows up when stdout is flushed, exits with status 1
   through bail(). */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments (argc is negative here when the program was
       started with an empty argument vector) */
    if (argc <= 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout; argc is the number
       of files still to come, so it is zero (clr false) only for the last */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* make sure all of the output actually reached stdout */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}
451