1 /*-
2  * Copyright (c) 2008 Joerg Sonnenberger
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 /*-
27  * Copyright (c) 1985, 1986, 1992, 1993
28  *	The Regents of the University of California.  All rights reserved.
29  *
30  * This code is derived from software contributed to Berkeley by
31  * Diomidis Spinellis and James A. Woods, derived from original
32  * work by Spencer Thomas and Joseph Orost.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. Neither the name of the University nor the names of its contributors
43  *    may be used to endorse or promote products derived from this software
44  *    without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56  * SUCH DAMAGE.
57  */
58 
59 #include "archive_platform.h"
60 
61 __FBSDID("$FreeBSD: head/lib/libarchive/archive_write_set_compression_compress.c 201111 2009-12-28 03:33:05Z kientzle $");
62 
63 #ifdef HAVE_ERRNO_H
64 #include <errno.h>
65 #endif
66 #ifdef HAVE_STDLIB_H
67 #include <stdlib.h>
68 #endif
69 #ifdef HAVE_STRING_H
70 #include <string.h>
71 #endif
72 
73 #include "archive.h"
74 #include "archive_private.h"
75 #include "archive_write_private.h"
76 
77 #define	HSIZE		69001	/* 95% occupancy */
78 #define	HSHIFT		8	/* 8 - trunc(log2(HSIZE / 65536)) */
79 #define	CHECK_GAP 10000		/* Ratio check interval. */
80 
81 #define	MAXCODE(bits)	((1 << (bits)) - 1)
82 
83 /*
84  * the next two codes should not be changed lightly, as they must not
85  * lie within the contiguous general code space.
86  */
87 #define	FIRST	257		/* First free entry. */
88 #define	CLEAR	256		/* Table clear output code. */
89 
90 struct private_data {
91 	int64_t in_count, out_count, checkpoint;
92 
93 	int code_len;			/* Number of bits/code. */
94 	int cur_maxcode;		/* Maximum code, given n_bits. */
95 	int max_maxcode;		/* Should NEVER generate this code. */
96 	int hashtab [HSIZE];
97 	unsigned short codetab [HSIZE];
98 	int first_free;		/* First unused entry. */
99 	int compress_ratio;
100 
101 	int cur_code, cur_fcode;
102 
103 	int bit_offset;
104 	unsigned char bit_buf;
105 
106 	unsigned char	*compressed;
107 	size_t		 compressed_buffer_size;
108 	size_t		 compressed_offset;
109 };
110 
111 static int archive_compressor_compress_open(struct archive_write_filter *);
112 static int archive_compressor_compress_write(struct archive_write_filter *,
113 		    const void *, size_t);
114 static int archive_compressor_compress_close(struct archive_write_filter *);
115 static int archive_compressor_compress_free(struct archive_write_filter *);
116 
117 #if ARCHIVE_VERSION_NUMBER < 4000000
118 int
119 archive_write_set_compression_compress(struct archive *a)
120 {
121 	__archive_write_filters_free(a);
122 	return (archive_write_add_filter_compress(a));
123 }
124 #endif
125 
126 /*
127  * Add a compress filter to this write handle.
128  */
129 int
130 archive_write_add_filter_compress(struct archive *_a)
131 {
132 	struct archive_write *a = (struct archive_write *)_a;
133 	struct archive_write_filter *f = __archive_write_allocate_filter(_a);
134 
135 	archive_check_magic(&a->archive, ARCHIVE_WRITE_MAGIC,
136 	    ARCHIVE_STATE_NEW, "archive_write_add_filter_compress");
137 	f->open = &archive_compressor_compress_open;
138 	f->code = ARCHIVE_COMPRESSION_COMPRESS;
139 	f->name = "compress";
140 	return (ARCHIVE_OK);
141 }
142 
143 /*
144  * Setup callback.
145  */
146 static int
147 archive_compressor_compress_open(struct archive_write_filter *f)
148 {
149 	int ret;
150 	struct private_data *state;
151 
152 	f->code = ARCHIVE_COMPRESSION_COMPRESS;
153 	f->name = "compress";
154 
155 	ret = __archive_write_open_filter(f->next_filter);
156 	if (ret != ARCHIVE_OK)
157 		return (ret);
158 
159 	state = (struct private_data *)calloc(1, sizeof(*state));
160 	if (state == NULL) {
161 		archive_set_error(f->archive, ENOMEM,
162 		    "Can't allocate data for compression");
163 		return (ARCHIVE_FATAL);
164 	}
165 
166 	state->compressed_buffer_size = 65536;
167 	state->compressed = malloc(state->compressed_buffer_size);
168 
169 	if (state->compressed == NULL) {
170 		archive_set_error(f->archive, ENOMEM,
171 		    "Can't allocate data for compression buffer");
172 		free(state);
173 		return (ARCHIVE_FATAL);
174 	}
175 
176 	f->write = archive_compressor_compress_write;
177 	f->close = archive_compressor_compress_close;
178 	f->free = archive_compressor_compress_free;
179 
180 	state->max_maxcode = 0x10000;	/* Should NEVER generate this code. */
181 	state->in_count = 0;		/* Length of input. */
182 	state->bit_buf = 0;
183 	state->bit_offset = 0;
184 	state->out_count = 3;		/* Includes 3-byte header mojo. */
185 	state->compress_ratio = 0;
186 	state->checkpoint = CHECK_GAP;
187 	state->code_len = 9;
188 	state->cur_maxcode = MAXCODE(state->code_len);
189 	state->first_free = FIRST;
190 
191 	memset(state->hashtab, 0xff, sizeof(state->hashtab));
192 
193 	/* Prime output buffer with a gzip header. */
194 	state->compressed[0] = 0x1f; /* Compress */
195 	state->compressed[1] = 0x9d;
196 	state->compressed[2] = 0x90; /* Block mode, 16bit max */
197 	state->compressed_offset = 3;
198 
199 	f->data = state;
200 	return (0);
201 }
202 
203 /*-
204  * Output the given code.
205  * Inputs:
206  * 	code:	A n_bits-bit integer.  If == -1, then EOF.  This assumes
207  *		that n_bits <= (long)wordsize - 1.
208  * Outputs:
209  * 	Outputs code to the file.
210  * Assumptions:
211  *	Chars are 8 bits long.
212  * Algorithm:
213  * 	Maintain a BITS character long buffer (so that 8 codes will
214  * fit in it exactly).  Use the VAX insv instruction to insert each
215  * code in turn.  When the buffer fills up empty it and start over.
216  */
217 
218 static const unsigned char rmask[9] =
219 	{0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
220 
221 static int
222 output_byte(struct archive_write_filter *f, unsigned char c)
223 {
224 	struct private_data *state = f->data;
225 
226 	state->compressed[state->compressed_offset++] = c;
227 	++state->out_count;
228 
229 	if (state->compressed_buffer_size == state->compressed_offset) {
230 		int ret = __archive_write_filter(f->next_filter,
231 		    state->compressed, state->compressed_buffer_size);
232 		if (ret != ARCHIVE_OK)
233 			return ARCHIVE_FATAL;
234 		state->compressed_offset = 0;
235 	}
236 
237 	return ARCHIVE_OK;
238 }
239 
240 static int
241 output_code(struct archive_write_filter *f, int ocode)
242 {
243 	struct private_data *state = f->data;
244 	int bits, ret, clear_flg, bit_offset;
245 
246 	clear_flg = ocode == CLEAR;
247 
248 	/*
249 	 * Since ocode is always >= 8 bits, only need to mask the first
250 	 * hunk on the left.
251 	 */
252 	bit_offset = state->bit_offset % 8;
253 	state->bit_buf |= (ocode << bit_offset) & 0xff;
254 	output_byte(f, state->bit_buf);
255 
256 	bits = state->code_len - (8 - bit_offset);
257 	ocode >>= 8 - bit_offset;
258 	/* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
259 	if (bits >= 8) {
260 		output_byte(f, ocode & 0xff);
261 		ocode >>= 8;
262 		bits -= 8;
263 	}
264 	/* Last bits. */
265 	state->bit_offset += state->code_len;
266 	state->bit_buf = ocode & rmask[bits];
267 	if (state->bit_offset == state->code_len * 8)
268 		state->bit_offset = 0;
269 
270 	/*
271 	 * If the next entry is going to be too big for the ocode size,
272 	 * then increase it, if possible.
273 	 */
274 	if (clear_flg || state->first_free > state->cur_maxcode) {
275 	       /*
276 		* Write the whole buffer, because the input side won't
277 		* discover the size increase until after it has read it.
278 		*/
279 		if (state->bit_offset > 0) {
280 			while (state->bit_offset < state->code_len * 8) {
281 				ret = output_byte(f, state->bit_buf);
282 				if (ret != ARCHIVE_OK)
283 					return ret;
284 				state->bit_offset += 8;
285 				state->bit_buf = 0;
286 			}
287 		}
288 		state->bit_buf = 0;
289 		state->bit_offset = 0;
290 
291 		if (clear_flg) {
292 			state->code_len = 9;
293 			state->cur_maxcode = MAXCODE(state->code_len);
294 		} else {
295 			state->code_len++;
296 			if (state->code_len == 16)
297 				state->cur_maxcode = state->max_maxcode;
298 			else
299 				state->cur_maxcode = MAXCODE(state->code_len);
300 		}
301 	}
302 
303 	return (ARCHIVE_OK);
304 }
305 
306 static int
307 output_flush(struct archive_write_filter *f)
308 {
309 	struct private_data *state = f->data;
310 	int ret;
311 
312 	/* At EOF, write the rest of the buffer. */
313 	if (state->bit_offset % 8) {
314 		state->code_len = (state->bit_offset % 8 + 7) / 8;
315 		ret = output_byte(f, state->bit_buf);
316 		if (ret != ARCHIVE_OK)
317 			return ret;
318 	}
319 
320 	return (ARCHIVE_OK);
321 }
322 
323 /*
324  * Write data to the compressed stream.
325  */
326 static int
327 archive_compressor_compress_write(struct archive_write_filter *f,
328     const void *buff, size_t length)
329 {
330 	struct private_data *state = (struct private_data *)f->data;
331 	int i;
332 	int ratio;
333 	int c, disp, ret;
334 	const unsigned char *bp;
335 
336 	if (length == 0)
337 		return ARCHIVE_OK;
338 
339 	bp = buff;
340 
341 	if (state->in_count == 0) {
342 		state->cur_code = *bp++;
343 		++state->in_count;
344 		--length;
345 	}
346 
347 	while (length--) {
348 		c = *bp++;
349 		state->in_count++;
350 		state->cur_fcode = (c << 16) + state->cur_code;
351 		i = ((c << HSHIFT) ^ state->cur_code);	/* Xor hashing. */
352 
353 		if (state->hashtab[i] == state->cur_fcode) {
354 			state->cur_code = state->codetab[i];
355 			continue;
356 		}
357 		if (state->hashtab[i] < 0)	/* Empty slot. */
358 			goto nomatch;
359 		/* Secondary hash (after G. Knott). */
360 		if (i == 0)
361 			disp = 1;
362 		else
363 			disp = HSIZE - i;
364  probe:
365 		if ((i -= disp) < 0)
366 			i += HSIZE;
367 
368 		if (state->hashtab[i] == state->cur_fcode) {
369 			state->cur_code = state->codetab[i];
370 			continue;
371 		}
372 		if (state->hashtab[i] >= 0)
373 			goto probe;
374  nomatch:
375 		ret = output_code(f, state->cur_code);
376 		if (ret != ARCHIVE_OK)
377 			return ret;
378 		state->cur_code = c;
379 		if (state->first_free < state->max_maxcode) {
380 			state->codetab[i] = state->first_free++;	/* code -> hashtable */
381 			state->hashtab[i] = state->cur_fcode;
382 			continue;
383 		}
384 		if (state->in_count < state->checkpoint)
385 			continue;
386 
387 		state->checkpoint = state->in_count + CHECK_GAP;
388 
389 		if (state->in_count <= 0x007fffff)
390 			ratio = state->in_count * 256 / state->out_count;
391 		else if ((ratio = state->out_count / 256) == 0)
392 			ratio = 0x7fffffff;
393 		else
394 			ratio = state->in_count / ratio;
395 
396 		if (ratio > state->compress_ratio)
397 			state->compress_ratio = ratio;
398 		else {
399 			state->compress_ratio = 0;
400 			memset(state->hashtab, 0xff, sizeof(state->hashtab));
401 			state->first_free = FIRST;
402 			ret = output_code(f, CLEAR);
403 			if (ret != ARCHIVE_OK)
404 				return ret;
405 		}
406 	}
407 
408 	return (ARCHIVE_OK);
409 }
410 
411 
412 /*
413  * Finish the compression...
414  */
415 static int
416 archive_compressor_compress_close(struct archive_write_filter *f)
417 {
418 	struct private_data *state = (struct private_data *)f->data;
419 	int ret, ret2;
420 
421 	ret = output_code(f, state->cur_code);
422 	if (ret != ARCHIVE_OK)
423 		goto cleanup;
424 	ret = output_flush(f);
425 	if (ret != ARCHIVE_OK)
426 		goto cleanup;
427 
428 	/* Write the last block */
429 	ret = __archive_write_filter(f->next_filter,
430 	    state->compressed, state->compressed_offset);
431 cleanup:
432 	ret2 = __archive_write_close_filter(f->next_filter);
433 	if (ret > ret2)
434 		ret = ret2;
435 	free(state->compressed);
436 	free(state);
437 	return (ret);
438 }
439 
440 static int
441 archive_compressor_compress_free(struct archive_write_filter *f)
442 {
443 	(void)f; /* UNUSED */
444 	return (ARCHIVE_OK);
445 }
446