xref: /freebsd/usr.bin/mkuzip/mkuzip.c (revision 4f52dfbb)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2004-2016 Maxim Sobolev <sobomax@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/types.h>
33 #include <sys/endian.h>
34 #include <sys/param.h>
35 #include <sys/sysctl.h>
36 #include <sys/stat.h>
37 #include <sys/uio.h>
38 #include <netinet/in.h>
39 #include <assert.h>
40 #include <ctype.h>
41 #include <err.h>
42 #include <fcntl.h>
43 #include <pthread.h>
44 #include <signal.h>
45 #include <stdint.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50 
51 #include "mkuzip.h"
52 #include "mkuz_cloop.h"
53 #include "mkuz_blockcache.h"
54 #include "mkuz_zlib.h"
55 #include "mkuz_lzma.h"
56 #include "mkuz_blk.h"
57 #include "mkuz_cfg.h"
58 #include "mkuz_conveyor.h"
59 #include "mkuz_format.h"
60 #include "mkuz_fqueue.h"
61 #include "mkuz_time.h"
62 #include "mkuz_insize.h"
63 
64 #define DEFAULT_CLSTSIZE	16384
65 
66 static struct mkuz_format uzip_fmt = {
67 	.magic = CLOOP_MAGIC_ZLIB,
68 	.default_sufx = DEFAULT_SUFX_ZLIB,
69 	.f_init = &mkuz_zlib_init,
70 	.f_compress = &mkuz_zlib_compress
71 };
72 
73 static struct mkuz_format ulzma_fmt = {
74         .magic = CLOOP_MAGIC_LZMA,
75         .default_sufx = DEFAULT_SUFX_LZMA,
76         .f_init = &mkuz_lzma_init,
77         .f_compress = &mkuz_lzma_compress
78 };
79 
/* Helpers defined further down in this file. */
static struct mkuz_blk *readblock(int fd, u_int32_t clstsize);
static void usage(void);
static void cleanup(void);

/* Path that cleanup() unlinks at exit; NULL means there is nothing to remove. */
static char *cleanfile = NULL;
85 
86 static int
87 cmp_blkno(const struct mkuz_blk *bp, void *p)
88 {
89 	uint32_t *ap;
90 
91 	ap = (uint32_t *)p;
92 
93 	return (bp->info.blkno == *ap);
94 }
95 
96 int main(int argc, char **argv)
97 {
98 	struct mkuz_cfg cfs;
99 	char *oname;
100 	uint64_t *toc;
101 	int i, io, opt, tmp;
102 	struct {
103 		int en;
104 		FILE *f;
105 	} summary;
106 	struct iovec iov[2];
107 	uint64_t offset, last_offset;
108 	struct cloop_header hdr;
109 	struct mkuz_conveyor *cvp;
110         void *c_ctx;
111 	struct mkuz_blk_info *chit;
112 	size_t ncpusz, ncpu, magiclen;
113 	double st, et;
114 
115 	st = getdtime();
116 
117 	ncpusz = sizeof(size_t);
118 	if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, NULL, 0) < 0) {
119 		ncpu = 1;
120 	} else if (ncpu > MAX_WORKERS_AUTO) {
121 		ncpu = MAX_WORKERS_AUTO;
122 	}
123 
124 	memset(&hdr, 0, sizeof(hdr));
125 	cfs.blksz = DEFAULT_CLSTSIZE;
126 	oname = NULL;
127 	cfs.verbose = 0;
128 	cfs.no_zcomp = 0;
129 	cfs.en_dedup = 0;
130 	summary.en = 0;
131 	summary.f = stderr;
132 	cfs.handler = &uzip_fmt;
133 	cfs.nworkers = ncpu;
134 	struct mkuz_blk *iblk, *oblk;
135 
136 	while((opt = getopt(argc, argv, "o:s:vZdLSj:")) != -1) {
137 		switch(opt) {
138 		case 'o':
139 			oname = optarg;
140 			break;
141 
142 		case 's':
143 			tmp = atoi(optarg);
144 			if (tmp <= 0) {
145 				errx(1, "invalid cluster size specified: %s",
146 				    optarg);
147 				/* Not reached */
148 			}
149 			cfs.blksz = tmp;
150 			break;
151 
152 		case 'v':
153 			cfs.verbose = 1;
154 			break;
155 
156 		case 'Z':
157 			cfs.no_zcomp = 1;
158 			break;
159 
160 		case 'd':
161 			cfs.en_dedup = 1;
162 			break;
163 
164 		case 'L':
165 			cfs.handler = &ulzma_fmt;
166 			break;
167 
168 		case 'S':
169 			summary.en = 1;
170 			summary.f = stdout;
171 			break;
172 
173 		case 'j':
174 			tmp = atoi(optarg);
175 			if (tmp <= 0) {
176 				errx(1, "invalid number of compression threads"
177                                     " specified: %s", optarg);
178 				/* Not reached */
179 			}
180 			cfs.nworkers = tmp;
181 			break;
182 
183 		default:
184 			usage();
185 			/* Not reached */
186 		}
187 	}
188 	argc -= optind;
189 	argv += optind;
190 
191 	if (argc != 1) {
192 		usage();
193 		/* Not reached */
194 	}
195 
196 	magiclen = strlcpy(hdr.magic, cfs.handler->magic, sizeof(hdr.magic));
197 	assert(magiclen < sizeof(hdr.magic));
198 
199 	if (cfs.en_dedup != 0) {
200 		hdr.magic[CLOOP_OFS_VERSN] = CLOOP_MAJVER_3;
201 		hdr.magic[CLOOP_OFS_COMPR] =
202 		    tolower(hdr.magic[CLOOP_OFS_COMPR]);
203 	}
204 
205 	c_ctx = cfs.handler->f_init(cfs.blksz);
206 
207 	cfs.iname = argv[0];
208 	if (oname == NULL) {
209 		asprintf(&oname, "%s%s", cfs.iname, cfs.handler->default_sufx);
210 		if (oname == NULL) {
211 			err(1, "can't allocate memory");
212 			/* Not reached */
213 		}
214 	}
215 
216 	signal(SIGHUP, exit);
217 	signal(SIGINT, exit);
218 	signal(SIGTERM, exit);
219 	signal(SIGXCPU, exit);
220 	signal(SIGXFSZ, exit);
221 	atexit(cleanup);
222 
223 	cfs.fdr = open(cfs.iname, O_RDONLY);
224 	if (cfs.fdr < 0) {
225 		err(1, "open(%s)", cfs.iname);
226 		/* Not reached */
227 	}
228 	cfs.isize = mkuz_get_insize(&cfs);
229 	if (cfs.isize < 0) {
230 		errx(1, "can't determine input image size");
231 		/* Not reached */
232 	}
233 	hdr.nblocks = cfs.isize / cfs.blksz;
234 	if ((cfs.isize % cfs.blksz) != 0) {
235 		if (cfs.verbose != 0)
236 			fprintf(stderr, "file size is not multiple "
237 			"of %d, padding data\n", cfs.blksz);
238 		hdr.nblocks++;
239 	}
240 	toc = mkuz_safe_malloc((hdr.nblocks + 1) * sizeof(*toc));
241 
242 	cfs.fdw = open(oname, (cfs.en_dedup ? O_RDWR : O_WRONLY) | O_TRUNC | O_CREAT,
243 		   S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
244 	if (cfs.fdw < 0) {
245 		err(1, "open(%s)", oname);
246 		/* Not reached */
247 	}
248 	cleanfile = oname;
249 
250 	/* Prepare header that we will write later when we have index ready. */
251 	iov[0].iov_base = (char *)&hdr;
252 	iov[0].iov_len = sizeof(hdr);
253 	iov[1].iov_base = (char *)toc;
254 	iov[1].iov_len = (hdr.nblocks + 1) * sizeof(*toc);
255 	offset = iov[0].iov_len + iov[1].iov_len;
256 
257 	/* Reserve space for header */
258 	lseek(cfs.fdw, offset, SEEK_SET);
259 
260 	if (cfs.verbose != 0) {
261 		fprintf(stderr, "data size %ju bytes, number of clusters "
262 		    "%u, index length %zu bytes\n", cfs.isize,
263 		    hdr.nblocks, iov[1].iov_len);
264 	}
265 
266 	cvp = mkuz_conveyor_ctor(&cfs);
267 
268 	last_offset = 0;
269         iblk = oblk = NULL;
270 	for(i = io = 0; iblk != MKUZ_BLK_EOF; i++) {
271 		iblk = readblock(cfs.fdr, cfs.blksz);
272 		mkuz_fqueue_enq(cvp->wrk_queue, iblk);
273 		if (iblk != MKUZ_BLK_EOF &&
274 		    (i < (cfs.nworkers * ITEMS_PER_WORKER))) {
275 			continue;
276 		}
277 drain:
278 		oblk = mkuz_fqueue_deq_when(cvp->results, cmp_blkno, &io);
279 		assert(oblk->info.blkno == (unsigned)io);
280 		oblk->info.offset = offset;
281 		chit = NULL;
282 		if (cfs.en_dedup != 0 && oblk->info.len > 0) {
283 			chit = mkuz_blkcache_regblock(cfs.fdw, oblk);
284 			/*
285 			 * There should be at least one non-empty block
286 			 * between us and the backref'ed offset, otherwise
287 			 * we won't be able to parse that sequence correctly
288 			 * as it would be indistinguishible from another
289 			 * empty block.
290 			 */
291 			if (chit != NULL && chit->offset == last_offset) {
292 				chit = NULL;
293 			}
294 		}
295 		if (chit != NULL) {
296 			toc[io] = htobe64(chit->offset);
297 			oblk->info.len = 0;
298 		} else {
299 			if (oblk->info.len > 0 && write(cfs.fdw, oblk->data,
300 			    oblk->info.len) < 0) {
301 				err(1, "write(%s)", oname);
302 				/* Not reached */
303 			}
304 			toc[io] = htobe64(offset);
305 			last_offset = offset;
306 			offset += oblk->info.len;
307 		}
308 		if (cfs.verbose != 0) {
309 			fprintf(stderr, "cluster #%d, in %u bytes, "
310 			    "out len=%lu offset=%lu", io, cfs.blksz,
311 			    (u_long)oblk->info.len, (u_long)be64toh(toc[io]));
312 			if (chit != NULL) {
313 				fprintf(stderr, " (backref'ed to #%d)",
314 				    chit->blkno);
315 			}
316 			fprintf(stderr, "\n");
317 		}
318 		free(oblk);
319 		io += 1;
320 		if (iblk == MKUZ_BLK_EOF) {
321 			if (io < i)
322 				goto drain;
323 			/* Last block, see if we need to add some padding */
324 			if ((offset % DEV_BSIZE) == 0)
325 				continue;
326 			oblk = mkuz_blk_ctor(DEV_BSIZE - (offset % DEV_BSIZE));
327 			oblk->info.blkno = io;
328 			oblk->info.len = oblk->alen;
329 			if (cfs.verbose != 0) {
330 				fprintf(stderr, "padding data with %lu bytes "
331 				    "so that file size is multiple of %d\n",
332 				    (u_long)oblk->alen, DEV_BSIZE);
333 			}
334 			mkuz_fqueue_enq(cvp->results, oblk);
335 			goto drain;
336 		}
337 	}
338 
339 	close(cfs.fdr);
340 
341 	if (cfs.verbose != 0 || summary.en != 0) {
342 		et = getdtime();
343 		fprintf(summary.f, "compressed data to %ju bytes, saved %lld "
344 		    "bytes, %.2f%% decrease, %.2f bytes/sec.\n", offset,
345 		    (long long)(cfs.isize - offset),
346 		    100.0 * (long long)(cfs.isize - offset) /
347 		    (float)cfs.isize, (float)cfs.isize / (et - st));
348 	}
349 
350 	/* Convert to big endian */
351 	hdr.blksz = htonl(cfs.blksz);
352 	hdr.nblocks = htonl(hdr.nblocks);
353 	/* Write headers into pre-allocated space */
354 	lseek(cfs.fdw, 0, SEEK_SET);
355 	if (writev(cfs.fdw, iov, 2) < 0) {
356 		err(1, "writev(%s)", oname);
357 		/* Not reached */
358 	}
359 	cleanfile = NULL;
360 	close(cfs.fdw);
361 
362 	exit(0);
363 }
364 
365 static struct mkuz_blk *
366 readblock(int fd, u_int32_t clstsize)
367 {
368 	int numread;
369 	struct mkuz_blk *rval;
370 	static int blockcnt;
371 	off_t cpos;
372 
373 	rval = mkuz_blk_ctor(clstsize);
374 
375 	rval->info.blkno = blockcnt;
376 	blockcnt += 1;
377 	cpos = lseek(fd, 0, SEEK_CUR);
378 	if (cpos < 0) {
379 		err(1, "readblock: lseek() failed");
380 		/* Not reached */
381 	}
382 	rval->info.offset = cpos;
383 
384 	numread = read(fd, rval->data, clstsize);
385 	if (numread < 0) {
386 		err(1, "readblock: read() failed");
387 		/* Not reached */
388 	}
389 	if (numread == 0) {
390 		free(rval);
391 		return MKUZ_BLK_EOF;
392 	}
393 	rval->info.len = numread;
394 	return rval;
395 }
396 
/* Print the command-line synopsis on stderr and exit with status 1. */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: mkuzip [-vZdLS] [-o outfile] [-s cluster_size] "
	    "[-j ncompr] infile\n";

	fputs(synopsis, stderr);
	exit(1);
}
405 
/*
 * malloc(3) wrapper that never returns NULL: on allocation failure the
 * program is terminated through err() with exit status 1.
 */
void *
mkuz_safe_malloc(size_t size)
{
	void *mem = malloc(size);

	if (mem == NULL)
		err(1, "can't allocate memory");
		/* Not reached */
	return (mem);
}
418 
/*
 * Like mkuz_safe_malloc(), but the returned memory is cleared to zero
 * bytes before it is handed back.
 */
void *
mkuz_safe_zmalloc(size_t size)
{
	void *mem = mkuz_safe_malloc(size);

	memset(mem, 0, size);
	return (mem);
}
428 
429 static void
430 cleanup(void)
431 {
432 
433 	if (cleanfile != NULL)
434 		unlink(cleanfile);
435 }
436 
/*
 * Test whether every one of the size bytes starting at memory equals val.
 *
 * Returns 1 if they all do, 0 otherwise.  An empty range (size == 0)
 * trivially satisfies the condition and yields 1 without touching memory;
 * without that guard the first byte would be read out of range and
 * size - 1 would wrap around to SIZE_MAX.
 */
int
mkuz_memvcmp(const void *memory, unsigned char val, size_t size)
{
    const unsigned char *mm;

    if (size == 0)
        return (1);

    mm = (const unsigned char *)memory;
    /*
     * The first byte must equal val; comparing the buffer against itself
     * shifted by one byte then proves every following byte equals its
     * predecessor.
     */
    return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
}
445