1 /* -*- indent-tabs-mode: nil -*-
2 *
3 * Copyright 2011-2016 Kubo Takehiro <kubo@jiubao.org>
4 *
5 * Redistribution and use in source and binary forms, with or without modification, are
6 * permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this list of
9 * conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
12 * of conditions and the following disclaimer in the documentation and/or other materials
13 * provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS OR IMPLIED
16 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
21 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
22 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
23 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 *
25 * The views and conclusions contained in the software and documentation are those of the
26 * authors and should not be interpreted as representing official policies, either expressed
27 * or implied, of the authors.
28 *
29 */
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <stdarg.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <limits.h>
40 #include <sys/types.h>
41 #include <fcntl.h>
42 #include <snappy-c.h>
43 #ifdef WIN32
44 /* Windows */
45 #include <windows.h>
46 #include <io.h>
47 #ifndef PATH_MAX
48 #define PATH_MAX MAX_PATH
49 #endif
50 #define PATH_DELIMITER '\\'
51 #define OPTIMIZE_SEQUENTIAL "S" /* flag to optimize sequential access */
52 #else
53 /* Unix */
54 #include <sys/time.h>
55 #include <sys/stat.h>
56 #include <unistd.h>
57 #define PATH_DELIMITER '/'
58 #define OPTIMIZE_SEQUENTIAL ""
59 #endif
60 #include "snzip.h"
61 #ifdef WIN32
62 #define stat _stati64
63 #define fstat _fstati64
64 #endif
65 #ifndef HAVE_GETOPT
66 #include "win32/ya_getopt.h"
67 #endif
68
/*
 * SNZ_ST_TIME_NSEC(statbuf, which) expands to the nanosecond part of a
 * timestamp stored in a struct stat.  'which' is the single-letter prefix
 * of the timestamp (this file uses 'a' and 'm').  The member name is picked
 * by the HAVE_STRUCT_STAT_* feature macros; when none of them is defined
 * the macro evaluates to 0.
 */
#if defined(HAVE_STRUCT_STAT_ST_MTIMENSEC)
#  define SNZ_ST_TIME_NSEC(statbuf, which) ((statbuf).st_##which##timensec)
#elif defined(HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
#  define SNZ_ST_TIME_NSEC(statbuf, which) ((statbuf).st_##which##tim.tv_nsec)
#elif defined(HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC)
#  define SNZ_ST_TIME_NSEC(statbuf, which) ((statbuf).st_##which##timespec.tv_nsec)
#else
#  define SNZ_ST_TIME_NSEC(statbuf, which) (0)
#endif
78
79 int64_t uncompressed_source_len = -1;
80 int32_t snzip_format_block_size;
81 uint32_t hadoop_snappy_source_length;
82 uint32_t hadoop_snappy_compressed_length;
83
84 static int trace_flag = FALSE;
85
86 static void copy_file_attributes(int infd, int outfd, const char *outfile);
87 static void show_usage(const char *progname, int exit_code);
88
89 static stream_format_t *stream_formats[] = {
90 &framing2_format,
91 &hadoop_snappy_format,
92 #ifdef SUPPORT_RAW_FORMAT
93 &raw_format,
94 #endif
95 &iwa_format,
96 &framing_format,
97 &snzip_format,
98 &snappy_java_format,
99 &snappy_in_java_format,
100 &comment_43_format,
101 };
102 #define NUM_OF_STREAM_FORMATS (sizeof(stream_formats)/sizeof(stream_formats[0]))
103
find_stream_format_by_name(const char * name)104 static stream_format_t *find_stream_format_by_name(const char *name)
105 {
106 int idx;
107 for (idx = 0; idx < NUM_OF_STREAM_FORMATS; idx++) {
108 if (strcmp(stream_formats[idx]->name, name) == 0) {
109 return stream_formats[idx];
110 }
111 }
112 return NULL;
113 }
114
find_stream_format_by_suffix(const char * suffix)115 static stream_format_t *find_stream_format_by_suffix(const char *suffix)
116 {
117 int idx;
118 for (idx = 0; idx < NUM_OF_STREAM_FORMATS; idx++) {
119 if (strcmp(stream_formats[idx]->suffix, suffix) == 0) {
120 return stream_formats[idx];
121 }
122 }
123 return NULL;
124 }
125
find_stream_format_by_file_header(FILE * fp)126 static stream_format_t *find_stream_format_by_file_header(FILE *fp)
127 {
128 /* framing {0xff, 0x06, 0x00, 's', 'N', 'a', 'P', 'p', 'Y'}
129 * framing2 {0xff, 0x06, 0x00, 0x00, 's', 'N', 'a', 'P', 'p', 'Y'}
130 * hadoop-snappy {--uncompressed length--,--compressed length--,
131 * snzip {'S', 'N', 'Z', 0x01, block_size}
132 * snappy-java {0x82, 'S', 'N', 'A', 'P', 'P', 'Y', 0x00}
133 * snappy-in-java {'s', 'n', 'a', 'p', 'p', 'y', 0x00}
134 */
135 union {
136 uint8_t buf[10];
137 uint32_t len[2];
138 } u;
139 size_t idx = 0;
140 int chr;
141 size_t max_compressed_length;
142
143 #define GETCHAR() ((u.buf[idx++] = chr = getc(fp)))
144 #define CHK(chr) if (GETCHAR() != (chr)) goto error
145 switch (GETCHAR()) {
146 case 0xff:
147 CHK(0x06); CHK(0x00);
148 switch (GETCHAR()) {
149 case 's':
150 switch (GETCHAR()) {
151 case 'N':
152 CHK('a'); CHK('P'); CHK('p'); CHK('Y');
153 return &framing_format;
154 case 'n':
155 CHK('a'); CHK('p'); CHK('p'); CHK('y');
156 return &comment_43_format;
157 }
158 break;
159 case 0x00:
160 CHK('s'); CHK('N'); CHK('a'); CHK('P'); CHK('p'); CHK('Y');
161 return &framing2_format;
162 }
163 break;
164 case 'S':
165 CHK('N'); CHK('Z'); CHK(0x01);
166 snzip_format_block_size = GETCHAR();
167 if (snzip_format_block_size <= 0) {
168 goto error;
169 }
170 return &snzip_format;
171 case 0x82:
172 CHK('S'); CHK('N'); CHK('A'); CHK('P'); CHK('P'); CHK('Y'); CHK(0x00);
173 return &snappy_java_format;
174 case 's':
175 CHK('n'); CHK('a'); CHK('p'); CHK('p'); CHK('y'); CHK(0x00);
176 return &snappy_in_java_format;
177 }
178 error:
179 while (idx < sizeof(u.len)) {
180 GETCHAR();
181 }
182 hadoop_snappy_source_length = SNZ_FROM_BE32(u.len[0]);
183 hadoop_snappy_compressed_length = SNZ_FROM_BE32(u.len[1]);
184 max_compressed_length = snappy_max_compressed_length(hadoop_snappy_max_input_size(0));
185 if (hadoop_snappy_compressed_length <= max_compressed_length) {
186 /* This may be hadoop-snappy format */
187 return &hadoop_snappy_format;
188 }
189 fprintf(stderr, "Unknown file header\n");
190 return NULL;
191 }
192
/*
 * Source location printed as the "file:line: " prefix by print_error_()
 * and trace_().  NOTE(review): presumably updated by the print_error/trace
 * wrapper macros before each call -- confirm in snzip.h.
 */
const char *trc_filename = __FILE__;
int trc_lineno;
195
print_error_(const char * fmt,...)196 void print_error_(const char *fmt, ...)
197 {
198 va_list ap;
199
200 if (trace_flag) {
201 fprintf(stderr, "%s:%3d: ", trc_filename, trc_lineno);
202 }
203 va_start(ap, fmt);
204 vfprintf(stderr, fmt, ap);
205 va_end(ap);
206 }
207
trace_(const char * fmt,...)208 void trace_(const char *fmt, ...)
209 {
210 va_list ap;
211
212 if (!trace_flag) {
213 return;
214 }
215 fprintf(stderr, "%s:%3d: ", trc_filename, trc_lineno);
216 va_start(ap, fmt);
217 vfprintf(stderr, fmt, ap);
218 va_end(ap);
219 }
220
main(int argc,char ** argv)221 int main(int argc, char **argv)
222 {
223 int opt;
224 int opt_uncompress = FALSE;
225 int opt_keep = FALSE;
226 int opt_stdout = FALSE;
227 int block_size = 0;
228 size_t rsize = 0;
229 size_t wsize = 0;
230 const char *format_name = NULL;
231 stream_format_t *fmt = &DEFAULT_FORMAT;
232
233 char *progname = strrchr(argv[0], PATH_DELIMITER);
234 if (progname != NULL) {
235 progname++;
236 } else {
237 progname = argv[0];
238 }
239
240 trace("progname = %s\n", progname);
241 if (strstr(progname, "un") != NULL) {
242 trace("\"un\" is found in %s\n", progname);
243 opt_uncompress = TRUE;
244 }
245 if (strstr(progname, "cat") != NULL) {
246 trace("\"cat\" is found in %s\n", progname);
247 opt_stdout = TRUE;
248 opt_uncompress = TRUE;
249 opt_keep = TRUE;
250 }
251
252 while ((opt = getopt(argc, argv, "cdkt:hs:b:B:R:W:T")) != -1) {
253 char *endptr;
254
255 switch (opt) {
256 case 'c':
257 opt_stdout = TRUE;
258 opt_keep = TRUE;
259 break;
260 case 'd':
261 opt_uncompress = TRUE;
262 break;
263 case 'k':
264 opt_keep = TRUE;
265 break;
266 case 't':
267 format_name = optarg;
268 break;
269 case 'h':
270 show_usage(progname, 0);
271 break;
272 case 's':
273 uncompressed_source_len = strtoull(optarg, &endptr, 10);
274 if (*endptr != '\0') {
275 fprintf(stderr, "Invalid -s format: %s\n", optarg);
276 return 1;
277 }
278 break;
279 case 'b':
280 block_size = atoi(optarg);
281 break;
282 case 'B':
283 block_size = 1ul << atoi(optarg);
284 break;
285 case 'R':
286 rsize = strtoul(optarg, NULL, 10);
287 break;
288 case 'W':
289 wsize = strtoul(optarg, NULL, 10);
290 break;
291 case 'T':
292 trace_flag = TRUE;
293 break;
294 case '?':
295 show_usage(progname, 1);
296 break;
297 }
298 }
299
300 #ifdef WIN32
301 _setmode(0, _O_BINARY);
302 _setmode(1, _O_BINARY);
303 #endif
304
305 if (format_name != NULL) {
306 fmt = find_stream_format_by_name(format_name);
307 if (fmt == NULL) {
308 fprintf(stderr, "Unknown file format name %s\n", format_name);
309 return 1;
310 }
311 }
312
313 if (optind == argc) {
314 trace("no arguments are set.\n");
315
316 if (opt_uncompress) {
317 int skip_magic = 0;
318 if (format_name == NULL) {
319 fmt = find_stream_format_by_file_header(stdin);
320 if (fmt == NULL) {
321 return 1;
322 }
323 skip_magic = 1;
324 }
325 return fmt->uncompress(stdin, stdout, skip_magic);
326 } else {
327 if (isatty(1)) {
328 /* stdout is a terminal */
329 fprintf(stderr, "I won't write compressed data to a terminal.\n");
330 fprintf(stderr, "For help, type: '%s -h'.\n", progname);
331 return 1;
332 }
333 return fmt->compress(stdin, stdout, block_size);
334 }
335 }
336
337 while (optind < argc) {
338 char *infile = argv[optind++];
339 size_t infilelen = strlen(infile);
340 char outfile[PATH_MAX];
341 FILE *infp;
342 FILE *outfp;
343 int skip_magic = 0;
344
345 /* check input file and open it. */
346 const char *suffix = strrchr(infile, '.');
347 if (suffix != NULL) {
348 stream_format_t *fmt_tmp = find_stream_format_by_suffix(suffix + 1);
349 if (fmt_tmp == NULL && opt_uncompress) {
350 print_error("%s has unknown suffix.\n", infile);
351 continue;
352 }
353 if (fmt_tmp != NULL && !opt_uncompress) {
354 print_error("%s already has %s suffix\n", infile, fmt_tmp->suffix);
355 continue;
356 }
357 }
358
359 infp = fopen(infile, "rb" OPTIMIZE_SEQUENTIAL);
360 if (infp == NULL) {
361 print_error("Failed to open %s for read\n", infile);
362 exit(1);
363 }
364 if (rsize != 0) {
365 trace("setvbuf(infp, NULL, _IOFBF, %ld)\n", (long)rsize);
366 setvbuf(infp, NULL, _IOFBF, rsize);
367 }
368 #ifdef HAVE_POSIX_FADVISE
369 posix_fadvise(fileno(infp), 0, 0, POSIX_FADV_SEQUENTIAL);
370 #endif
371
372 /* determine the file format */
373 if (opt_uncompress) {
374 if (format_name == NULL) {
375 fmt = find_stream_format_by_file_header(infp);
376 skip_magic = 1;
377 }
378 if (fmt == NULL) {
379 exit(1);
380 }
381 }
382
383 /* check output file and open it. */
384 if (opt_stdout) {
385 strcpy(outfile, "-");
386 outfp = stdout;
387 } else {
388 size_t suffixlen = strlen(fmt->suffix);
389 if (opt_uncompress) {
390 /* check suffix */
391 const char *suffix = strrchr(infile, '.');
392 int remove_suffix = (suffix != NULL && strcmp(suffix + 1, fmt->suffix) == 0);
393 size_t new_size = infilelen + (remove_suffix ? (- suffixlen) : 4);
394 if (new_size >= sizeof(outfile)) {
395 print_error("%s has too long file name.\n", infile);
396 exit(1);
397 }
398 if (remove_suffix) {
399 memcpy(outfile, infile, infilelen - suffixlen - 1);
400 outfile[infilelen - suffixlen - 1] = '\0';
401 } else {
402 fprintf(stderr, "%s: Can't guess original name for %s -- using %s.out\n",
403 progname, infile, infile);
404 snprintf(outfile, sizeof(outfile), "%s.out", infile);
405 }
406 } else {
407 if (infilelen + suffixlen + 2 >= sizeof(outfile)) {
408 print_error("%s has too long file name.\n", infile);
409 exit(1);
410 }
411 sprintf(outfile, "%s.%s", infile, fmt->suffix);
412 }
413 outfp = fopen(outfile, "wb" OPTIMIZE_SEQUENTIAL);
414 if (outfp == NULL) {
415 print_error("Failed to open %s for write\n", outfile);
416 exit(1);
417 }
418 }
419 if (wsize != 0) {
420 trace("setvbuf(outfp, NULL, _IOFBF, %ld)\n", (long)wsize);
421 setvbuf(outfp, NULL, _IOFBF, wsize);
422 }
423
424 if (opt_uncompress) {
425 trace("uncompress %s\n", infile);
426 if (fmt->uncompress(infp, outfp, skip_magic) != 0) {
427 if (outfp != stdout) {
428 unlink(outfile);
429 }
430 return 1;
431 }
432 } else {
433 trace("compress %s\n", infile);
434 if (fmt->compress(infp, outfp, block_size) != 0) {
435 if (outfp != stdout) {
436 unlink(outfile);
437 }
438 return 1;
439 }
440 }
441
442 if (!opt_stdout) {
443 fflush(outfp);
444 copy_file_attributes(fileno(infp), fileno(outfp), outfile);
445 }
446
447 fclose(infp);
448 if (outfp != stdout) {
449 fclose(outfp);
450 }
451
452 if (!opt_keep) {
453 int rv = unlink(infile);
454 trace("unlink(\"%s\") => %d (errno = %d)\n",
455 infile, rv, rv ? errno : 0);
456 }
457 }
458 return 0;
459 }
460
copy_file_attributes(int infd,int outfd,const char * outfile)461 static void copy_file_attributes(int infd, int outfd, const char *outfile)
462 {
463 #ifdef WIN32
464 BY_HANDLE_FILE_INFORMATION fi;
465 BOOL bOk;
466
467 bOk = GetFileInformationByHandle((HANDLE)_get_osfhandle(infd), &fi);
468 trace("GetFileInformationByHandle(...) => %s\n", bOk ? "TRUE" : "FALSE");
469 if (bOk) {
470 bOk = SetFileTime((HANDLE)_get_osfhandle(outfd), NULL, &fi.ftLastAccessTime, &fi.ftLastWriteTime);
471 trace("SetFileTime(...) => %s\n", bOk ? "TRUE" : "FALSE");
472 bOk = SetFileAttributesA(outfile, fi.dwFileAttributes);
473 trace("SetFileAttributesA(...) => %s\n", bOk ? "TRUE" : "FALSE");
474 }
475 #else
476 struct stat sbuf;
477 #ifdef HAVE_FUTIMENS
478 struct timespec times[2];
479 #else
480 struct timeval times[2];
481 #endif
482 int rv;
483
484 if ((rv = fstat(infd, &sbuf)) != 0) {
485 trace("fstat(%d, &sbuf) => %d (errno = %d)\n",
486 infd, rv, errno);
487 return;
488 }
489
490 /* copy file times. */
491 #ifdef HAVE_FUTIMENS
492 times[0].tv_sec = sbuf.st_atime;
493 times[0].tv_nsec = SNZ_ST_TIME_NSEC(sbuf, a);
494 times[1].tv_sec = sbuf.st_mtime;
495 times[1].tv_nsec = SNZ_ST_TIME_NSEC(sbuf, m);
496 rv = futimens(outfd, times);
497 trace("futimens(%d, [{%ld, %ld}, {%ld, %ld}]) => %d\n",
498 outfd, times[0].tv_sec, times[0].tv_nsec,
499 times[1].tv_sec, times[1].tv_nsec, rv);
500 #else /* HAVE_FUTIMENS */
501 times[0].tv_sec = sbuf.st_atime;
502 times[0].tv_usec = SNZ_ST_TIME_NSEC(sbuf, a) / 1000;
503 times[1].tv_sec = sbuf.st_mtime;
504 times[1].tv_usec = SNZ_ST_TIME_NSEC(sbuf, m) / 1000;
505 #ifdef HAVE_FUTIMES
506 rv = futimes(outfd, times);
507 trace("futimes(%d, [{%ld, %ld}, {%ld, %ld}]) => %d\n",
508 outfd, times[0].tv_sec, times[0].tv_usec,
509 times[1].tv_sec, times[1].tv_usec, rv);
510 #else /* HAVE_FUTIMES */
511 rv = utimes(outfile, times);
512 trace("utimes(\"%s\", [{%ld, %ld}, {%ld, %ld}]) => %d\n",
513 outfile, times[0].tv_sec, times[0].tv_usec,
514 times[1].tv_sec, times[1].tv_usec, rv);
515 #endif /* HAVE_FUTIMES */
516 #endif /* HAVE_FUTIMENS */
517
518 /* copy other attributes */
519 rv = fchown(outfd, sbuf.st_uid, sbuf.st_gid);
520 trace("fchown(%d, %d, %d) => %d\n",
521 outfd, sbuf.st_uid, sbuf.st_gid, rv);
522 rv = fchmod(outfd, sbuf.st_mode);
523 trace("fchmod(%d, 0%o) => %d\n",
524 outfd, sbuf.st_mode, rv);
525 #endif
526 }
527
show_usage(const char * progname,int exit_code)528 static void show_usage(const char *progname, int exit_code)
529 {
530 int idx;
531 int max_name_len;
532 int max_suffix_len;
533
534 fprintf(stderr,
535 PACKAGE_STRING "\n"
536 "\n"
537 " Usage: %s [option ...] [file ...]\n"
538 "\n"
539 " general options:\n"
540 " -c output to standard output, keep original files unchanged\n"
541 " -d decompress\n"
542 " -k keep (don't delete) input files\n"
543 " -t name file format name. see below. The default format is %s.\n"
544 " -h give this help\n"
545 "\n"
546 " raw_format option:\n"
547 " -s size size of input data when compressing.\n"
548 " The default value is the file size if available.\n"
549 "\n"
550 " tuning options:\n"
551 " -b num internal block size in bytes\n"
552 " -B num internal block size. 'num'-th power of two.\n"
553 " -R num size of read buffer in bytes\n"
554 " -W num size of write buffer in bytes\n"
555 " -T trace for debug\n"
556 "\n"
557 " supported formats:\n",
558 progname, DEFAULT_FORMAT.name);
559
560 max_name_len = strlen("name");
561 max_suffix_len = strlen("suffix");
562 for (idx = 0; idx < NUM_OF_STREAM_FORMATS; idx++) {
563 stream_format_t *fmt = stream_formats[idx];
564 if (max_name_len < strlen(fmt->name)) {
565 max_name_len = strlen(fmt->name);
566 }
567 if (max_suffix_len < strlen(fmt->suffix)) {
568 max_suffix_len = strlen(fmt->suffix);
569 }
570 }
571 fprintf(stderr, " %*s %*s URL\n", -max_name_len, "NAME", -max_suffix_len, "SUFFIX");
572 fprintf(stderr, " %*s %*s ---\n", -max_name_len, "----", -max_suffix_len, "------");
573 for (idx = 0; idx < NUM_OF_STREAM_FORMATS; idx++) {
574 stream_format_t *fmt = stream_formats[idx];
575 fprintf(stderr, " %*s %*s %s\n", -max_name_len, fmt->name, -max_suffix_len, fmt->suffix, fmt->url);
576 }
577 exit(exit_code);
578 }
579
/*
 * Allocate the two buffers of a work_buffer_t.
 *
 * wb         - work buffer to set up
 * block_size - size in bytes of the uncompressed-data buffer (wb->uc);
 *              the compressed-data buffer (wb->c) is sized with
 *              snappy_max_compressed_length(block_size)
 *
 * Returns 0.  When an allocation fails, "Out of memory" is printed and the
 * process exits with status 1.
 */
int work_buffer_init(work_buffer_t *wb, size_t block_size)
{
    wb->uclen = block_size;
    wb->uc = malloc(wb->uclen);
    if (wb->uc == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }
    wb->clen = snappy_max_compressed_length(wb->uclen);
    wb->c = malloc(wb->clen);
    if (wb->c == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }
    /* size_t is not unsigned long everywhere, so cast to match %lu */
    trace("max length of compressed data = %lu\n", (unsigned long)wb->clen);
    trace("max length of uncompressed data = %lu\n", (unsigned long)wb->uclen);
    return 0;
}
598
/*
 * Release both buffers of a work_buffer_t.
 *
 * The pointers are reset to NULL afterwards so that a stale work buffer
 * cannot be used or freed a second time by accident; the length fields are
 * left untouched.
 */
void work_buffer_free(work_buffer_t *wb)
{
    free(wb->c);
    wb->c = NULL;
    free(wb->uc);
    wb->uc = NULL;
}
604
/*
 * Change the size of one or both buffers of a work_buffer_t.
 *
 * clen  - new size of the compressed-data buffer, or 0 to leave it alone
 * uclen - new size of the uncompressed-data buffer, or 0 to leave it alone
 *
 * When a reallocation fails, "Out of memory" is printed and the process
 * exits with status 1.
 */
void work_buffer_resize(work_buffer_t *wb, size_t clen, size_t uclen)
{
    if (clen != 0) {
        void *resized = realloc(wb->c, clen);

        if (resized == NULL) {
            fprintf(stderr, "Out of memory\n");
            exit(1);
        }
        wb->c = resized;
        wb->clen = clen;
    }

    if (uclen != 0) {
        void *resized = realloc(wb->uc, uclen);

        if (resized == NULL) {
            fprintf(stderr, "Out of memory\n");
            exit(1);
        }
        wb->uc = resized;
        wb->uclen = uclen;
    }
}
624
/*
 * Write all 'count' bytes of 'buf' to the descriptor 'fd'.
 *
 * Short writes are continued and writes interrupted by a signal (EINTR)
 * are retried.
 *
 * Returns the number of bytes written, or -1 when write() fails with any
 * other error.  The return type is int, so totals above INT_MAX cannot be
 * reported exactly.
 */
int write_full(int fd, const void *buf, size_t count)
{
    const char *start = (const char *)buf;
    const char *ptr = start;

    while (count > 0) {
        /* Limit each request so that the result of write() always fits in
         * an int, whatever the width of its return type. */
        size_t chunk = (count > (size_t)INT_MAX) ? (size_t)INT_MAX : count;
        int rv = (int)write(fd, ptr, chunk);

        if (rv == -1) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        ptr += rv;
        count -= (size_t)rv;
    }
    return (int)(ptr - start);
}
642