1 /* bgzip.c -- Block compression/decompression utility.
2 
3    Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology
4    Copyright (C) 2010, 2013-2017 Genome Research Ltd.
5 
6    Permission is hereby granted, free of charge, to any person obtaining a copy
7    of this software and associated documentation files (the "Software"), to deal
8    in the Software without restriction, including without limitation the rights
9    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10    copies of the Software, and to permit persons to whom the Software is
11    furnished to do so, subject to the following conditions:
12 
13    The above copyright notices and this permission notice shall be included in
14    all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22    THE SOFTWARE.
23 */
24 
25 #include <config.h>
26 
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <fcntl.h>
31 #include <unistd.h>
32 #include <errno.h>
33 #include <stdarg.h>
34 #include <getopt.h>
35 #include <sys/stat.h>
36 #include "htslib/bgzf.h"
37 #include "htslib/hts.h"
38 
/* Size in bytes of the buffer used to shuttle data between the plain and
   the BGZF side (64 KiB). */
enum { WINDOW_SIZE = 64 * 1024 };
40 
/* Print a printf-style message on stderr, then terminate the process with
   EXIT_FAILURE.  This function does not return. */
static void error(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    (void) vfprintf(stderr, format, args);
    va_end(args);

    exit(EXIT_FAILURE);
}
49 
/* Ask the user whether the existing file `fn` may be overwritten.

   The question is only asked when standard input is a terminal; otherwise
   the answer is "no".  Returns 1 if the first character typed is 'y' or
   'Y', 0 in every other case.  errno is restored to the value it had on
   entry, so the caller can still report the error that led here. */
static int confirm_overwrite(const char *fn)
{
    const int errno_on_entry = errno;
    int overwrite = 0;

    if (isatty(STDIN_FILENO)) {
        char reply;

        fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
        if (scanf("%c", &reply) == 1) {
            switch (reply) {
            case 'y':
            case 'Y':
                overwrite = 1;
                break;
            default:
                break;
            }
        }
    }

    errno = errno_on_entry;
    return overwrite;
}
64 
/* Print the version and the option summary on stderr.
   Always returns 1; main() returns this value as the exit status. */
static int bgzip_main_usage(void)
{
    static const char *const option_lines[] = {
        "   -b, --offset INT        decompress at virtual file pointer (0-based uncompressed offset)\n",
        "   -c, --stdout            write on standard output, keep original files unchanged\n",
        "   -d, --decompress        decompress\n",
        "   -f, --force             overwrite files without asking\n",
        "   -h, --help              give this help\n",
        "   -i, --index             compress and create BGZF index\n",
        "   -I, --index-name FILE   name of BGZF index file [file.gz.gzi]\n",
        "   -r, --reindex           (re)index compressed file\n",
        "   -g, --rebgzip           use an index file to bgzip a file\n",
        "   -s, --size INT          decompress INT bytes (uncompressed size)\n",
        "   -@, --threads INT       number of compression threads to use [1]\n",
    };
    size_t i;

    fprintf(stderr, "\nVersion: %s\n", hts_version());
    fputs("Usage:   bgzip [OPTIONS] [FILE] ...\n", stderr);
    fputs("Options:\n", stderr);
    for (i = 0; i < sizeof option_lines / sizeof option_lines[0]; i++)
        fputs(option_lines[i], stderr);
    fputs("\n", stderr);

    return 1;
}
85 
main(int argc,char ** argv)86 int main(int argc, char **argv)
87 {
88     int c, compress, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0;
89     BGZF *fp;
90     void *buffer;
91     long start, end, size;
92     char *index_fname = NULL;
93     int threads = 1;
94 
95     static const struct option loptions[] =
96     {
97         {"help", no_argument, NULL, 'h'},
98         {"offset", required_argument, NULL, 'b'},
99         {"stdout", no_argument, NULL, 'c'},
100         {"decompress", no_argument, NULL, 'd'},
101         {"force", no_argument, NULL, 'f'},
102         {"index", no_argument, NULL, 'i'},
103         {"index-name", required_argument, NULL, 'I'},
104         {"reindex", no_argument, NULL, 'r'},
105         {"rebgzip",no_argument,NULL,'g'},
106         {"size", required_argument, NULL, 's'},
107         {"threads", required_argument, NULL, '@'},
108         {"version", no_argument, NULL, 1},
109         {NULL, 0, NULL, 0}
110     };
111 
112     compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
113     while((c  = getopt_long(argc, argv, "cdh?fb:@:s:iI:gr",loptions,NULL)) >= 0){
114         switch(c){
115         case 'd': compress = 0; break;
116         case 'c': pstdout = 1; break;
117         case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
118         case 's': size = atol(optarg); pstdout = 1; break;
119         case 'f': is_forced = 1; break;
120         case 'i': index = 1; break;
121         case 'I': index_fname = optarg; break;
122         case 'g': rebgzip = 1; break;
123         case 'r': reindex = 1; compress = 0; break;
124         case '@': threads = atoi(optarg); break;
125         case 1:
126             printf(
127 "bgzip (htslib) %s\n"
128 "Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
129             return EXIT_SUCCESS;
130         case 'h':
131         case '?': return bgzip_main_usage();
132         }
133     }
134     if (size >= 0) end = start + size;
135     if (end >= 0 && end < start) {
136         fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
137         return 1;
138     }
139     if (compress == 1) {
140         struct stat sbuf;
141         int f_src = fileno(stdin);
142 
143         if ( argc>optind )
144         {
145             if ( stat(argv[optind],&sbuf)<0 )
146             {
147                 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
148                 return 1;
149             }
150 
151             if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
152                 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
153                 return 1;
154             }
155 
156             if (pstdout)
157                 fp = bgzf_open("-", "w");
158             else
159             {
160                 char *name = malloc(strlen(argv[optind]) + 5);
161                 strcpy(name, argv[optind]);
162                 strcat(name, ".gz");
163                 fp = bgzf_open(name, is_forced? "w" : "wx");
164                 if (fp == NULL && errno == EEXIST && confirm_overwrite(name))
165                     fp = bgzf_open(name, "w");
166                 if (fp == NULL) {
167                     fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
168                     free(name);
169                     return 1;
170                 }
171                 free(name);
172             }
173         }
174         else if (!pstdout && isatty(fileno((FILE *)stdout)) )
175             return bgzip_main_usage();
176         else if ( index && !index_fname )
177         {
178             fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
179             return 1;
180         }
181         else
182             fp = bgzf_open("-", "w");
183 
184         if ( index && rebgzip )
185         {
186             fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n");
187             return 1;
188         }
189 
190         if ( rebgzip && !index_fname )
191         {
192             fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
193             return 1;
194         }
195 
196         if (threads > 1)
197             bgzf_mt(fp, threads, 256);
198 
199         if ( index ) bgzf_index_build_init(fp);
200         buffer = malloc(WINDOW_SIZE);
201         if (rebgzip){
202             if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
203 
204             while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
205                 if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
206         }
207         else {
208             while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
209                 if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
210         }
211         if ( index )
212         {
213             if (index_fname) {
214                 if (bgzf_index_dump(fp, index_fname, NULL) < 0)
215                     error("Could not write index to '%s'\n", index_fname);
216             } else {
217                 if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
218                     error("Could not write index to '%s.gz.gzi'", argv[optind]);
219             }
220         }
221         if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
222         if (argc > optind && !pstdout) unlink(argv[optind]);
223         free(buffer);
224         close(f_src);
225         return 0;
226     }
227     else if ( reindex )
228     {
229         if ( argc>optind )
230         {
231             fp = bgzf_open(argv[optind], "r");
232             if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
233         }
234         else
235         {
236             if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
237             fp = bgzf_open("-", "r");
238             if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
239         }
240 
241         buffer = malloc(BGZF_BLOCK_SIZE);
242         bgzf_index_build_init(fp);
243         int ret;
244         while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
245         free(buffer);
246         if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
247 
248         if ( index_fname ) {
249             if (bgzf_index_dump(fp, index_fname, NULL) < 0)
250                 error("Could not write index to '%s'\n", index_fname);
251         } else {
252             if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0)
253                 error("Could not write index to '%s.gzi'\n", argv[optind]);
254         }
255 
256         if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
257         return 0;
258     }
259     else
260     {
261         struct stat sbuf;
262         int f_dst;
263 
264         if ( argc>optind )
265         {
266             if ( stat(argv[optind],&sbuf)<0 )
267             {
268                 fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
269                 return 1;
270             }
271             char *name;
272             int len = strlen(argv[optind]);
273             if ( strcmp(argv[optind]+len-3,".gz") )
274             {
275                 fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
276                 return 1;
277             }
278             fp = bgzf_open(argv[optind], "r");
279             if (fp == NULL) {
280                 fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
281                 return 1;
282             }
283 
284             if (pstdout) {
285                 f_dst = fileno(stdout);
286             }
287             else {
288                 const int wrflags = O_WRONLY | O_CREAT | O_TRUNC;
289                 name = strdup(argv[optind]);
290                 name[strlen(name) - 3] = '\0';
291                 f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666);
292                 if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name))
293                     f_dst = open(name, wrflags, 0666);
294                 if (f_dst < 0) {
295                     fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
296                     free(name);
297                     return 1;
298                 }
299                 free(name);
300             }
301         }
302         else if (!pstdout && isatty(fileno((FILE *)stdin)) )
303             return bgzip_main_usage();
304         else
305         {
306             f_dst = fileno(stdout);
307             fp = bgzf_open("-", "r");
308             if (fp == NULL) {
309                 fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
310                 return 1;
311             }
312         }
313         if (threads > 1)
314             bgzf_mt(fp, threads, 256);
315 
316         buffer = malloc(WINDOW_SIZE);
317         if ( start>0 )
318         {
319             if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
320             if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
321         }
322         while (1) {
323             if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
324             else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
325             if (c == 0) break;
326             if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
327             start += c;
328             if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
329             if (end >= 0 && start >= end) break;
330         }
331         free(buffer);
332         if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
333         if (!pstdout) unlink(argv[optind]);
334         return 0;
335     }
336 }
337