1 /*  tabix.c -- Generic indexer for TAB-delimited genome position files.
2 
3     Copyright (C) 2009-2011 Broad Institute.
4     Copyright (C) 2010-2012, 2014-2017 Genome Research Ltd.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <unistd.h>
31 #include <string.h>
32 #include <getopt.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <errno.h>
36 #include "htslib/tbx.h"
37 #include "htslib/sam.h"
38 #include "htslib/vcf.h"
39 #include "htslib/kseq.h"
40 #include "htslib/bgzf.h"
41 #include "htslib/hts.h"
42 #include "htslib/regidx.h"
43 
44 typedef struct
45 {
46     char *regions_fname, *targets_fname;
47     int print_header, header_only;
48 }
49 args_t;
50 
error(const char * format,...)51 static void error(const char *format, ...)
52 {
53     va_list ap;
54     va_start(ap, format);
55     vfprintf(stderr, format, ap);
56     va_end(ap);
57     exit(EXIT_FAILURE);
58 }
59 
60 #define IS_GFF  (1<<0)
61 #define IS_BED  (1<<1)
62 #define IS_SAM  (1<<2)
63 #define IS_VCF  (1<<3)
64 #define IS_BCF  (1<<4)
65 #define IS_BAM  (1<<5)
66 #define IS_CRAM (1<<6)
67 #define IS_TXT  (IS_GFF|IS_BED|IS_SAM|IS_VCF)
68 
file_type(const char * fname)69 int file_type(const char *fname)
70 {
71     int l = strlen(fname);
72     int strcasecmp(const char *s1, const char *s2);
73     if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
74     else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
75     else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
76     else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
77     else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
78     else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
79     else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
80 
81     htsFile *fp = hts_open(fname,"r");
82     enum htsExactFormat format = fp->format.format;
83     hts_close(fp);
84     if ( format == bcf ) return IS_BCF;
85     if ( format == bam ) return IS_BAM;
86     if ( format == cram ) return IS_CRAM;
87     if ( format == vcf ) return IS_VCF;
88 
89     return 0;
90 }
91 
parse_regions(char * regions_fname,char ** argv,int argc,int * nregs)92 static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
93 {
94     kstring_t str = {0,0,0};
95     int iseq = 0, ireg = 0;
96     char **regs = NULL;
97     *nregs = argc;
98 
99     if ( regions_fname )
100     {
101         // improve me: this is a too heavy machinery for parsing regions...
102 
103         regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
104         if ( !idx ) error("Could not read %s\n", regions_fname);
105 
106         (*nregs) += regidx_nregs(idx);
107         regs = (char**) malloc(sizeof(char*)*(*nregs));
108 
109         int nseq;
110         char **seqs = regidx_seq_names(idx, &nseq);
111         for (iseq=0; iseq<nseq; iseq++)
112         {
113             regitr_t itr;
114             regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
115             while ( itr.i < itr.n )
116             {
117                 str.l = 0;
118                 ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
119                 regs[ireg++] = strdup(str.s);
120                 itr.i++;
121             }
122         }
123         regidx_destroy(idx);
124     }
125     free(str.s);
126 
127     if ( !ireg )
128     {
129         if ( argc )
130             regs = (char**) malloc(sizeof(char*)*argc);
131         else
132         {
133             regs = (char**) malloc(sizeof(char*));
134             regs[0] = strdup(".");
135             *nregs = 1;
136         }
137     }
138 
139     for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
140     return regs;
141 }
query_regions(args_t * args,char * fname,char ** regs,int nregs)142 static int query_regions(args_t *args, char *fname, char **regs, int nregs)
143 {
144     int i;
145     htsFile *fp = hts_open(fname,"r");
146     if ( !fp ) error("Could not read %s\n", fname);
147     enum htsExactFormat format = hts_get_format(fp)->format;
148 
149     regidx_t *reg_idx = NULL;
150     if ( args->targets_fname )
151     {
152         reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
153         if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
154     }
155 
156     if ( format == bcf )
157     {
158         htsFile *out = hts_open("-","w");
159         if ( !out ) error("Could not open stdout\n", fname);
160         hts_idx_t *idx = bcf_index_load(fname);
161         if ( !idx ) error("Could not load .csi index of %s\n", fname);
162         bcf_hdr_t *hdr = bcf_hdr_read(fp);
163         if ( !hdr ) error("Could not read the header: %s\n", fname);
164         if ( args->print_header )
165             bcf_hdr_write(out,hdr);
166         if ( !args->header_only )
167         {
168             bcf1_t *rec = bcf_init();
169             for (i=0; i<nregs; i++)
170             {
171                 hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
172                 while ( bcf_itr_next(fp, itr, rec) >=0 )
173                 {
174                     if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
175                     bcf_write(out,hdr,rec);
176                 }
177                 tbx_itr_destroy(itr);
178             }
179             bcf_destroy(rec);
180         }
181         if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
182         bcf_hdr_destroy(hdr);
183         hts_idx_destroy(idx);
184     }
185     else if ( format==vcf || format==sam || format==unknown_format )
186     {
187         tbx_t *tbx = tbx_index_load(fname);
188         if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
189         kstring_t str = {0,0,0};
190         if ( args->print_header )
191         {
192             while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
193             {
194                 if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
195                 puts(str.s);
196             }
197         }
198         if ( !args->header_only )
199         {
200             int nseq;
201             const char **seq = NULL;
202             if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
203             for (i=0; i<nregs; i++)
204             {
205                 hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
206                 if ( !itr ) continue;
207                 while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
208                 {
209                     if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
210                     puts(str.s);
211                 }
212                 tbx_itr_destroy(itr);
213             }
214             free(seq);
215         }
216         free(str.s);
217         tbx_destroy(tbx);
218     }
219     else if ( format==bam )
220         error("Please use \"samtools view\" for querying BAM files.\n");
221 
222     if ( reg_idx ) regidx_destroy(reg_idx);
223     if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
224 
225     for (i=0; i<nregs; i++) free(regs[i]);
226     free(regs);
227     return 0;
228 }
query_chroms(char * fname)229 static int query_chroms(char *fname)
230 {
231     const char **seq;
232     int i, nseq, ftype = file_type(fname);
233     if ( ftype & IS_TXT || !ftype )
234     {
235         tbx_t *tbx = tbx_index_load(fname);
236         if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
237         seq = tbx_seqnames(tbx, &nseq);
238         for (i=0; i<nseq; i++)
239             printf("%s\n", seq[i]);
240         free(seq);
241         tbx_destroy(tbx);
242     }
243     else if ( ftype==IS_BCF )
244     {
245         htsFile *fp = hts_open(fname,"r");
246         if ( !fp ) error("Could not read %s\n", fname);
247         bcf_hdr_t *hdr = bcf_hdr_read(fp);
248         if ( !hdr ) error("Could not read the header: %s\n", fname);
249         hts_close(fp);
250         hts_idx_t *idx = bcf_index_load(fname);
251         if ( !idx ) error("Could not load .csi index of %s\n", fname);
252         seq = bcf_index_seqnames(idx, hdr, &nseq);
253         for (i=0; i<nseq; i++)
254             printf("%s\n", seq[i]);
255         free(seq);
256         bcf_hdr_destroy(hdr);
257         hts_idx_destroy(idx);
258     }
259     else if ( ftype==IS_BAM )   // todo: BAM
260         error("BAM: todo\n");
261     return 0;
262 }
263 
reheader_file(const char * fname,const char * header,int ftype,tbx_conf_t * conf)264 int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
265 {
266     if ( ftype & IS_TXT || !ftype )
267     {
268         BGZF *fp = bgzf_open(fname,"r");
269         if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
270 
271         char *buffer = fp->uncompressed_block;
272         int skip_until = 0;
273 
274         // Skip the header: find out the position of the data block
275         if ( buffer[0]==conf->meta_char )
276         {
277             skip_until = 1;
278             while (1)
279             {
280                 if ( buffer[skip_until]=='\n' )
281                 {
282                     skip_until++;
283                     if ( skip_until>=fp->block_length )
284                     {
285                         if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
286                         skip_until = 0;
287                     }
288                     // The header has finished
289                     if ( buffer[skip_until]!=conf->meta_char ) break;
290                 }
291                 skip_until++;
292                 if ( skip_until>=fp->block_length )
293                 {
294                     if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
295                     skip_until = 0;
296                 }
297             }
298         }
299 
300         // Output the new header
301         FILE *hdr  = fopen(header,"r");
302         if ( !hdr ) error("%s: %s", header,strerror(errno));
303         const size_t page_size = 32768;
304         char *buf = malloc(page_size);
305         BGZF *bgzf_out = bgzf_open("-", "w");
306         ssize_t nread;
307         while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
308         {
309             if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
310             if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
311         }
312         if ( fclose(hdr) ) error("close failed: %s\n", header);
313 
314         // Output all remainig data read with the header block
315         if ( fp->block_length - skip_until > 0 )
316         {
317             if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
318         }
319         if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
320 
321         while (1)
322         {
323             nread = bgzf_raw_read(fp, buf, page_size);
324             if ( nread<=0 ) break;
325 
326             int count = bgzf_raw_write(bgzf_out, buf, nread);
327             if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
328         }
329         if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
330         if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
331         free(buf);
332     }
333     else
334         error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
335     return 0;
336 }
337 
usage(void)338 static int usage(void)
339 {
340     fprintf(stderr, "\n");
341     fprintf(stderr, "Version: %s\n", hts_version());
342     fprintf(stderr, "Usage:   tabix [OPTIONS] [FILE] [REGION [...]]\n");
343     fprintf(stderr, "\n");
344     fprintf(stderr, "Indexing Options:\n");
345     fprintf(stderr, "   -0, --zero-based           coordinates are zero-based\n");
346     fprintf(stderr, "   -b, --begin INT            column number for region start [4]\n");
347     fprintf(stderr, "   -c, --comment CHAR         skip comment lines starting with CHAR [null]\n");
348     fprintf(stderr, "   -C, --csi                  generate CSI index for VCF (default is TBI)\n");
349     fprintf(stderr, "   -e, --end INT              column number for region end (if no end, set INT to -b) [5]\n");
350     fprintf(stderr, "   -f, --force                overwrite existing index without asking\n");
351     fprintf(stderr, "   -m, --min-shift INT        set minimal interval size for CSI indices to 2^INT [14]\n");
352     fprintf(stderr, "   -p, --preset STR           gff, bed, sam, vcf\n");
353     fprintf(stderr, "   -s, --sequence INT         column number for sequence names (suppressed by -p) [1]\n");
354     fprintf(stderr, "   -S, --skip-lines INT       skip first INT lines [0]\n");
355     fprintf(stderr, "\n");
356     fprintf(stderr, "Querying and other options:\n");
357     fprintf(stderr, "   -h, --print-header         print also the header lines\n");
358     fprintf(stderr, "   -H, --only-header          print only the header lines\n");
359     fprintf(stderr, "   -l, --list-chroms          list chromosome names\n");
360     fprintf(stderr, "   -r, --reheader FILE        replace the header with the content of FILE\n");
361     fprintf(stderr, "   -R, --regions FILE         restrict to regions listed in the file\n");
362     fprintf(stderr, "   -T, --targets FILE         similar to -R but streams rather than index-jumps\n");
363     fprintf(stderr, "\n");
364     return 1;
365 }
366 
main(int argc,char * argv[])367 int main(int argc, char *argv[])
368 {
369     int c, detect = 1, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
370     tbx_conf_t conf = tbx_conf_gff;
371     char *reheader = NULL;
372     args_t args;
373     memset(&args,0,sizeof(args_t));
374 
375     static const struct option loptions[] =
376     {
377         {"help", no_argument, NULL, 2},
378         {"regions", required_argument, NULL, 'R'},
379         {"targets", required_argument, NULL, 'T'},
380         {"csi", no_argument, NULL, 'C'},
381         {"zero-based", no_argument, NULL, '0'},
382         {"print-header", no_argument, NULL, 'h'},
383         {"only-header", no_argument, NULL, 'H'},
384         {"begin", required_argument, NULL, 'b'},
385         {"comment", required_argument, NULL, 'c'},
386         {"end", required_argument, NULL, 'e'},
387         {"force", no_argument, NULL, 'f'},
388         {"preset", required_argument, NULL, 'p'},
389         {"sequence", required_argument, NULL, 's'},
390         {"skip-lines", required_argument, NULL, 'S'},
391         {"list-chroms", no_argument, NULL, 'l'},
392         {"reheader", required_argument, NULL, 'r'},
393         {"version", no_argument, NULL, 1},
394         {NULL, 0, NULL, 0}
395     };
396 
397     char *tmp;
398     while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
399     {
400         switch (c)
401         {
402             case 'R': args.regions_fname = optarg; break;
403             case 'T': args.targets_fname = optarg; break;
404             case 'C': do_csi = 1; break;
405             case 'r': reheader = optarg; break;
406             case 'h': args.print_header = 1; break;
407             case 'H': args.print_header = 1; args.header_only = 1; break;
408             case 'l': list_chroms = 1; break;
409             case '0': conf.preset |= TBX_UCSC; detect = 0; break;
410             case 'b':
411                 conf.bc = strtol(optarg,&tmp,10);
412                 if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
413                 detect = 0;
414                 break;
415             case 'e':
416                 conf.ec = strtol(optarg,&tmp,10);
417                 if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
418                 detect = 0;
419                 break;
420             case 'c': conf.meta_char = *optarg; detect = 0; break;
421             case 'f': is_force = 1; break;
422             case 'm':
423                 min_shift = strtol(optarg,&tmp,10);
424                 if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
425                 break;
426             case 'p':
427                 detect = 0;
428                 if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
429                 else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
430                 else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
431                 else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
432                 else if (strcmp(optarg, "bcf") == 0) detect = 1; // bcf is autodetected, preset is not needed
433                 else if (strcmp(optarg, "bam") == 0) detect = 1; // same as bcf
434                 else error("The preset string not recognised: '%s'\n", optarg);
435                 break;
436             case 's':
437                 conf.sc = strtol(optarg,&tmp,10);
438                 if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
439                 detect = 0;
440                 break;
441             case 'S':
442                 conf.line_skip = strtol(optarg,&tmp,10);
443                 if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
444                 detect = 0;
445                 break;
446             case 1:
447                 printf(
448 "tabix (htslib) %s\n"
449 "Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
450                 return EXIT_SUCCESS;
451             default: return usage();
452         }
453     }
454 
455     if ( optind==argc ) return usage();
456 
457     if ( list_chroms )
458         return query_chroms(argv[optind]);
459 
460     if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
461     {
462         int nregs = 0;
463         char **regs = NULL;
464         if ( !args.header_only )
465             regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
466         return query_regions(&args, argv[optind], regs, nregs);
467     }
468 
469     char *fname = argv[optind];
470     int ftype = file_type(fname);
471     if ( detect )  // no preset given
472     {
473         if ( ftype==IS_GFF ) conf = tbx_conf_gff;
474         else if ( ftype==IS_BED ) conf = tbx_conf_bed;
475         else if ( ftype==IS_SAM ) conf = tbx_conf_sam;
476         else if ( ftype==IS_VCF )
477         {
478             conf = tbx_conf_vcf;
479             if ( !min_shift && do_csi ) min_shift = 14;
480         }
481         else if ( ftype==IS_BCF )
482         {
483             if ( !min_shift ) min_shift = 14;
484         }
485         else if ( ftype==IS_BAM )
486         {
487             if ( !min_shift ) min_shift = 14;
488         }
489     }
490     if ( do_csi )
491     {
492         if ( !min_shift ) min_shift = 14;
493         min_shift *= do_csi;  // positive for CSIv2, negative for CSIv1
494     }
495     if ( min_shift!=0 && !do_csi ) do_csi = 1;
496 
497     if ( reheader )
498         return reheader_file(fname, reheader, ftype, &conf);
499 
500     char *suffix = ".tbi";
501     if ( do_csi ) suffix = ".csi";
502     else if ( ftype==IS_BAM ) suffix = ".bai";
503     else if ( ftype==IS_CRAM ) suffix = ".crai";
504 
505     char *idx_fname = calloc(strlen(fname) + 5, 1);
506     strcat(strcpy(idx_fname, fname), suffix);
507 
508     struct stat stat_tbi, stat_file;
509     if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
510     {
511         // Before complaining about existing index, check if the VCF file isn't
512         // newer. This is a common source of errors, people tend not to notice
513         // that tabix failed
514         stat(fname, &stat_file);
515         if ( stat_file.st_mtime <= stat_tbi.st_mtime )
516             error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
517     }
518     free(idx_fname);
519 
520     if ( ftype==IS_CRAM )
521     {
522         if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
523         return 0;
524     }
525     else if ( do_csi )
526     {
527         if ( ftype==IS_BCF )
528         {
529             if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
530             return 0;
531         }
532         if ( ftype==IS_BAM )
533         {
534             if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
535             return 0;
536         }
537         if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
538         return 0;
539     }
540     else    // TBI index
541     {
542         if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
543         return 0;
544     }
545     return 0;
546 }
547