1 /*  bam_tview.c -- tview subcommand.
2 
3     Copyright (C) 2008-2016, 2019-2020 Genome Research Ltd.
4     Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
5 
6     Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notices and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 #include <config.h>
27 
28 #include <regex.h>
29 #include <assert.h>
30 #include "bam_tview.h"
31 #include <htslib/faidx.h>
32 #include <htslib/sam.h>
33 #include <htslib/bgzf.h>
34 #include "samtools.h"
35 #include "sam_opts.h"
36 
37 // Threshold where spacing of numbers on the scale line changes from 10 to 20
38 // to stop them running into each other.
39 #define TEN_DIGITS 1000000000
40 
destroy_rg_hash(khash_t (kh_rg)* rg_hash)41 static void destroy_rg_hash(khash_t(kh_rg)* rg_hash)
42 {
43     khiter_t k;
44 
45     if (!rg_hash)
46         return;
47 
48     for (k = 0; k < kh_end(rg_hash); k++) {
49         if (kh_exist(rg_hash, k))
50             free((char *) kh_key(rg_hash, k));
51     }
52     kh_destroy(kh_rg, rg_hash);
53 }
54 
55 static
khash_t(kh_rg)56 khash_t(kh_rg)* get_rg_sample(sam_hdr_t* header, const char* sample)
57 {
58     int n_rg, i;
59     kstring_t id_val = KS_INITIALIZE, sm_val = KS_INITIALIZE;
60     khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
61     if (!rg_hash) return NULL;
62 
63     // There may be more than one @RG with a given SM:, so iterate through them
64     n_rg = sam_hdr_count_lines(header, "RG");
65     if (n_rg < 0) {
66         print_error("tview", "couldn't parse header");
67         goto fail;
68     }
69 
70     for (i = 0; i < n_rg; i++) {
71         // Try ID: first
72         int r = sam_hdr_find_tag_pos(header, "RG", i, "ID", &id_val);
73         if (r < -1) goto memfail;
74         if (r == -1) continue;  // Shouldn't happen as ID is compulsory
75 
76         if (strcmp(sample, id_val.s) != 0) {
77             // No match, try SM:
78             r = sam_hdr_find_tag_pos(header, "RG", i, "SM", &sm_val);
79             if (r < -1) goto memfail;
80             if (r < 0 || strcmp(sample, sm_val.s) != 0)
81                 continue;
82         }
83 
84         // Found a match, add ID to rg_hash
85         kh_put(kh_rg, rg_hash, id_val.s, &r);
86         if (r < 0) goto memfail;
87         ks_release(&id_val); // Now owned by hash table
88     }
89 
90     ks_free(&id_val);
91     ks_free(&sm_val);
92     return rg_hash;
93 
94  memfail:
95     perror("tview");
96  fail:
97     ks_free(&id_val);
98     ks_free(&sm_val);
99     destroy_rg_hash(rg_hash);
100     return NULL;
101 }
102 
base_tv_init(tview_t * tv,const char * fn,const char * fn_fa,const char * fn_idx,const char * samples,const htsFormat * fmt)103 int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *fn_idx,
104                  const char *samples, const htsFormat *fmt)
105 {
106     assert(tv!=NULL);
107     assert(fn!=NULL);
108     tv->mrow = 24; tv->mcol = 80;
109     tv->color_for = TV_COLOR_MAPQ;
110     tv->is_dot = 1;
111 
112     tv->fp = sam_open_format(fn, "r", fmt);
113     if(tv->fp == NULL)
114     {
115         print_error_errno("tview", "can't open \"%s\"", fn);
116         exit(EXIT_FAILURE);
117     }
118     // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
119     assert(tv->fp);
120 
121     tv->header = sam_hdr_read(tv->fp);
122     if(tv->header == NULL)
123     {
124         print_error("tview", "cannot read \"%s\"", fn);
125         exit(EXIT_FAILURE);
126     }
127     // If index filename has not been specfied, look in BAM folder
128     if (fn_idx != NULL) {
129         tv->idx = sam_index_load2(tv->fp, fn, fn_idx);
130     } else {
131         tv->idx = sam_index_load(tv->fp, fn);
132     }
133 
134     if (tv->idx == NULL)
135     {
136         print_error("tview", "cannot read index for \"%s\"", fn);
137         exit(EXIT_FAILURE);
138     }
139     tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
140     if (fn_fa) tv->fai = fai_load(fn_fa);
141     tv->bca = bcf_call_init(0.83, 13);
142     tv->ins = 1;
143 
144     // If the user has asked for specific samples find out create a list of readgroups make up these samples
145     if ( samples )
146     {
147         tv->rg_hash = get_rg_sample(tv->header, samples); // Init the list of rg's
148         if (kh_size(tv->rg_hash) == 0) {
149             print_error("tview",
150                         "The sample or read group \"%s\" not present.",
151                         samples);
152             exit(EXIT_FAILURE);
153         }
154     }
155 
156     return 0;
157 }
158 
159 
base_tv_destroy(tview_t * tv)160 void base_tv_destroy(tview_t* tv)
161 {
162     bam_lplbuf_destroy(tv->lplbuf);
163     bcf_call_destroy(tv->bca);
164     hts_idx_destroy(tv->idx);
165     if (tv->fai) fai_destroy(tv->fai);
166     free(tv->ref);
167     sam_hdr_destroy(tv->header);
168     destroy_rg_hash(tv->rg_hash);
169     sam_close(tv->fp);
170 }
171 
172 
tv_pl_func(uint32_t tid,hts_pos_t pos,int n,const bam_pileup1_t * pl,void * data)173 int tv_pl_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data)
174 {
175     tview_t *tv = (tview_t*)data;
176     hts_pos_t cp;
177     int i, j, c, rb, attr, max_ins = 0, interval;
178     uint32_t call = 0;
179     kstring_t ks = KS_INITIALIZE;
180     if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
181     // print reference
182     rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
183     for (cp = tv->last_pos + 1; cp < pos; ++cp) {
184         interval = cp < TEN_DIGITS ? 10 : 20;
185         if (cp%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, cp+1);
186         c = tv->ref? tv->ref[cp - tv->left_pos] : 'N';
187         tv->my_mvaddch(tv,1, tv->ccol++, c);
188     }
189     interval = pos < TEN_DIGITS ? 10 : 20;
190     if (pos%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, pos+1);
191     { // call consensus
192         bcf_callret1_t bcr;
193         memset(&bcr, 0, sizeof bcr);
194         int qsum[4], a1, a2, tmp;
195         double p[3], prior = 30;
196         bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
197         for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
198         for (i = 1; i < 4; ++i) // insertion sort
199             for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
200                 tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
201         a1 = qsum[0]&3; a2 = qsum[1]&3;
202         p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
203         if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
204         if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
205         if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
206         else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
207         else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
208     }
209     attr = tv->my_underline(tv);
210     c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
211     i = (call&0xffff)/10+1;
212     if (i > 4) i = 4;
213     attr |= tv->my_colorpair(tv,i);
214     if (c == toupper(rb)) c = '.';
215     tv->my_attron(tv,attr);
216     tv->my_mvaddch(tv,2, tv->ccol, c);
217     tv->my_attroff(tv,attr);
218     if(tv->ins) {
219         // calculate maximum insert
220         for (i = 0; i < n; ++i) {
221             const bam_pileup1_t *p = pl + i;
222             int len = bam_plp_insertion(p, &ks, NULL);
223             if (len < 0) {
224                 print_error("tview", "Memory allocation failure.");
225                 exit(1);
226             }
227             if (max_ins < len) max_ins = len;
228         }
229     }
230     // core loop
231     for (j = 0; j <= max_ins; ++j) {
232         for (i = 0; i < n; ++i) {
233             const bam_pileup1_t *p = pl + i;
234             int row = TV_MIN_ALNROW + p->level - tv->row_shift;
235             if (j == 0) {
236                 if (!p->is_del) {
237                     if (tv->base_for == TV_BASE_COLOR_SPACE &&
238                             (c = bam_aux_getCSi(p->b, p->qpos))) {
239                         // assume that if we found one color, we will be able to get the color error
240                         if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
241                     } else {
242                         if (tv->show_name) {
243                             char *name = bam_get_qname(p->b);
244                             c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
245                         } else {
246                             c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
247                             if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
248                         }
249                     }
250                 } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
251             } else { // padding
252                 int len = bam_plp_insertion(p, &ks, NULL);
253                 if (len < 0) {
254                     print_error("tview", "Memory allocation failure.");
255                     exit(1);
256                 }
257 
258                 if (j > len) c = '*';
259                 else { // insertion
260                     if (tv->base_for ==  TV_BASE_NUCL) {
261                         if (tv->show_name) {
262                             char *name = bam_get_qname(p->b);
263                             c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
264                         } else {
265                             c = ks.s[j-1];
266                             if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
267                         }
268                     } else {
269                         c = bam_aux_getCSi(p->b, p->qpos + j);
270                         if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
271                     }
272                 }
273             }
274             if (row > TV_MIN_ALNROW && row < tv->mrow) {
275                 int x;
276                 attr = 0;
277                 if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
278                         || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
279                 if (tv->color_for == TV_COLOR_BASEQ) {
280                     x = bam_get_qual(p->b)[p->qpos]/10 + 1;
281                     if (x > 4) x = 4;
282                     attr |= tv->my_colorpair(tv,x);
283                 } else if (tv->color_for == TV_COLOR_MAPQ) {
284                     x = p->b->core.qual/10 + 1;
285                     if (x > 4) x = 4;
286                     attr |= tv->my_colorpair(tv,x);
287                 } else if (tv->color_for == TV_COLOR_NUCL) {
288                     x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
289                     attr |= tv->my_colorpair(tv,x);
290                 } else if(tv->color_for == TV_COLOR_COL) {
291                     x = 0;
292                     switch(bam_aux_getCSi(p->b, p->qpos)) {
293                         case '0': x = 0; break;
294                         case '1': x = 1; break;
295                         case '2': x = 2; break;
296                         case '3': x = 3; break;
297                         case '4': x = 4; break;
298                         default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
299                     }
300                     x+=5;
301                     attr |= tv->my_colorpair(tv,x);
302                 } else if(tv->color_for == TV_COLOR_COLQ) {
303                     x = bam_aux_getCQi(p->b, p->qpos);
304                     if(0 == x) x = bam_get_qual(p->b)[p->qpos];
305                     x = x/10 + 1;
306                     if (x > 4) x = 4;
307                     attr |= tv->my_colorpair(tv,x);
308                 }
309                 tv->my_attron(tv,attr);
310                 tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
311                 tv->my_attroff(tv,attr);
312             }
313         }
314         c = j? '*' : rb;
315         if (c == '*') {
316             attr = tv->my_colorpair(tv,8);
317             tv->my_attron(tv,attr);
318             tv->my_mvaddch(tv,1, tv->ccol++, c);
319             tv->my_attroff(tv,attr);
320         } else tv->my_mvaddch(tv,1, tv->ccol++, c);
321     }
322     tv->last_pos = pos;
323     ks_free(&ks);
324     return 0;
325 }
326 
327 
328 
329 
tv_push_aln(const bam1_t * b,tview_t * tv)330 static int tv_push_aln(const bam1_t *b, tview_t *tv)
331 {
332     /* If we are restricted to specific readgroups check RG is in the list */
333     if ( tv->rg_hash )
334     {
335         const uint8_t *rg = bam_aux_get(b, "RG");
336         if ( !rg ) return 0; // If we don't have an RG tag exclude read
337         khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
338         if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
339     }
340     if (tv->no_skip) {
341         uint32_t *cigar = bam_get_cigar(b); // this is cheating...
342         int i;
343         for (i = 0; i <b->core.n_cigar; ++i) {
344             if ((cigar[i]&0xf) == BAM_CREF_SKIP)
345                 cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
346         }
347     }
348     bam_lplbuf_push(b, tv->lplbuf);
349     return 0;
350 }
351 
base_draw_aln(tview_t * tv,int tid,hts_pos_t pos)352 int base_draw_aln(tview_t *tv, int tid, hts_pos_t pos)
353 {
354     int ret;
355     assert(tv!=NULL);
356     // reset
357     tv->my_clear(tv);
358     tv->curr_tid = tid; tv->left_pos = pos;
359     tv->last_pos = tv->left_pos - 1;
360     tv->ccol = 0;
361     // print ref and consensus
362     if (tv->fai) {
363         char *str;
364         if (tv->ref) free(tv->ref);
365         assert(tv->curr_tid>=0);
366 
367         const char *ref_name = sam_hdr_tid2name(tv->header, tv->curr_tid);
368         str = (char*)calloc(strlen(ref_name) + 30, 1);
369         assert(str!=NULL);
370         sprintf(str, "%s:%"PRIhts_pos"-%"PRIhts_pos, ref_name, tv->left_pos + 1, tv->left_pos + tv->mcol);
371         tv->ref = fai_fetch64(tv->fai, str, &tv->l_ref);
372         free(str);
373         if ( !tv->ref )
374         {
375             fprintf(stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
376             exit(1);
377         }
378     }
379     // draw aln
380     bam_lplbuf_reset(tv->lplbuf);
381     hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
382     bam1_t *b = bam_init1();
383     while ((ret = sam_itr_next(tv->fp, iter, b)) >= 0) tv_push_aln(b, tv);
384     bam_destroy1(b);
385     hts_itr_destroy(iter);
386     if (ret < -1) {
387         print_error("tview", "could not read from input file");
388         exit(1);
389     }
390 
391     bam_lplbuf_push(0, tv->lplbuf);
392 
393     while (tv->ccol < tv->mcol) {
394         hts_pos_t pos = tv->last_pos + 1;
395         int interval = pos < TEN_DIGITS ? 10 : 20;
396         if (pos%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, pos+1);
397         tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
398         ++tv->last_pos;
399     }
400     return 0;
401 }
402 
403 
404 
405 
error(const char * format,...)406 static void error(const char *format, ...)
407 {
408     if ( !format )
409     {
410         fprintf(stderr,
411 "Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
412 "Options:\n"
413 "   -d display      output as (H)tml or (C)urses or (T)ext \n"
414 "   -X              include customized index file\n"
415 "   -p chr:pos      go directly to this position\n"
416 "   -s STR          display only reads from this sample or group\n"
417 "   -w INT          display width (with -d T only)\n");
418         sam_global_opt_help(stderr, "-.--.--.");
419     }
420     else
421     {
422         va_list ap;
423         va_start(ap, format);
424         vfprintf(stderr, format, ap);
425         va_end(ap);
426     }
427     exit(-1);
428 }
429 
430 enum dipsay_mode {display_ncurses,display_html,display_text};
431 extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
432                                const char *samples, const htsFormat *fmt);
433 extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *fn_idx,
434                              const char *samples, const htsFormat *fmt);
435 extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *fn_idx,
436                              const char *samples, const htsFormat *fmt);
437 
bam_tview_main(int argc,char * argv[])438 int bam_tview_main(int argc, char *argv[])
439 {
440     int view_mode=display_ncurses, display_width = 0;
441     tview_t* tv=NULL;
442     char *samples=NULL, *position=NULL, *ref, *fn_idx=NULL;
443     int c, has_index_file = 0, ref_index = 0;
444 
445     sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
446     static const struct option lopts[] = {
447         SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
448         { NULL, 0, NULL, 0 }
449     };
450 
451     char *tmp;
452     while ((c = getopt_long(argc, argv, "s:p:d:Xw:", lopts, NULL)) >= 0) {
453         switch (c) {
454             case 'w':
455                 display_width = strtol(optarg,&tmp,10);
456                 if ( tmp==optarg || *tmp || display_width<1 ) error("Could not parse: -w %s\n",optarg);
457                 break;
458             case 's': samples=optarg; break;
459             case 'p': position=optarg; break;
460             case 'X': has_index_file=1; break; // -X flag for index filename
461             case 'd':
462             {
463                 switch(optarg[0])
464                 {
465                     case 'H': case 'h': view_mode=display_html;break;
466                     case 'T': case 't': view_mode=display_text;break;
467                     case 'C': case 'c': view_mode=display_ncurses;break;
468                     default: view_mode=display_ncurses;break;
469                 }
470                 break;
471             }
472             default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
473                       /* else fall-through */
474             case '?': error(NULL);
475         }
476     }
477     if (argc==optind) error(NULL);
478     if (display_width && view_mode == display_ncurses)
479         error("The -w option is currently supported only with -d T and -d H\n");
480 
481     ref = NULL;
482     ref_index = optind;
483     if (!has_index_file) {
484         ref = (optind+1>=argc)? ga.reference : argv[optind+1];
485     }
486     else {
487         ref = (optind+2>=argc)? ga.reference : argv[optind+2];
488         if (optind+1 >= argc) {
489             fprintf(stderr, "Incorrect number of arguments provided! Aborting...\n");
490             return 1;
491         }
492         fn_idx = argv[optind+1];
493         ref_index = optind+1;
494     }
495 
496     switch(view_mode)
497     {
498         case display_ncurses:
499             tv = curses_tv_init(argv[ref_index], ref, samples, &ga.in);
500             break;
501 
502         case display_text:
503             tv = text_tv_init(argv[ref_index], ref, fn_idx, samples, &ga.in);
504             if ( display_width ) tv->mcol = display_width;
505             break;
506 
507         case display_html:
508             tv = html_tv_init(argv[ref_index], ref, fn_idx, samples, &ga.in);
509             if ( display_width ) tv->mcol = display_width;
510             break;
511     }
512     if (tv==NULL)
513     {
514         error("cannot create view");
515         return EXIT_FAILURE;
516     }
517 
518     if ( position )
519     {
520         int tid;
521         hts_pos_t beg, end;
522         if (!sam_parse_region(tv->header, position, &tid, &beg, &end, 0)) {
523             tv->my_destroy(tv);
524             fprintf(stderr, "Unknown reference or malformed region\n");
525             exit(EXIT_FAILURE);
526         }
527         tv->curr_tid = tid;
528         tv->left_pos = beg;
529     }
530     else if ( tv->fai )
531     {
532         // find the first sequence present in both BAM and the reference file
533         int i;
534         for (i=0; i < sam_hdr_nref(tv->header); i++)
535         {
536             if ( faidx_has_seq(tv->fai, sam_hdr_tid2name(tv->header, i)) ) break;
537         }
538         if ( i==sam_hdr_nref(tv->header) )
539         {
540             tv->my_destroy(tv);
541             fprintf(stderr,"None of the BAM sequence names present in the fasta file\n");
542             exit(EXIT_FAILURE);
543         }
544         tv->curr_tid = i;
545     }
546     tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
547     tv->my_loop(tv);
548     tv->my_destroy(tv);
549 
550     return EXIT_SUCCESS;
551 }
552