1 /* bam_tview.c -- tview subcommand.
2
3 Copyright (C) 2008-2016, 2019-2020 Genome Research Ltd.
4 Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
5
6 Author: Heng Li <lh3@sanger.ac.uk>
7
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14
15 The above copyright notices and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25
26 #include <config.h>
27
28 #include <regex.h>
29 #include <assert.h>
30 #include "bam_tview.h"
31 #include <htslib/faidx.h>
32 #include <htslib/sam.h>
33 #include <htslib/bgzf.h>
34 #include "samtools.h"
35 #include "sam_opts.h"
36
37 // Threshold where spacing of numbers on the scale line changes from 10 to 20
38 // to stop them running into each other.
39 #define TEN_DIGITS 1000000000
40
destroy_rg_hash(khash_t (kh_rg)* rg_hash)41 static void destroy_rg_hash(khash_t(kh_rg)* rg_hash)
42 {
43 khiter_t k;
44
45 if (!rg_hash)
46 return;
47
48 for (k = 0; k < kh_end(rg_hash); k++) {
49 if (kh_exist(rg_hash, k))
50 free((char *) kh_key(rg_hash, k));
51 }
52 kh_destroy(kh_rg, rg_hash);
53 }
54
55 static
khash_t(kh_rg)56 khash_t(kh_rg)* get_rg_sample(sam_hdr_t* header, const char* sample)
57 {
58 int n_rg, i;
59 kstring_t id_val = KS_INITIALIZE, sm_val = KS_INITIALIZE;
60 khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
61 if (!rg_hash) return NULL;
62
63 // There may be more than one @RG with a given SM:, so iterate through them
64 n_rg = sam_hdr_count_lines(header, "RG");
65 if (n_rg < 0) {
66 print_error("tview", "couldn't parse header");
67 goto fail;
68 }
69
70 for (i = 0; i < n_rg; i++) {
71 // Try ID: first
72 int r = sam_hdr_find_tag_pos(header, "RG", i, "ID", &id_val);
73 if (r < -1) goto memfail;
74 if (r == -1) continue; // Shouldn't happen as ID is compulsory
75
76 if (strcmp(sample, id_val.s) != 0) {
77 // No match, try SM:
78 r = sam_hdr_find_tag_pos(header, "RG", i, "SM", &sm_val);
79 if (r < -1) goto memfail;
80 if (r < 0 || strcmp(sample, sm_val.s) != 0)
81 continue;
82 }
83
84 // Found a match, add ID to rg_hash
85 kh_put(kh_rg, rg_hash, id_val.s, &r);
86 if (r < 0) goto memfail;
87 ks_release(&id_val); // Now owned by hash table
88 }
89
90 ks_free(&id_val);
91 ks_free(&sm_val);
92 return rg_hash;
93
94 memfail:
95 perror("tview");
96 fail:
97 ks_free(&id_val);
98 ks_free(&sm_val);
99 destroy_rg_hash(rg_hash);
100 return NULL;
101 }
102
base_tv_init(tview_t * tv,const char * fn,const char * fn_fa,const char * fn_idx,const char * samples,const htsFormat * fmt)103 int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa, const char *fn_idx,
104 const char *samples, const htsFormat *fmt)
105 {
106 assert(tv!=NULL);
107 assert(fn!=NULL);
108 tv->mrow = 24; tv->mcol = 80;
109 tv->color_for = TV_COLOR_MAPQ;
110 tv->is_dot = 1;
111
112 tv->fp = sam_open_format(fn, "r", fmt);
113 if(tv->fp == NULL)
114 {
115 print_error_errno("tview", "can't open \"%s\"", fn);
116 exit(EXIT_FAILURE);
117 }
118 // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
119 assert(tv->fp);
120
121 tv->header = sam_hdr_read(tv->fp);
122 if(tv->header == NULL)
123 {
124 print_error("tview", "cannot read \"%s\"", fn);
125 exit(EXIT_FAILURE);
126 }
127 // If index filename has not been specfied, look in BAM folder
128 if (fn_idx != NULL) {
129 tv->idx = sam_index_load2(tv->fp, fn, fn_idx);
130 } else {
131 tv->idx = sam_index_load(tv->fp, fn);
132 }
133
134 if (tv->idx == NULL)
135 {
136 print_error("tview", "cannot read index for \"%s\"", fn);
137 exit(EXIT_FAILURE);
138 }
139 tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
140 if (fn_fa) tv->fai = fai_load(fn_fa);
141 tv->bca = bcf_call_init(0.83, 13);
142 tv->ins = 1;
143
144 // If the user has asked for specific samples find out create a list of readgroups make up these samples
145 if ( samples )
146 {
147 tv->rg_hash = get_rg_sample(tv->header, samples); // Init the list of rg's
148 if (kh_size(tv->rg_hash) == 0) {
149 print_error("tview",
150 "The sample or read group \"%s\" not present.",
151 samples);
152 exit(EXIT_FAILURE);
153 }
154 }
155
156 return 0;
157 }
158
159
base_tv_destroy(tview_t * tv)160 void base_tv_destroy(tview_t* tv)
161 {
162 bam_lplbuf_destroy(tv->lplbuf);
163 bcf_call_destroy(tv->bca);
164 hts_idx_destroy(tv->idx);
165 if (tv->fai) fai_destroy(tv->fai);
166 free(tv->ref);
167 sam_hdr_destroy(tv->header);
168 destroy_rg_hash(tv->rg_hash);
169 sam_close(tv->fp);
170 }
171
172
tv_pl_func(uint32_t tid,hts_pos_t pos,int n,const bam_pileup1_t * pl,void * data)173 int tv_pl_func(uint32_t tid, hts_pos_t pos, int n, const bam_pileup1_t *pl, void *data)
174 {
175 tview_t *tv = (tview_t*)data;
176 hts_pos_t cp;
177 int i, j, c, rb, attr, max_ins = 0, interval;
178 uint32_t call = 0;
179 kstring_t ks = KS_INITIALIZE;
180 if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
181 // print reference
182 rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
183 for (cp = tv->last_pos + 1; cp < pos; ++cp) {
184 interval = cp < TEN_DIGITS ? 10 : 20;
185 if (cp%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, cp+1);
186 c = tv->ref? tv->ref[cp - tv->left_pos] : 'N';
187 tv->my_mvaddch(tv,1, tv->ccol++, c);
188 }
189 interval = pos < TEN_DIGITS ? 10 : 20;
190 if (pos%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, pos+1);
191 { // call consensus
192 bcf_callret1_t bcr;
193 memset(&bcr, 0, sizeof bcr);
194 int qsum[4], a1, a2, tmp;
195 double p[3], prior = 30;
196 bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
197 for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
198 for (i = 1; i < 4; ++i) // insertion sort
199 for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
200 tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
201 a1 = qsum[0]&3; a2 = qsum[1]&3;
202 p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
203 if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
204 if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
205 if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
206 else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
207 else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
208 }
209 attr = tv->my_underline(tv);
210 c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
211 i = (call&0xffff)/10+1;
212 if (i > 4) i = 4;
213 attr |= tv->my_colorpair(tv,i);
214 if (c == toupper(rb)) c = '.';
215 tv->my_attron(tv,attr);
216 tv->my_mvaddch(tv,2, tv->ccol, c);
217 tv->my_attroff(tv,attr);
218 if(tv->ins) {
219 // calculate maximum insert
220 for (i = 0; i < n; ++i) {
221 const bam_pileup1_t *p = pl + i;
222 int len = bam_plp_insertion(p, &ks, NULL);
223 if (len < 0) {
224 print_error("tview", "Memory allocation failure.");
225 exit(1);
226 }
227 if (max_ins < len) max_ins = len;
228 }
229 }
230 // core loop
231 for (j = 0; j <= max_ins; ++j) {
232 for (i = 0; i < n; ++i) {
233 const bam_pileup1_t *p = pl + i;
234 int row = TV_MIN_ALNROW + p->level - tv->row_shift;
235 if (j == 0) {
236 if (!p->is_del) {
237 if (tv->base_for == TV_BASE_COLOR_SPACE &&
238 (c = bam_aux_getCSi(p->b, p->qpos))) {
239 // assume that if we found one color, we will be able to get the color error
240 if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
241 } else {
242 if (tv->show_name) {
243 char *name = bam_get_qname(p->b);
244 c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
245 } else {
246 c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
247 if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
248 }
249 }
250 } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
251 } else { // padding
252 int len = bam_plp_insertion(p, &ks, NULL);
253 if (len < 0) {
254 print_error("tview", "Memory allocation failure.");
255 exit(1);
256 }
257
258 if (j > len) c = '*';
259 else { // insertion
260 if (tv->base_for == TV_BASE_NUCL) {
261 if (tv->show_name) {
262 char *name = bam_get_qname(p->b);
263 c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
264 } else {
265 c = ks.s[j-1];
266 if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
267 }
268 } else {
269 c = bam_aux_getCSi(p->b, p->qpos + j);
270 if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
271 }
272 }
273 }
274 if (row > TV_MIN_ALNROW && row < tv->mrow) {
275 int x;
276 attr = 0;
277 if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
278 || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
279 if (tv->color_for == TV_COLOR_BASEQ) {
280 x = bam_get_qual(p->b)[p->qpos]/10 + 1;
281 if (x > 4) x = 4;
282 attr |= tv->my_colorpair(tv,x);
283 } else if (tv->color_for == TV_COLOR_MAPQ) {
284 x = p->b->core.qual/10 + 1;
285 if (x > 4) x = 4;
286 attr |= tv->my_colorpair(tv,x);
287 } else if (tv->color_for == TV_COLOR_NUCL) {
288 x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
289 attr |= tv->my_colorpair(tv,x);
290 } else if(tv->color_for == TV_COLOR_COL) {
291 x = 0;
292 switch(bam_aux_getCSi(p->b, p->qpos)) {
293 case '0': x = 0; break;
294 case '1': x = 1; break;
295 case '2': x = 2; break;
296 case '3': x = 3; break;
297 case '4': x = 4; break;
298 default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
299 }
300 x+=5;
301 attr |= tv->my_colorpair(tv,x);
302 } else if(tv->color_for == TV_COLOR_COLQ) {
303 x = bam_aux_getCQi(p->b, p->qpos);
304 if(0 == x) x = bam_get_qual(p->b)[p->qpos];
305 x = x/10 + 1;
306 if (x > 4) x = 4;
307 attr |= tv->my_colorpair(tv,x);
308 }
309 tv->my_attron(tv,attr);
310 tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
311 tv->my_attroff(tv,attr);
312 }
313 }
314 c = j? '*' : rb;
315 if (c == '*') {
316 attr = tv->my_colorpair(tv,8);
317 tv->my_attron(tv,attr);
318 tv->my_mvaddch(tv,1, tv->ccol++, c);
319 tv->my_attroff(tv,attr);
320 } else tv->my_mvaddch(tv,1, tv->ccol++, c);
321 }
322 tv->last_pos = pos;
323 ks_free(&ks);
324 return 0;
325 }
326
327
328
329
tv_push_aln(const bam1_t * b,tview_t * tv)330 static int tv_push_aln(const bam1_t *b, tview_t *tv)
331 {
332 /* If we are restricted to specific readgroups check RG is in the list */
333 if ( tv->rg_hash )
334 {
335 const uint8_t *rg = bam_aux_get(b, "RG");
336 if ( !rg ) return 0; // If we don't have an RG tag exclude read
337 khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
338 if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
339 }
340 if (tv->no_skip) {
341 uint32_t *cigar = bam_get_cigar(b); // this is cheating...
342 int i;
343 for (i = 0; i <b->core.n_cigar; ++i) {
344 if ((cigar[i]&0xf) == BAM_CREF_SKIP)
345 cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
346 }
347 }
348 bam_lplbuf_push(b, tv->lplbuf);
349 return 0;
350 }
351
base_draw_aln(tview_t * tv,int tid,hts_pos_t pos)352 int base_draw_aln(tview_t *tv, int tid, hts_pos_t pos)
353 {
354 int ret;
355 assert(tv!=NULL);
356 // reset
357 tv->my_clear(tv);
358 tv->curr_tid = tid; tv->left_pos = pos;
359 tv->last_pos = tv->left_pos - 1;
360 tv->ccol = 0;
361 // print ref and consensus
362 if (tv->fai) {
363 char *str;
364 if (tv->ref) free(tv->ref);
365 assert(tv->curr_tid>=0);
366
367 const char *ref_name = sam_hdr_tid2name(tv->header, tv->curr_tid);
368 str = (char*)calloc(strlen(ref_name) + 30, 1);
369 assert(str!=NULL);
370 sprintf(str, "%s:%"PRIhts_pos"-%"PRIhts_pos, ref_name, tv->left_pos + 1, tv->left_pos + tv->mcol);
371 tv->ref = fai_fetch64(tv->fai, str, &tv->l_ref);
372 free(str);
373 if ( !tv->ref )
374 {
375 fprintf(stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
376 exit(1);
377 }
378 }
379 // draw aln
380 bam_lplbuf_reset(tv->lplbuf);
381 hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
382 bam1_t *b = bam_init1();
383 while ((ret = sam_itr_next(tv->fp, iter, b)) >= 0) tv_push_aln(b, tv);
384 bam_destroy1(b);
385 hts_itr_destroy(iter);
386 if (ret < -1) {
387 print_error("tview", "could not read from input file");
388 exit(1);
389 }
390
391 bam_lplbuf_push(0, tv->lplbuf);
392
393 while (tv->ccol < tv->mcol) {
394 hts_pos_t pos = tv->last_pos + 1;
395 int interval = pos < TEN_DIGITS ? 10 : 20;
396 if (pos%interval == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-"PRIhts_pos, pos+1);
397 tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
398 ++tv->last_pos;
399 }
400 return 0;
401 }
402
403
404
405
error(const char * format,...)406 static void error(const char *format, ...)
407 {
408 if ( !format )
409 {
410 fprintf(stderr,
411 "Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
412 "Options:\n"
413 " -d display output as (H)tml or (C)urses or (T)ext \n"
414 " -X include customized index file\n"
415 " -p chr:pos go directly to this position\n"
416 " -s STR display only reads from this sample or group\n"
417 " -w INT display width (with -d T only)\n");
418 sam_global_opt_help(stderr, "-.--.--.");
419 }
420 else
421 {
422 va_list ap;
423 va_start(ap, format);
424 vfprintf(stderr, format, ap);
425 va_end(ap);
426 }
427 exit(-1);
428 }
429
430 enum dipsay_mode {display_ncurses,display_html,display_text};
431 extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
432 const char *samples, const htsFormat *fmt);
433 extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *fn_idx,
434 const char *samples, const htsFormat *fmt);
435 extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *fn_idx,
436 const char *samples, const htsFormat *fmt);
437
bam_tview_main(int argc,char * argv[])438 int bam_tview_main(int argc, char *argv[])
439 {
440 int view_mode=display_ncurses, display_width = 0;
441 tview_t* tv=NULL;
442 char *samples=NULL, *position=NULL, *ref, *fn_idx=NULL;
443 int c, has_index_file = 0, ref_index = 0;
444
445 sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
446 static const struct option lopts[] = {
447 SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
448 { NULL, 0, NULL, 0 }
449 };
450
451 char *tmp;
452 while ((c = getopt_long(argc, argv, "s:p:d:Xw:", lopts, NULL)) >= 0) {
453 switch (c) {
454 case 'w':
455 display_width = strtol(optarg,&tmp,10);
456 if ( tmp==optarg || *tmp || display_width<1 ) error("Could not parse: -w %s\n",optarg);
457 break;
458 case 's': samples=optarg; break;
459 case 'p': position=optarg; break;
460 case 'X': has_index_file=1; break; // -X flag for index filename
461 case 'd':
462 {
463 switch(optarg[0])
464 {
465 case 'H': case 'h': view_mode=display_html;break;
466 case 'T': case 't': view_mode=display_text;break;
467 case 'C': case 'c': view_mode=display_ncurses;break;
468 default: view_mode=display_ncurses;break;
469 }
470 break;
471 }
472 default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
473 /* else fall-through */
474 case '?': error(NULL);
475 }
476 }
477 if (argc==optind) error(NULL);
478 if (display_width && view_mode == display_ncurses)
479 error("The -w option is currently supported only with -d T and -d H\n");
480
481 ref = NULL;
482 ref_index = optind;
483 if (!has_index_file) {
484 ref = (optind+1>=argc)? ga.reference : argv[optind+1];
485 }
486 else {
487 ref = (optind+2>=argc)? ga.reference : argv[optind+2];
488 if (optind+1 >= argc) {
489 fprintf(stderr, "Incorrect number of arguments provided! Aborting...\n");
490 return 1;
491 }
492 fn_idx = argv[optind+1];
493 ref_index = optind+1;
494 }
495
496 switch(view_mode)
497 {
498 case display_ncurses:
499 tv = curses_tv_init(argv[ref_index], ref, samples, &ga.in);
500 break;
501
502 case display_text:
503 tv = text_tv_init(argv[ref_index], ref, fn_idx, samples, &ga.in);
504 if ( display_width ) tv->mcol = display_width;
505 break;
506
507 case display_html:
508 tv = html_tv_init(argv[ref_index], ref, fn_idx, samples, &ga.in);
509 if ( display_width ) tv->mcol = display_width;
510 break;
511 }
512 if (tv==NULL)
513 {
514 error("cannot create view");
515 return EXIT_FAILURE;
516 }
517
518 if ( position )
519 {
520 int tid;
521 hts_pos_t beg, end;
522 if (!sam_parse_region(tv->header, position, &tid, &beg, &end, 0)) {
523 tv->my_destroy(tv);
524 fprintf(stderr, "Unknown reference or malformed region\n");
525 exit(EXIT_FAILURE);
526 }
527 tv->curr_tid = tid;
528 tv->left_pos = beg;
529 }
530 else if ( tv->fai )
531 {
532 // find the first sequence present in both BAM and the reference file
533 int i;
534 for (i=0; i < sam_hdr_nref(tv->header); i++)
535 {
536 if ( faidx_has_seq(tv->fai, sam_hdr_tid2name(tv->header, i)) ) break;
537 }
538 if ( i==sam_hdr_nref(tv->header) )
539 {
540 tv->my_destroy(tv);
541 fprintf(stderr,"None of the BAM sequence names present in the fasta file\n");
542 exit(EXIT_FAILURE);
543 }
544 tv->curr_tid = i;
545 }
546 tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
547 tv->my_loop(tv);
548 tv->my_destroy(tv);
549
550 return EXIT_SUCCESS;
551 }
552