1 /* The MIT License
2 
3    Copyright (c) 2013 Adrian Tan <atks@umich.edu>
4 
5    Permission is hereby granted, free of charge, to any person obtaining a copy
6    of this software and associated documentation files (the "Software"), to deal
7    in the Software without restriction, including without limitation the rights
8    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9    copies of the Software, and to permit persons to whom the Software is
10    furnished to do so, subject to the following conditions:
11 
12    The above copyright notice and this permission notice shall be included in
13    all copies or substantial portions of the Software.
14 
15    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21    THE SOFTWARE.
22 */
23 
24 #ifndef HTS_UTILS_H
25 #define HTS_UTILS_H
26 
27 #include <sys/stat.h>
28 #include "htslib/kstring.h"
29 #include "htslib/khash.h"
30 #include "htslib/hts.h"
31 #include "htslib/sam.h"
32 #include "htslib/vcf.h"
33 #include "htslib/bgzf.h"
34 #include "htslib/faidx.h"
35 #include "htslib/tbx.h"
36 #include "htslib/hfile.h"
37 #include "utils.h"
38 
39 /**********
40  *FAI UTILS
41  **********/
42 typedef struct {
43     int32_t line_len, line_blen;
44     int64_t len;
45     uint64_t offset;
46 } faidx1_t;
47 
48 KHASH_MAP_INIT_STR(s, faidx1_t)
49 
50 struct faidx_t {
51     BGZF *bgzf;
52     int n, m;
53     char **name;
54     khash_t(s) *hash;
55 };
56 
57 /**
58  * An alternate sequence fetcher for upper case sequence.
59  */
60 char *faidx_fetch_uc_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len);
61 
62 /**********
63  *HTS UTILS
64  **********/
65 
66 /**
67  * Checks file extension for use in writing files.
68  */
69 bool str_ends_with(std::string const & value, std::string const & ending);
70 
71 /**
72  * Checks file extension based on filename.
73  */
74 int32_t hts_filename_type(std::string const& value);
75 
76 /**************
77  *BAM HDR UTILS
78  **************/
79 
80 /**
81  * Copies contigs found in bam header to bcf header.
82  */
83 void bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t *sh, bcf_hdr_t *vh);
84 
85 /**
86  * Gets sample names from bam header.
87  */
88 std::string bam_hdr_get_sample_name(bam_hdr_t* hdr);
89 
90 /**
91  * Get number of sequences.
92  */
93 #define bam_hdr_get_n_targets(h) ((h)->n_targets)
94 
95 /**
96  * Get list of sequence names.
97  */
98 #define bam_hdr_get_target_name(h) ((h)->target_name)
99 
100 /**
101  * Get list of sequence lenghts.
102  */
103 #define bam_hdr_get_target_len(h) ((h)->target_len)
104 
105 /**********
106  *BAM UTILS
107  **********/
108 
109 //used when a base on a read is not aligned to the human genome reference
110 //in particular for soft clips and insertions
111 #define BAM_READ_INDEX_NA -1
112 
113 /**
114  * Gets the chromosome name of the tid.
115  */
116 #define bam_get_chrom(h, s) ((h)->target_name[(s)->core.tid])
117 
118 /**
119  * Gets the 1 based start position of the first mapped base in the read.
120  */
121 #define bam_get_pos1(s) ((s)->core.pos + 1)
122 
123 /**
124  * Gets the 0 based start position of the first mapped base in the read.
125  */
126 #define bam_get_pos0(s) ((s)->core.pos)
127 
128 /**
129  * Gets the end position of the last mapped base in the read.
130  */
131 int32_t bam_get_end_pos1(bam1_t *srec);
132 
133 /**
134  * Gets the template ID of the read.
135  */
136 #define bam_get_tid(s) ((s)->core.tid)
137 
138 /**
139  * Gets the template ID of the paired read.
140  */
141 #define bam_get_mtid(s) ((s)->core.mtid)
142 
143 /**
144  * Gets the start position of the first mapped base in the read.
145  */
146 #define bam_get_mpos1(s) ((s)->core.mpos)
147 
148 /**
149  * Gets the read sequence from a bam record
150  */
151 void bam_get_seq_string(bam1_t *s, kstring_t *seq);
152 
153 /**
154  * Gets the base qualities from a bam record.
155  */
156 void bam_get_qual_string(bam1_t *s, kstring_t *qual);
157 
158 /**
159  * Gets the cigar sequence from a bam record.
160  */
161 #define bam_get_n_cigar_op(b) ((b)->core.n_cigar)
162 
163 /**
164  * Gets the cigar from a BAM record.
165  */
166 void bam_get_cigar_string(bam1_t *s, kstring_t *cigar);
167 
168 /**
169  * Gets the cigar string from a bam record.
170  */
171 void bam_get_cigar_expanded_string(bam1_t *s, kstring_t *cigar_string);
172 
173 /**
174  * Is this sequence the first read?
175  */
176 #define bam_is_fread1(s) (((s)->core.flag&BAM_FREAD1) != 0)
177 
178 /**
179  * Is this sequence the second read?
180  */
181 #define bam_is_fread2(s) (((s)->core.flag&BAM_FREAD2) != 0)
182 
183 /**
184  * Gets the base in the read that is mapped to a genomic position.
185  * Returns -1 if it does not exists.
186  */
187 void bam_get_base_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos);
188 
189 /**
190  * Gets the base in the read that is mapped to a genomic position.
191  * Returns true if successful, false if the location is a deletion or not on the read.
192  */
193 void bam_get_base_and_qual_and_read_and_qual(bam1_t *s, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual);
194 
195 /**
196  * Converts base encoding to character.
197  */
198 #define bam_base2char(b) ("XACXGXXXTXXXXXXN"[(b)])
199 
200 /**
201  * Gets flag.
202  */
203 #define bam_get_flag(s) ((s)->core.flag)
204 
205 /**
206  * Get map quality.
207  */
208 #define bam_get_mapq(s) ((s)->core.qual)
209 
210 /**
211  * Get tid - e.g. chromosome id.
212  */
213 #define bam_get_tid(s) ((s)->core.tid)
214 
215 /**
216  * Get read length.
217  */
218 #define bam_get_l_qseq(s) ((s)->core.l_qseq)
219 
220 /**
221  * Prints a bam.
222  */
223 void bam_print(bam_hdr_t *h, bam1_t *s);
224 
225 /**************
226  *BCF HDR UTILS
227  **************/
228 
229 /**
230  * Prints header.
231  */
232 void bcf_hdr_print(bcf_hdr_t *h);
233 
234 /**
235  * Copies contigs found in bcf header to another bcf header.
236  */
237 void bcf_hdr_transfer_contigs(const bcf_hdr_t *sh, bcf_hdr_t *vh);
238 
239 /**
240  * Checks if a particular header type exists
241  * @hdr  - header
242  * @type - BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT, BCF_HL_CTG
243  * @key  - the key name
244  */
245 bool bcf_hdr_exists(bcf_hdr_t *hdr, int32_t type, const char *key);
246 
247 /**
248  * Extracts sequence length by rid.
249  */
250 int32_t* bcf_hdr_seqlen(const bcf_hdr_t *hdr, int32_t *nseq);
251 
252 /**
253  * Copies an info fields from one record to another
254  * @hsrc  - source header
255  * @vsrc  - source bcf1_t
256  * @hdest - destination header
257  * @vdest - destination bcf1_t
258  * @key   - the key name
259  * @type  - BCF_HT_FLAG, BCF_HT_INT, BCF_HT_REAL, BCF_HT_STR
260  */
261 void bcf_copy_info_field(bcf_hdr_t *hsrc, bcf1_t* vsrc, bcf_hdr_t *hdest, bcf1_t* vdest, const char* key, int32_t type);
262 
263 /**
264  * Get samples from bcf header
265  */
266 #define bcf_hdr_get_samples(h) ((h)->samples)
267 
268 /**
269  * Get ith sample name from bcf header
270  */
271 #define bcf_hdr_get_sample_name(h, i) ((h)->samples[i])
272 
273 /**
274  * Get number of samples in bcf header
275  */
276 int32_t bcf_hdr_get_n_sample(bcf_hdr_t *h);
277 
278 /**
279  * Reads header of a VCF file and returns the bcf header object.
280  * This wraps around vcf_hdr_read from the original htslib to
281  * allow for an alternative header file to be read in.
282  *
283  * this searches for the alternative header saved as <filename>.hdr
284  * If the VCF files is BCF, any alternative header is ignored.
285  */
286 bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp);
287 
288 /**
289  * Subsets a record by samples.
290  * @imap - indices the subsetted samples
291  */
292 int bcf_hdr_subset_samples(const bcf_hdr_t *h, bcf1_t *v, std::vector<int32_t>& imap);
293 
294 /**
295  * Help function for adding a header with a backup tag name.
296  * If the <tag> is already present, a new tag is attempted
297  * in the format <tag>_1 to <tag>_9.  If <tag>_9 failed,
298  * the function will not add any new tag and will return
299  * an empty string.
300  *
301  * Returns the tag that was inserted or updated.
302  */
303 std::string bcf_hdr_append_info_with_backup_naming(bcf_hdr_t *h, std::string tag, std::string number, std::string type, std::string description, bool rename);
304 
305 /**
306  * Translates BCF_VL types to string.
307  */
308 std::string bcf_hdr_vl2str(int32_t id);
309 
310 /**
311  * Translates BCF_BT types to string.
312  */
313 std::string bcf_hdr_ht2str(int32_t id);
314 
315 /**********
316  *BCF UTILS
317  **********/
318 
319 /**
320  * Gets number of expected genotypes from number of alleles for a ploidy of 2.
321  */
322 #define bcf_an2gn(n) (((n+1)*n)>>1)
323 
324 /**
325  * Gets number of genotypes from number of alleles and ploidy.
326  *
327  * Returns 0 if number of alleles and genotypes are not consistent.
328  */
329 uint32_t bcf_ap2g(uint32_t no_allele, uint32_t no_ploidy);
330 
331 /**
332  * Gets alleles from number of ploidy and genotype index.
333  */
334 void bcf_pg2a(uint32_t no_ploidy, uint32_t genotype_index, std::vector<int32_t>& alleles);
335 
336 /**
337  * Gets number of ploidy from number of alleles and genotypes.
338  */
339 uint32_t bcf_ag2p(uint32_t no_alleles, uint32_t no_genotypes);
340 
341 /**
342  * Gets genotype from genotype index and ploidy.
343  */
344 std::vector<int32_t> bcf_ip2g(int32_t genotype_index, uint32_t no_ploidy);
345 
346 /**
347  * Gets index of a genotype of p ploidy.
348  */
349 uint32_t bcf_g2i(int32_t* g, uint32_t p);
350 
351 /**
352  * Gets index of a genotype of n ploidy.
353  */
354 uint32_t bcf_g2i(std::vector<int32_t>& g);
355 
356 /**
357  * Gets index of a diploid genotype.
358  */
359 #define bcf_2g2c(i,j) ((i)+((((j)+1)*(j))>>1))
360 
361 ///**
362 // * Gets number of genotypes from number of alleles and ploidy.
363 // */
364 //uint32_t bcf_g2i(std::string genotype);
365 
366 /**
367  * Returns true if a is before b, false otherwise.
368  */
369 bool bcf_is_in_order(bcf1_t *a, bcf1_t *b);
370 
371 /**
372  * Returns a copy v that only has the chr:pos1:ref:alt information.
373  */
374 bcf1_t* bcf_copy_variant(bcf_hdr_t *h, bcf1_t *v);
375 
376 /**
377  * Gets a string representation of a variant.
378  */
379 std::string bcf_variant2string(bcf_hdr_t *h, bcf1_t *v);
380 
381 /**
382  * Gets a string representation of a variant.
383  */
384 void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var);
385 
386 /**
387  * Gets a sorted string representation of a variant.
388  */
389 void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var);
390 
391 /**
392  * Gets a string representation of the alleles of a variant.
393  */
394 void bcf_alleles2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var);
395 
396 /**
397  * Gets a sorted string representation of the alleles of a variant.
398  */
399 void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var);
400 
401 /**
402  * Prints a VCF record to STDERR.
403  */
404 void bcf_print(bcf_hdr_t *h, bcf1_t *v);
405 
406 /**
407  * Prints a VCF record in compact string representation to STDERR.
408  */
409 void bcf_print_lite(bcf_hdr_t *h, bcf1_t *v);
410 
411 /**
412  * Prints a VCF record in compact string representation to STDERR with alleles sorted.
413  */
414 void bcf_print_lite_sorted(bcf_hdr_t *h, bcf1_t *v);
415 
416 /**
417  * Prints a VCF record in compact string representation to STDERR with a new line.
418  */
419 void bcf_print_liten(bcf_hdr_t *h, bcf1_t *v);
420 
421 /**
422  * Get number of chromosomes
423  */
424 #define bcf_get_n_chrom(h) ((h)->n[BCF_DT_CTG])
425 
426 /**
427  * Get chromosome name
428  */
429 const char* bcf_get_chrom(bcf_hdr_t *h, bcf1_t *v);
430 
431 /**
432  * Set chromosome name
433  */
434 void bcf_set_chrom(bcf_hdr_t *h, bcf1_t *v, const char* chrom);
435 
436 /**
437  * Get RID
438  */
439 #define bcf_get_rid(v) ((v)->rid)
440 
441 /**
442  * Set RID
443  */
444 #define bcf_set_rid(v, c) ((v)->rid=(c))
445 
446 /**
447  * Check if variant is passed
448  */
449 bool bcf_is_passed(bcf_hdr_t *h, bcf1_t *v);
450 
451 /**
452  * Get 0-based position
453  */
454 #define bcf_get_pos0(v) ((v)->pos)
455 
456 /**
457  * Set 0-based position
458  */
459 #define bcf_set_pos0(v, p) ((v)->pos = (p))
460 
461 /**
462  * Get 1-based position
463  */
464 #define bcf_get_pos1(v) ((v)->pos+1)
465 
466 /**
467  * Get 1-based end position
468  */
469 #define bcf_get_end1(v) ((v)->pos + strlen((v)->d.allele[0]))
470 
471 /**
472  * Set 1-based position
473  */
474 #define bcf_set_pos1(v, p) ((v)->pos = (p)-1)
475 
476 /**
477  * Get id
478  */
479 #define bcf_get_id(v) ((v)->d.id)
480 
481 /**
482  * Set id.
483  */
484 void bcf_set_id(bcf1_t *v, char* id);
485 
486 /**
487  * Gets an info flag.
488  */
489 bool bcf_get_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag);
490 
491 /**
492  * Sets an info flag.
493  */
494 void bcf_set_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag, bool value);
495 
496 /**
497  * Gets an info integer.
498  */
499 int32_t bcf_get_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_value = 0);
500 
501 /**
502  * Sets an info integer.
503  */
504 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t value);
505 
506 /**
507  * Gets an info integer vector.
508  */
509 std::vector<int32_t> bcf_get_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size=0, int32_t default_value=0);
510 
511 /**
512  * Sets an info integer vector.
513  */
514 void bcf_set_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values);
515 
516 /**
517  * Gets an info float.
518  */
519 float bcf_get_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float default_value = 0);
520 
521 /**
522  * Sets an info float.
523  */
524 void bcf_set_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float value);
525 
526 /**
527  * Gets an info integer vector.
528  */
529 std::vector<float> bcf_get_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size=0, float default_value=0);
530 
531 /**
532  * Sets an info float vector.
533  */
534 void bcf_set_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<float>& values);
535 
536 /**
537  * Gets an info string.
538  */
539 std::string bcf_get_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = ".");
540 
541 /**
542  * Sets an info string.
543  */
544 void bcf_set_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = ".");
545 
546 /**
547  * Gets an info string vector.
548  */
549 std::vector<std::string> bcf_get_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = ".");
550 
551 /**
552  * Sets an info string vector.
553  */
554 void bcf_set_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<std::string> values);
555 
556 /**
557  * Get allele
558  */
559 #define bcf_get_allele(v) ((v)->d.allele)
560 
561 /**
562  * Get n_alleles of a bcf record
563  */
564 #define bcf_get_n_allele(v) ((v)->n_allele)
565 
566 /**
567  * Get reference base for a SNP, assumes the record is a biallelic SNP
568  */
569 #define bcf_get_snp_ref(v) ((v)->d.allele[0][0])
570 
571 /**
572  * Get alternative base for a SNP, assumes the record is a biallelic SNP
573  */
574 #define bcf_get_snp_alt(v) ((v)->d.allele[1][0])
575 
576 /**
577  * Get reference allele
578  */
579 #define bcf_get_ref(v) ((v)->d.allele[0])
580 
581 /**
582  * Get alternative allele
583  */
584 #define bcf_get_alt(v, i) ((v)->d.allele[i])
585 
586 /**
587  * Get variant type
588  */
589 #define bcf_get_var_type(v) ((v)->d.var_type)
590 
591 /**
592  * Get qual
593  */
594 #define bcf_get_qual(v) ((v)->qual)
595 
596 /**
597  * Get n_flt of a bcf record
598  */
599 #define bcf_get_n_filter(v) ((v)->d.n_flt)
600 
601 /**
602  * Get ith format name
603  */
604 #define bcf_get_format(h, v, i) (h)->id[BCF_DT_ID][(v->d.fmt)[i].id].key
605 /**
606  * Get number of samples in bcf record
607  */
608 #define bcf_get_n_sample(v) ((v)->n_sample)
609 
610 /**
611  * Set number of samples in bcf record
612  */
613 #define bcf_set_n_sample(v, n) ((v)->n_sample = (n))
614 
615 /**
616  * Get qual
617  */
618 #define bcf_get_qual(v) ((v)->qual)
619 
620 /**
621  * Set qual
622  */
623 #define bcf_set_qual(v, q) ((v)->qual = (q))
624 
625 /**
626  * Creates a dummy header with hs37d5 contigs for testing purposes.
627  */
628 bcf_hdr_t* bcf_create_dummy_hdr();
629 
630 /**
631  * Creates a dummy bcf record representing the variant for testing purposes.
632  *
633  * @variant - 1:123:ACT:AC/ACCCC
634  */
635 bcf1_t* bcf_create_dummy_record(bcf_hdr_t* h, std::string& variant);
636 
637 /**********
638  *LOG UTILS
639  **********/
640 
641 /**
642  * Prints a message to STDERR and abort.
643  */
644 void error(const char * msg, ...);
645 
646 /**
647  * Gives a notice to STDERR.
648  */
649 void notice(const char * msg, ...);
650 
651 #endif
652