1 /* The MIT License 2 3 Copyright (c) 2013 Adrian Tan <atks@umich.edu> 4 5 Permission is hereby granted, free of charge, to any person obtaining a copy 6 of this software and associated documentation files (the "Software"), to deal 7 in the Software without restriction, including without limitation the rights 8 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 copies of the Software, and to permit persons to whom the Software is 10 furnished to do so, subject to the following conditions: 11 12 The above copyright notice and this permission notice shall be included in 13 all copies or substantial portions of the Software. 14 15 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 THE SOFTWARE. 22 */ 23 24 #ifndef HTS_UTILS_H 25 #define HTS_UTILS_H 26 27 #include <sys/stat.h> 28 #include "htslib/kstring.h" 29 #include "htslib/khash.h" 30 #include "htslib/hts.h" 31 #include "htslib/sam.h" 32 #include "htslib/vcf.h" 33 #include "htslib/bgzf.h" 34 #include "htslib/faidx.h" 35 #include "htslib/tbx.h" 36 #include "htslib/hfile.h" 37 #include "utils.h" 38 39 /********** 40 *FAI UTILS 41 **********/ 42 typedef struct { 43 int32_t line_len, line_blen; 44 int64_t len; 45 uint64_t offset; 46 } faidx1_t; 47 48 KHASH_MAP_INIT_STR(s, faidx1_t) 49 50 struct faidx_t { 51 BGZF *bgzf; 52 int n, m; 53 char **name; 54 khash_t(s) *hash; 55 }; 56 57 /** 58 * An alternate sequence fetcher for upper case sequence. 59 */ 60 char *faidx_fetch_uc_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); 61 62 /********** 63 *HTS UTILS 64 **********/ 65 66 /** 67 * Checks file extension for use in writing files. 68 */ 69 bool str_ends_with(std::string const & value, std::string const & ending); 70 71 /** 72 * Checks file extension based on filename. 73 */ 74 int32_t hts_filename_type(std::string const& value); 75 76 /************** 77 *BAM HDR UTILS 78 **************/ 79 80 /** 81 * Copies contigs found in bam header to bcf header. 82 */ 83 void bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t *sh, bcf_hdr_t *vh); 84 85 /** 86 * Gets sample names from bam header. 87 */ 88 std::string bam_hdr_get_sample_name(bam_hdr_t* hdr); 89 90 /** 91 * Get number of sequences. 92 */ 93 #define bam_hdr_get_n_targets(h) ((h)->n_targets) 94 95 /** 96 * Get list of sequence names. 97 */ 98 #define bam_hdr_get_target_name(h) ((h)->target_name) 99 100 /** 101 * Get list of sequence lenghts. 102 */ 103 #define bam_hdr_get_target_len(h) ((h)->target_len) 104 105 /********** 106 *BAM UTILS 107 **********/ 108 109 //used when a base on a read is not aligned to the human genome reference 110 //in particular for soft clips and insertions 111 #define BAM_READ_INDEX_NA -1 112 113 /** 114 * Gets the chromosome name of the tid. 115 */ 116 #define bam_get_chrom(h, s) ((h)->target_name[(s)->core.tid]) 117 118 /** 119 * Gets the 1 based start position of the first mapped base in the read. 120 */ 121 #define bam_get_pos1(s) ((s)->core.pos + 1) 122 123 /** 124 * Gets the 0 based start position of the first mapped base in the read. 125 */ 126 #define bam_get_pos0(s) ((s)->core.pos) 127 128 /** 129 * Gets the end position of the last mapped base in the read. 130 */ 131 int32_t bam_get_end_pos1(bam1_t *srec); 132 133 /** 134 * Gets the template ID of the read. 135 */ 136 #define bam_get_tid(s) ((s)->core.tid) 137 138 /** 139 * Gets the template ID of the paired read. 140 */ 141 #define bam_get_mtid(s) ((s)->core.mtid) 142 143 /** 144 * Gets the start position of the first mapped base in the read. 145 */ 146 #define bam_get_mpos1(s) ((s)->core.mpos) 147 148 /** 149 * Gets the read sequence from a bam record 150 */ 151 void bam_get_seq_string(bam1_t *s, kstring_t *seq); 152 153 /** 154 * Gets the base qualities from a bam record. 155 */ 156 void bam_get_qual_string(bam1_t *s, kstring_t *qual); 157 158 /** 159 * Gets the cigar sequence from a bam record. 160 */ 161 #define bam_get_n_cigar_op(b) ((b)->core.n_cigar) 162 163 /** 164 * Gets the cigar from a BAM record. 165 */ 166 void bam_get_cigar_string(bam1_t *s, kstring_t *cigar); 167 168 /** 169 * Gets the cigar string from a bam record. 170 */ 171 void bam_get_cigar_expanded_string(bam1_t *s, kstring_t *cigar_string); 172 173 /** 174 * Is this sequence the first read? 175 */ 176 #define bam_is_fread1(s) (((s)->core.flag&BAM_FREAD1) != 0) 177 178 /** 179 * Is this sequence the second read? 180 */ 181 #define bam_is_fread2(s) (((s)->core.flag&BAM_FREAD2) != 0) 182 183 /** 184 * Gets the base in the read that is mapped to a genomic position. 185 * Returns -1 if it does not exists. 186 */ 187 void bam_get_base_and_qual(bam1_t *srec, uint32_t pos, char& base, char& qual, int32_t& rpos); 188 189 /** 190 * Gets the base in the read that is mapped to a genomic position. 191 * Returns true if successful, false if the location is a deletion or not on the read. 192 */ 193 void bam_get_base_and_qual_and_read_and_qual(bam1_t *s, uint32_t pos, char& base, char& qual, int32_t& rpos, kstring_t* readseq, kstring_t* readqual); 194 195 /** 196 * Converts base encoding to character. 197 */ 198 #define bam_base2char(b) ("XACXGXXXTXXXXXXN"[(b)]) 199 200 /** 201 * Gets flag. 202 */ 203 #define bam_get_flag(s) ((s)->core.flag) 204 205 /** 206 * Get map quality. 207 */ 208 #define bam_get_mapq(s) ((s)->core.qual) 209 210 /** 211 * Get tid - e.g. chromosome id. 212 */ 213 #define bam_get_tid(s) ((s)->core.tid) 214 215 /** 216 * Get read length. 217 */ 218 #define bam_get_l_qseq(s) ((s)->core.l_qseq) 219 220 /** 221 * Prints a bam. 222 */ 223 void bam_print(bam_hdr_t *h, bam1_t *s); 224 225 /************** 226 *BCF HDR UTILS 227 **************/ 228 229 /** 230 * Prints header. 231 */ 232 void bcf_hdr_print(bcf_hdr_t *h); 233 234 /** 235 * Copies contigs found in bcf header to another bcf header. 236 */ 237 void bcf_hdr_transfer_contigs(const bcf_hdr_t *sh, bcf_hdr_t *vh); 238 239 /** 240 * Checks if a particular header type exists 241 * @hdr - header 242 * @type - BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT, BCF_HL_CTG 243 * @key - the key name 244 */ 245 bool bcf_hdr_exists(bcf_hdr_t *hdr, int32_t type, const char *key); 246 247 /** 248 * Extracts sequence length by rid. 249 */ 250 int32_t* bcf_hdr_seqlen(const bcf_hdr_t *hdr, int32_t *nseq); 251 252 /** 253 * Copies an info fields from one record to another 254 * @hsrc - source header 255 * @vsrc - source bcf1_t 256 * @hdest - destination header 257 * @vdest - destination bcf1_t 258 * @key - the key name 259 * @type - BCF_HT_FLAG, BCF_HT_INT, BCF_HT_REAL, BCF_HT_STR 260 */ 261 void bcf_copy_info_field(bcf_hdr_t *hsrc, bcf1_t* vsrc, bcf_hdr_t *hdest, bcf1_t* vdest, const char* key, int32_t type); 262 263 /** 264 * Get samples from bcf header 265 */ 266 #define bcf_hdr_get_samples(h) ((h)->samples) 267 268 /** 269 * Get ith sample name from bcf header 270 */ 271 #define bcf_hdr_get_sample_name(h, i) ((h)->samples[i]) 272 273 /** 274 * Get number of samples in bcf header 275 */ 276 int32_t bcf_hdr_get_n_sample(bcf_hdr_t *h); 277 278 /** 279 * Reads header of a VCF file and returns the bcf header object. 280 * This wraps around vcf_hdr_read from the original htslib to 281 * allow for an alternative header file to be read in. 282 * 283 * this searches for the alternative header saved as <filename>.hdr 284 * If the VCF files is BCF, any alternative header is ignored. 285 */ 286 bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp); 287 288 /** 289 * Subsets a record by samples. 290 * @imap - indices the subsetted samples 291 */ 292 int bcf_hdr_subset_samples(const bcf_hdr_t *h, bcf1_t *v, std::vector<int32_t>& imap); 293 294 /** 295 * Help function for adding a header with a backup tag name. 296 * If the <tag> is already present, a new tag is attempted 297 * in the format <tag>_1 to <tag>_9. If <tag>_9 failed, 298 * the function will not add any new tag and will return 299 * an empty string. 300 * 301 * Returns the tag that was inserted or updated. 302 */ 303 std::string bcf_hdr_append_info_with_backup_naming(bcf_hdr_t *h, std::string tag, std::string number, std::string type, std::string description, bool rename); 304 305 /** 306 * Translates BCF_VL types to string. 307 */ 308 std::string bcf_hdr_vl2str(int32_t id); 309 310 /** 311 * Translates BCF_BT types to string. 312 */ 313 std::string bcf_hdr_ht2str(int32_t id); 314 315 /********** 316 *BCF UTILS 317 **********/ 318 319 /** 320 * Gets number of expected genotypes from number of alleles for a ploidy of 2. 321 */ 322 #define bcf_an2gn(n) (((n+1)*n)>>1) 323 324 /** 325 * Gets number of genotypes from number of alleles and ploidy. 326 * 327 * Returns 0 if number of alleles and genotypes are not consistent. 328 */ 329 uint32_t bcf_ap2g(uint32_t no_allele, uint32_t no_ploidy); 330 331 /** 332 * Gets alleles from number of ploidy and genotype index. 333 */ 334 void bcf_pg2a(uint32_t no_ploidy, uint32_t genotype_index, std::vector<int32_t>& alleles); 335 336 /** 337 * Gets number of ploidy from number of alleles and genotypes. 338 */ 339 uint32_t bcf_ag2p(uint32_t no_alleles, uint32_t no_genotypes); 340 341 /** 342 * Gets genotype from genotype index and ploidy. 343 */ 344 std::vector<int32_t> bcf_ip2g(int32_t genotype_index, uint32_t no_ploidy); 345 346 /** 347 * Gets index of a genotype of p ploidy. 348 */ 349 uint32_t bcf_g2i(int32_t* g, uint32_t p); 350 351 /** 352 * Gets index of a genotype of n ploidy. 353 */ 354 uint32_t bcf_g2i(std::vector<int32_t>& g); 355 356 /** 357 * Gets index of a diploid genotype. 358 */ 359 #define bcf_2g2c(i,j) ((i)+((((j)+1)*(j))>>1)) 360 361 ///** 362 // * Gets number of genotypes from number of alleles and ploidy. 363 // */ 364 //uint32_t bcf_g2i(std::string genotype); 365 366 /** 367 * Returns true if a is before b, false otherwise. 368 */ 369 bool bcf_is_in_order(bcf1_t *a, bcf1_t *b); 370 371 /** 372 * Returns a copy v that only has the chr:pos1:ref:alt information. 373 */ 374 bcf1_t* bcf_copy_variant(bcf_hdr_t *h, bcf1_t *v); 375 376 /** 377 * Gets a string representation of a variant. 378 */ 379 std::string bcf_variant2string(bcf_hdr_t *h, bcf1_t *v); 380 381 /** 382 * Gets a string representation of a variant. 383 */ 384 void bcf_variant2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var); 385 386 /** 387 * Gets a sorted string representation of a variant. 388 */ 389 void bcf_variant2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var); 390 391 /** 392 * Gets a string representation of the alleles of a variant. 393 */ 394 void bcf_alleles2string(bcf_hdr_t *h, bcf1_t *v, kstring_t *var); 395 396 /** 397 * Gets a sorted string representation of the alleles of a variant. 398 */ 399 void bcf_alleles2string_sorted(bcf_hdr_t *h, bcf1_t *v, kstring_t *var); 400 401 /** 402 * Prints a VCF record to STDERR. 403 */ 404 void bcf_print(bcf_hdr_t *h, bcf1_t *v); 405 406 /** 407 * Prints a VCF record in compact string representation to STDERR. 408 */ 409 void bcf_print_lite(bcf_hdr_t *h, bcf1_t *v); 410 411 /** 412 * Prints a VCF record in compact string representation to STDERR with alleles sorted. 413 */ 414 void bcf_print_lite_sorted(bcf_hdr_t *h, bcf1_t *v); 415 416 /** 417 * Prints a VCF record in compact string representation to STDERR with a new line. 418 */ 419 void bcf_print_liten(bcf_hdr_t *h, bcf1_t *v); 420 421 /** 422 * Get number of chromosomes 423 */ 424 #define bcf_get_n_chrom(h) ((h)->n[BCF_DT_CTG]) 425 426 /** 427 * Get chromosome name 428 */ 429 const char* bcf_get_chrom(bcf_hdr_t *h, bcf1_t *v); 430 431 /** 432 * Set chromosome name 433 */ 434 void bcf_set_chrom(bcf_hdr_t *h, bcf1_t *v, const char* chrom); 435 436 /** 437 * Get RID 438 */ 439 #define bcf_get_rid(v) ((v)->rid) 440 441 /** 442 * Set RID 443 */ 444 #define bcf_set_rid(v, c) ((v)->rid=(c)) 445 446 /** 447 * Check if variant is passed 448 */ 449 bool bcf_is_passed(bcf_hdr_t *h, bcf1_t *v); 450 451 /** 452 * Get 0-based position 453 */ 454 #define bcf_get_pos0(v) ((v)->pos) 455 456 /** 457 * Set 0-based position 458 */ 459 #define bcf_set_pos0(v, p) ((v)->pos = (p)) 460 461 /** 462 * Get 1-based position 463 */ 464 #define bcf_get_pos1(v) ((v)->pos+1) 465 466 /** 467 * Get 1-based end position 468 */ 469 #define bcf_get_end1(v) ((v)->pos + strlen((v)->d.allele[0])) 470 471 /** 472 * Set 1-based position 473 */ 474 #define bcf_set_pos1(v, p) ((v)->pos = (p)-1) 475 476 /** 477 * Get id 478 */ 479 #define bcf_get_id(v) ((v)->d.id) 480 481 /** 482 * Set id. 483 */ 484 void bcf_set_id(bcf1_t *v, char* id); 485 486 /** 487 * Gets an info flag. 488 */ 489 bool bcf_get_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag); 490 491 /** 492 * Sets an info flag. 493 */ 494 void bcf_set_info_flg(bcf_hdr_t *h, bcf1_t *v, const char* tag, bool value); 495 496 /** 497 * Gets an info integer. 498 */ 499 int32_t bcf_get_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_value = 0); 500 501 /** 502 * Sets an info integer. 503 */ 504 void bcf_set_info_int(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t value); 505 506 /** 507 * Gets an info integer vector. 508 */ 509 std::vector<int32_t> bcf_get_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size=0, int32_t default_value=0); 510 511 /** 512 * Sets an info integer vector. 513 */ 514 void bcf_set_info_int_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<int32_t>& values); 515 516 /** 517 * Gets an info float. 518 */ 519 float bcf_get_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float default_value = 0); 520 521 /** 522 * Sets an info float. 523 */ 524 void bcf_set_info_flt(bcf_hdr_t *h, bcf1_t *v, const char* tag, float value); 525 526 /** 527 * Gets an info integer vector. 528 */ 529 std::vector<float> bcf_get_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, int32_t default_size=0, float default_value=0); 530 531 /** 532 * Sets an info float vector. 533 */ 534 void bcf_set_info_flt_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<float>& values); 535 536 /** 537 * Gets an info string. 538 */ 539 std::string bcf_get_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = "."); 540 541 /** 542 * Sets an info string. 543 */ 544 void bcf_set_info_str(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = "."); 545 546 /** 547 * Gets an info string vector. 548 */ 549 std::vector<std::string> bcf_get_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::string default_value = "."); 550 551 /** 552 * Sets an info string vector. 553 */ 554 void bcf_set_info_str_vec(bcf_hdr_t *h, bcf1_t *v, const char* tag, std::vector<std::string> values); 555 556 /** 557 * Get allele 558 */ 559 #define bcf_get_allele(v) ((v)->d.allele) 560 561 /** 562 * Get n_alleles of a bcf record 563 */ 564 #define bcf_get_n_allele(v) ((v)->n_allele) 565 566 /** 567 * Get reference base for a SNP, assumes the record is a biallelic SNP 568 */ 569 #define bcf_get_snp_ref(v) ((v)->d.allele[0][0]) 570 571 /** 572 * Get alternative base for a SNP, assumes the record is a biallelic SNP 573 */ 574 #define bcf_get_snp_alt(v) ((v)->d.allele[1][0]) 575 576 /** 577 * Get reference allele 578 */ 579 #define bcf_get_ref(v) ((v)->d.allele[0]) 580 581 /** 582 * Get alternative allele 583 */ 584 #define bcf_get_alt(v, i) ((v)->d.allele[i]) 585 586 /** 587 * Get variant type 588 */ 589 #define bcf_get_var_type(v) ((v)->d.var_type) 590 591 /** 592 * Get qual 593 */ 594 #define bcf_get_qual(v) ((v)->qual) 595 596 /** 597 * Get n_flt of a bcf record 598 */ 599 #define bcf_get_n_filter(v) ((v)->d.n_flt) 600 601 /** 602 * Get ith format name 603 */ 604 #define bcf_get_format(h, v, i) (h)->id[BCF_DT_ID][(v->d.fmt)[i].id].key 605 /** 606 * Get number of samples in bcf record 607 */ 608 #define bcf_get_n_sample(v) ((v)->n_sample) 609 610 /** 611 * Set number of samples in bcf record 612 */ 613 #define bcf_set_n_sample(v, n) ((v)->n_sample = (n)) 614 615 /** 616 * Get qual 617 */ 618 #define bcf_get_qual(v) ((v)->qual) 619 620 /** 621 * Set qual 622 */ 623 #define bcf_set_qual(v, q) ((v)->qual = (q)) 624 625 /** 626 * Creates a dummy header with hs37d5 contigs for testing purposes. 627 */ 628 bcf_hdr_t* bcf_create_dummy_hdr(); 629 630 /** 631 * Creates a dummy bcf record representing the variant for testing purposes. 632 * 633 * @variant - 1:123:ACT:AC/ACCCC 634 */ 635 bcf1_t* bcf_create_dummy_record(bcf_hdr_t* h, std::string& variant); 636 637 /********** 638 *LOG UTILS 639 **********/ 640 641 /** 642 * Prints a message to STDERR and abort. 643 */ 644 void error(const char * msg, ...); 645 646 /** 647 * Gives a notice to STDERR. 648 */ 649 void notice(const char * msg, ...); 650 651 #endif 652