1 /* @include ajseqbam ********************************************************** 2 ** 3 ** AJAX BAM format sequence processing functions 4 ** 5 ** These functions control all aspects of AJAX BAM file processing 6 ** 7 ** @author Copyright (C) 2010 Peter Rice ported from samtools 8 ** @version $Revision: 1.21 $ 9 ** @modified 2010-2011 Peter Rice 10 ** @modified $Date: 2012/07/02 17:24:52 $ by $Author: rice $ 11 ** @@ 12 ** 13 ** This library is free software; you can redistribute it and/or 14 ** modify it under the terms of the GNU Lesser General Public 15 ** License as published by the Free Software Foundation; either 16 ** version 2.1 of the License, or (at your option) any later version. 17 ** 18 ** This library is distributed in the hope that it will be useful, 19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 ** Lesser General Public License for more details. 22 ** 23 ** You should have received a copy of the GNU Lesser General Public 24 ** License along with this library; if not, write to the Free Software 25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 26 ** MA 02110-1301, USA. 27 ** 28 ******************************************************************************/ 29 30 /* The MIT License 31 ** 32 ** Copyright (c) 2008 Genome Research Ltd (GRL). 33 ** 34 ** Permission is hereby granted, free of charge, to any person obtaining 35 ** a copy of this software and associated documentation files (the 36 ** "Software"), to deal in the Software without restriction, including 37 ** without limitation the rights to use, copy, modify, merge, publish, 38 ** distribute, sublicense, and/or sell copies of the Software, and to 39 ** permit persons to whom the Software is furnished to do so, subject to 40 ** the following conditions: 41 ** 42 ** The above copyright notice and this permission notice shall be 43 ** included in all copies or substantial portions of the Software. 44 ** 45 ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 46 ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 47 ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 48 ** NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 49 ** BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 50 ** ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 51 ** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 ** SOFTWARE. 53 */ 54 55 /* Contact: Heng Li <lh3@sanger.ac.uk> */ 56 57 /* 58 ** much modified for EMBOSS by Peter Rice pmr@ebi.ac.uk May 2010 59 ** lists changed to AjPList 60 ** hashes changed to AjPTable 61 ** strings changed to AjPStr 62 ** fixed-length datatypes int32_t etc. changed to EMBOSS types 63 */ 64 65 #ifndef AJSEQBAM_H 66 #define AJSEQBAM_H 67 68 /* 69 ** BAM library provides I/O and various operations on manipulating files 70 ** in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) 71 ** format. It now supports importing from or exporting to TAM, sorting, 72 ** merging, generating pileup, and quickly retrieval of reads overlapped 73 ** with a specified region. 74 ** 75 ** copyright Genome Research Ltd. 76 */ 77 78 /* ========================================================================= */ 79 /* ============================= include files ============================= */ 80 /* ========================================================================= */ 81 82 #include "ajdefine.h" 83 #include "ajlist.h" 84 #include "ajtable.h" 85 86 #include <stdlib.h> 87 #include <string.h> 88 #include <stdio.h> 89 90 #include "zlib.h" 91 92 AJ_BEGIN_DECLS 93 94 95 96 97 /* ========================================================================= */ 98 /* =============================== constants =============================== */ 99 /* ========================================================================= */ 100 101 102 103 104 #ifdef WIN32 105 #define inline __inline 106 #endif /* WIN32 */ 107 108 /* The read is paired in sequencing, no matter whether it is mapped in a pair */ 109 #define BAM_FPAIRED 1 110 111 /* The read is mapped in a proper pair */ 112 #define BAM_FPROPER_PAIR 2 113 114 /* The read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ 115 #define BAM_FUNMAP 4 116 117 /* The mate is unmapped */ 118 #define BAM_FMUNMAP 8 119 120 /* The read is mapped to the reverse strand */ 121 #define BAM_FREVERSE 16 122 123 /* The mate is mapped to the reverse strand */ 124 #define BAM_FMREVERSE 32 125 126 /* This is read1 */ 127 #define BAM_FREAD1 64 128 129 /* This is read2 */ 130 #define BAM_FREAD2 128 131 132 /* Not primary alignment */ 133 #define BAM_FSECONDARY 256 134 135 /* QC failure */ 136 #define BAM_FQCFAIL 512 137 138 /* Optical or PCR duplicate */ 139 #define BAM_FDUP 1024 140 141 142 #define BAM_OFDEC 0 143 #define BAM_OFHEX 1 144 #define BAM_OFSTR 2 145 146 147 /* Default mask for pileup */ 148 #define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) 149 150 #define BAM_CORE_SIZE sizeof(AjOSeqBamCore) 151 152 /* 153 ** Describing how CIGAR operation/length is packed in a 32-bit integer. 154 */ 155 156 #define BAM_CIGAR_SHIFT 4 157 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) 158 159 /* 160 ** CIGAR operations. 161 */ 162 163 /* CIGAR: M match or mismatch */ 164 #define BAM_CMATCH 0 165 166 /* CIGAR: I insertion to the reference */ 167 #define BAM_CINS 1 168 169 /* CIGAR: D deletion from the reference */ 170 #define BAM_CDEL 2 171 172 /* CIGAR: N skip on the reference (e.g. spliced alignment) */ 173 #define BAM_CREF_SKIP 3 174 175 /* CIGAR: S clip on the read with clipped sequence present in qseq */ 176 #define BAM_CSOFT_CLIP 4 177 178 /* CIGAR: H clip on the read with clipped sequence trimmed off */ 179 #define BAM_CHARD_CLIP 5 180 181 /* CIGAR: P padding */ 182 #define BAM_CPAD 6 183 184 /* CIGAR: = match */ 185 #define BAM_CEQUAL 7 186 187 /* CIGAR: X mismatch */ 188 #define BAM_CDIFF 8 189 190 191 extern const char* cigarcode; 192 extern const char* bam_nt16_rev_table; 193 194 195 196 197 /* ========================================================================= */ 198 /* ============================== public data ============================== */ 199 /* ========================================================================= */ 200 201 202 203 204 /* @data AjPSeqBamBgzf ******************************************************** 205 ** 206 ** BGZF file handling object 207 ** 208 ** @alias AjOSeqBamBgzf 209 ** @alias AjSSeqBamBgzf 210 ** 211 ** @attr file [FILE*] File object 212 ** @attr cache [AjPTable] Block cache 213 ** @attr uncompressed_block [void*] Uncompressed block data 214 ** @attr compressed_block [void*] Compressed block data 215 ** @attr error [const char*] Error description 216 ** @attr block_address [ajlong] Block offset 217 ** @attr file_descriptor [int] File descriptor 218 ** @attr cache_size [int] Cache size 219 ** @attr uncompressed_block_size [int] Uncompressed block size 220 ** @attr compressed_block_size [int] Compressed block size 221 ** @attr block_length [int] Block length 222 ** @attr block_offset [int] Block offset 223 ** @attr open_mode [char] Open_mode 'r' or 'w' 224 ** @attr owned_file [char] Boolean 225 ** @attr is_uncompressed [char] Boolean 226 ** @attr Padding [char[5]] Padding 227 ** 228 ******************************************************************************/ 229 230 typedef struct AjSSeqBamBgzf 231 { 232 FILE* file; 233 AjPTable cache; 234 void* uncompressed_block; 235 void* compressed_block; 236 const char* error; 237 ajlong block_address; 238 int file_descriptor; 239 int cache_size; 240 int uncompressed_block_size; 241 int compressed_block_size; 242 int block_length; 243 int block_offset; 244 char open_mode; 245 char owned_file; 246 char is_uncompressed; 247 char Padding[5]; 248 } AjOSeqBamBgzf; 249 250 #define AjPSeqBamBgzf AjOSeqBamBgzf* 251 252 253 254 255 #define BAM_VIRTUAL_OFFSET16 256 257 258 259 260 /* #abstract BAM file handler */ 261 262 /* @data AjPSeqBamHeader ****************************************************** 263 ** 264 ** BAM alignment file header data 265 ** 266 ** @attr target_name [char**] names of the reference sequences 267 ** @attr target_len [ajuint*] lengths of the reference sequences 268 ** @attr dict [AjPList] header dictionary 269 ** @attr hash [AjPTable] hash table for fast name lookup 270 ** @attr rg2lib [AjPTable] hash table for @RG-ID -> LB lookup 271 ** @attr text [char*] plain text 272 ** @attr n_targets [ajint] number of reference sequences 273 ** @attr l_text [ajint] length of the plain text in the header 274 ** 275 ** @@ 276 ** discussion Field hash points to null by default. It is a private 277 ** member. 278 ******************************************************************************/ 279 280 typedef struct AjSSeqBamheader 281 { 282 char **target_name; 283 ajuint *target_len; 284 AjPList dict; 285 AjPTable hash; 286 AjPTable rg2lib; 287 char *text; 288 ajint n_targets; 289 ajint l_text; 290 } AjOSeqBamHeader; 291 292 #define AjPSeqBamHeader AjOSeqBamHeader* 293 294 295 296 297 /* @data AjPSeqBamCore ******************************************************** 298 ** 299 ** Structure for core alignment information. 300 ** 301 ** @attr tid [ajint] read ID, defined by AjPSeqBamheader 302 ** @attr pos [ajint] 0-based leftmost coordinate 303 ** @attr bin [ajushort] bin calculated by ajSeqBamReg2bin() 304 ** @attr qual [unsigned char] mapping quality 305 ** @attr l_qname [unsigned char] length of the query name 306 ** @attr flag [ajushort] bitwise flag 307 ** @attr n_cigar [ajushort] number of CIGAR operations 308 ** @attr l_qseq [ajint] length of the query sequence (read) 309 ** @attr mtid [ajint] paired read (mate) ID 310 ** @attr mpos [ajint] paired read (mate) position 311 ** @attr isize [ajint] insert size for paired reads 312 ******************************************************************************/ 313 314 typedef struct AjSBamSeqCore 315 { 316 ajint tid; 317 ajint pos; 318 ajushort bin; 319 unsigned char qual; 320 unsigned char l_qname; 321 ajushort flag; 322 ajushort n_cigar; 323 ajint l_qseq; 324 ajint mtid; 325 ajint mpos; 326 ajint isize; 327 } AjOSeqBamCore; 328 329 #define AjPSeqBamCore AjOSeqBamCore* 330 331 332 333 334 /* @data AjPSeqBam ************************************************************ 335 ** 336 ** Structure for one alignment. 337 ** 338 ** @alias AjSSeqBam 339 ** @alias AjOSeqBam 340 ** 341 ** @attr core [AjOSeqBamCore] core information about the alignment 342 ** @attr data [unsigned char*] all variable-length data, concatenated; 343 ** structure: cigar-qname-seq-qual-aux 344 ** @attr l_aux [int] length of auxiliary data 345 ** @attr data_len [int] current length of data 346 ** @attr m_data [int] maximum reserved size of data 347 ** @attr Padding [int] Padding to alignment boundary 348 ** 349 ** @@ 350 ** discussion Notes: 351 ** 352 ** 1. qname is zero tailed and core.l_qname includes the tailing '\0'. 353 ** 354 ** 2. l_qseq is calculated from the total length of an alignment block 355 ** on reading or from CIGAR. 356 ******************************************************************************/ 357 358 typedef struct AjSSeqBam 359 { 360 AjOSeqBamCore core; 361 unsigned char *data; 362 int l_aux; 363 int data_len; 364 int m_data; 365 int Padding; 366 } AjOSeqBam; 367 368 #define AjPSeqBam AjOSeqBam* 369 370 371 372 /* ========================================================================= */ 373 /* =========================== public functions ============================ */ 374 /* ========================================================================= */ 375 376 377 378 379 #define MAJSEQBAMSTRAND(b) (((b)->core.flag&BAM_FREVERSE) != 0) 380 #define MAJSEQBAMMSTRAND(b) (((b)->core.flag&BAM_FMREVERSE) != 0) 381 382 383 384 385 /* 386 ** Get the CIGAR array 387 ** param b pointer to an alignment 388 ** return pointer to the CIGAR array 389 ** 390 ** In the CIGAR array, each element is a 32-bit integer. The 391 ** lower 4 bits gives a CIGAR operation and the higher 28 bits keep the 392 ** length of a CIGAR. 393 */ 394 #define MAJSEQBAMCIGAR(b) ((ajuint*)((b)->data + (b)->core.l_qname)) 395 396 397 /* 398 ** Get the name of the query 399 ** param b pointer to an alignment 400 ** return pointer to the name string, null terminated 401 */ 402 #define MAJSEQBAMQNAME(b) ((char*)((b)->data)) 403 404 405 /* 406 ** Get query sequence 407 ** param b pointer to an alignment 408 ** return pointer to sequence 409 ** 410 ** Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 411 ** 8 for T and 15 for N. Two bases are packed in one byte with the base 412 ** at the higher 4 bits having smaller coordinate on the read. It is 413 ** recommended to use bam1_seqi() macro to get the base. 414 */ 415 #define MAJSEQBAMSEQ(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) 416 417 418 /* 419 ** Get query quality 420 ** param b pointer to an alignment 421 ** return pointer to quality string 422 */ 423 #define MAJSEQBAMQUAL(b) ((b)->data + (b)->core.n_cigar*4 + \ 424 (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) 425 426 /* 427 ** Get a base on read 428 ** param s Query sequence returned by bam1_seq() 429 ** param i The i-th position, 0-based 430 ** return 4-bit integer representing the base. 431 */ 432 #define MAJSEQBAMSEQI(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) 433 434 /* 435 ** Get pointer to the list of auxiliary data 436 ** param b pointer to an alignment 437 ** return pointer to the concatenated auxiliary data 438 */ 439 #define MAJSEQBAMAUX(b) ((b)->data + (b)->core.n_cigar*4 + \ 440 (b)->core.l_qname + \ 441 (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) 442 443 #ifndef kroundup32 444 /* 445 ** Round an integer to the next closest power-2 integer. 446 ** param x integer to be rounded (in place) 447 ** x will be modified. 448 */ 449 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, \ 450 (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 451 #endif /* !kroundup32 */ 452 453 454 455 456 /* 457 ** Prototype definitions 458 */ 459 460 void ajSeqBamAuxAppend(AjPSeqBam b, const char tag[2], 461 char type, int len, const unsigned char* data); 462 463 AjPSeqBamBgzf ajSeqBamBgzfNew(FILE* file, const char* mode); 464 465 AjPSeqBamBgzf ajSeqBamBgzfOpenfd(int fd, const char *mode); 466 AjPSeqBamBgzf ajSeqBamBgzfOpenC(const char* path, const char *mode); 467 468 ajuint ajSeqBamCalend(const AjPSeqBamCore c, const ajuint *cigar); 469 470 int ajSeqBamBgzfClose(AjPSeqBamBgzf fp); 471 int ajSeqBamBgzfEof(AjPSeqBamBgzf fp); 472 int ajSeqBamBgzfFlush(AjPSeqBamBgzf fp); 473 int ajSeqBamBgzfRead(AjPSeqBamBgzf fp, void* data, int length); 474 ajlong ajSeqBamBgzfSeek(AjPSeqBamBgzf fp, ajlong pos, int where); 475 AjBool ajSeqBamBgzfSetInfile(AjPSeqBamBgzf gzfile, AjPFile outf); 476 AjBool ajSeqBamBgzfSetOutfile(AjPSeqBamBgzf gzfile, AjPFile outf); 477 int ajSeqBamBgzfWrite(AjPSeqBamBgzf fp, const void* data, int length); 478 479 void ajSeqBamDel(AjPSeqBam *Pbam); 480 481 AjPSeqBamHeader ajSeqBamHeaderNew(void); 482 AjPSeqBamHeader ajSeqBamHeaderNewN(ajint n); 483 AjPSeqBamHeader ajSeqBamHeaderNewTextC(const char* txt); 484 AjPSeqBamHeader ajSeqBamHeaderRead(AjPSeqBamBgzf gzfile); 485 void ajSeqBamHeaderDel(AjPSeqBamHeader *Pheader); 486 int ajSeqBamHeaderWrite(AjPSeqBamBgzf fp, const AjPSeqBamHeader header); 487 AjPTable ajSeqBamHeaderGetRefseqTags(const AjPSeqBamHeader header); 488 AjPTable ajSeqBamHeaderGetReadgroupTags(const AjPSeqBamHeader header); 489 const char* ajSeqBamHeaderGetSortorder(const AjPSeqBamHeader header); 490 void ajSeqBamHeaderSetTextC(AjPSeqBamHeader header, const char* txt); 491 492 int ajSeqBamRead(AjPSeqBamBgzf fp, AjPSeqBam b); 493 494 int ajSeqBamReg2bin(ajuint beg, ajuint end); 495 496 int ajSeqBamValidate(const AjPSeqBamHeader header, const AjPSeqBam b); 497 int ajSeqBamWrite(AjPSeqBamBgzf fp, const AjPSeqBam b); 498 499 const char *ajSeqBamGetLibrary(AjPSeqBamHeader header, const AjPSeqBam b); 500 501 AjPTable ajSeqBamHeaderLineParse(const char *headerLine); 502 503 /* 504 ** End of prototype definitions 505 */ 506 507 508 509 510 AJ_END_DECLS 511 512 #endif /* !AJSEQBAM_H */ 513