1 /* 2 Copyright (c) 2012-2013 Genome Research Ltd. 3 Author: James Bonfield <jkb@sanger.ac.uk> 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 1. Redistributions of source code must retain the above copyright notice, 9 this list of conditions and the following disclaimer. 10 11 2. Redistributions in binary form must reproduce the above copyright notice, 12 this list of conditions and the following disclaimer in the documentation 13 and/or other materials provided with the distribution. 14 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 Institute nor the names of its contributors may be used to endorse or promote 17 products derived from this software without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #ifndef _CRAM_STRUCTS_H_ 32 #define _CRAM_STRUCTS_H_ 33 34 /* 35 * Defines in-memory structs for the basic file-format objects in the 36 * CRAM format. 37 * 38 * The basic file format is: 39 * File-def SAM-hdr Container Container ... 40 * 41 * Container: 42 * Service-block data-block data-block ... 43 * 44 * Multiple blocks in a container are grouped together as slices, 45 * also sometimes referred to as landmarks in the spec. 46 */ 47 48 49 #include <pthread.h> 50 #include <stdint.h> 51 #include <sys/types.h> 52 53 #include "htslib/thread_pool.h" 54 #include "cram/string_alloc.h" 55 #include "cram/mFILE.h" 56 #include "htslib/khash.h" 57 58 #ifdef __cplusplus 59 extern "C" { 60 #endif 61 62 // Generic hash-map integer -> integer 63 KHASH_MAP_INIT_INT(m_i2i, int) 64 65 // Generic hash-set integer -> (existance) 66 KHASH_SET_INIT_INT(s_i2i) 67 68 // For brevity 69 typedef unsigned char uc; 70 71 /* 72 * A union for the preservation map. Required for khash. 73 */ 74 typedef union { 75 int i; 76 char *p; 77 } pmap_t; 78 79 // Generates static functions here which isn't ideal, but we have no way 80 // currently to declare the kh_map_t structure here without also declaring a 81 // duplicate in the .c files due to the nature of the KHASH macros. 82 KHASH_MAP_INIT_STR(map, pmap_t) 83 84 struct hFILE; 85 86 #define SEQS_PER_SLICE 10000 87 #define BASES_PER_SLICE (SEQS_PER_SLICE*500) 88 #define SLICE_PER_CNT 1 89 90 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" 91 92 #define MAX_STAT_VAL 1024 93 //#define MAX_STAT_VAL 16 94 typedef struct cram_stats { 95 int freqs[MAX_STAT_VAL]; 96 khash_t(m_i2i) *h; 97 int nsamp; // total number of values added 98 int nvals; // total number of unique values added 99 } cram_stats; 100 101 /* NB: matches java impl, not the spec */ 102 enum cram_encoding { 103 E_NULL = 0, 104 E_EXTERNAL = 1, 105 E_GOLOMB = 2, 106 E_HUFFMAN = 3, 107 E_BYTE_ARRAY_LEN = 4, 108 E_BYTE_ARRAY_STOP = 5, 109 E_BETA = 6, 110 E_SUBEXP = 7, 111 E_GOLOMB_RICE = 8, 112 E_GAMMA = 9, 113 E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ 114 }; 115 116 enum cram_external_type { 117 E_INT = 1, 118 E_LONG = 2, 119 E_BYTE = 3, 120 E_BYTE_ARRAY = 4, 121 E_BYTE_ARRAY_BLOCK = 5, 122 }; 123 124 /* External IDs used by this implementation (only assumed during writing) */ 125 enum cram_DS_ID { 126 DS_CORE = 0, 127 DS_aux = 1, // aux_blk 128 DS_aux_OQ = 2, 129 DS_aux_BQ = 3, 130 DS_aux_BD = 4, 131 DS_aux_BI = 5, 132 DS_aux_FZ = 6, // also ZM:B 133 DS_aux_oq = 7, // other qualities 134 DS_aux_os = 8, // other sequences 135 DS_aux_oz = 9, // other strings 136 DS_ref, 137 DS_RN, // name_blk 138 DS_QS, // qual_blk 139 DS_IN, // base_blk 140 DS_SC, // soft_blk 141 142 DS_BF, // start loop 143 DS_CF, 144 DS_AP, 145 DS_RG, 146 DS_MQ, 147 DS_NS, 148 DS_MF, 149 DS_TS, 150 DS_NP, 151 DS_NF, 152 DS_RL, 153 DS_FN, 154 DS_FC, 155 DS_FP, 156 DS_DL, 157 DS_BA, 158 DS_BS, 159 DS_TL, 160 DS_RI, 161 DS_RS, 162 DS_PD, 163 DS_HC, 164 DS_BB, 165 DS_QQ, 166 167 DS_TN, // end loop 168 169 DS_RN_len, 170 DS_SC_len, 171 DS_BB_len, 172 DS_QQ_len, 173 174 DS_TC, // CRAM v1.0 tags 175 DS_TM, // test 176 DS_TV, // test 177 178 DS_END, 179 }; 180 181 /* "File Definition Structure" */ 182 typedef struct cram_file_def { 183 char magic[4]; 184 uint8_t major_version; 185 uint8_t minor_version; 186 char file_id[20]; // Filename or SHA1 checksum 187 } cram_file_def; 188 189 #define CRAM_MAJOR_VERS(v) ((v) >> 8) 190 #define CRAM_MINOR_VERS(v) ((v) & 0xff) 191 192 struct cram_slice; 193 194 enum cram_block_method { 195 BM_ERROR = -1, 196 RAW = 0, 197 GZIP = 1, 198 BZIP2 = 2, 199 LZMA = 3, 200 RANS = 4, // Generic; either order 201 RANS0 = 4, 202 RANS1 = 10, // Not externalised; stored as RANS (generic) 203 GZIP_RLE = 11, // NB: not externalised in CRAM 204 }; 205 206 enum cram_content_type { 207 CT_ERROR = -1, 208 FILE_HEADER = 0, 209 COMPRESSION_HEADER = 1, 210 MAPPED_SLICE = 2, 211 UNMAPPED_SLICE = 3, // CRAM V1.0 only 212 EXTERNAL = 4, 213 CORE = 5, 214 }; 215 216 /* Compression metrics */ 217 typedef struct { 218 // number of trials and time to next trial 219 int trial; 220 int next_trial; 221 222 // aggregate sizes during trials 223 int sz_gz_rle; 224 int sz_gz_def; 225 int sz_rans0; 226 int sz_rans1; 227 int sz_bzip2; 228 int sz_lzma; 229 230 // resultant method from trials 231 int method; 232 int strat; 233 234 // Revisions of method, to allow culling of continually failing ones. 235 int gz_rle_cnt; 236 int gz_def_cnt; 237 int rans0_cnt; 238 int rans1_cnt; 239 int bzip2_cnt; 240 int lzma_cnt; 241 int revised_method; 242 243 double gz_rle_extra; 244 double gz_def_extra; 245 double rans0_extra; 246 double rans1_extra; 247 double bzip2_extra; 248 double lzma_extra; 249 } cram_metrics; 250 251 // Hash aux key (XX:i) to cram_metrics 252 KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) 253 254 255 /* Block */ 256 typedef struct cram_block { 257 enum cram_block_method method, orig_method; 258 enum cram_content_type content_type; 259 int32_t content_id; 260 int32_t comp_size; 261 int32_t uncomp_size; 262 uint32_t crc32; 263 int32_t idx; /* offset into data */ 264 unsigned char *data; 265 266 // For bit I/O 267 size_t alloc; 268 size_t byte; 269 int bit; 270 271 // To aid compression 272 cram_metrics *m; // used to track aux block compression only 273 } cram_block; 274 275 struct cram_codec; /* defined in cram_codecs.h */ 276 struct cram_map; 277 278 #define CRAM_MAP_HASH 32 279 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) 280 281 /* Compression header block */ 282 typedef struct cram_block_compression_hdr { 283 int32_t ref_seq_id; 284 int32_t ref_seq_start; 285 int32_t ref_seq_span; 286 int32_t num_records; 287 int32_t num_landmarks; 288 int32_t *landmark; 289 290 /* Flags from preservation map */ 291 int mapped_qs_included; 292 int unmapped_qs_included; 293 int unmapped_placed; 294 int qs_included; 295 int read_names_included; 296 int AP_delta; 297 // indexed by ref-base and subst. code 298 char substitution_matrix[5][4]; 299 300 // TD Dictionary as a concatenated block 301 cram_block *TD_blk; // Tag Dictionary 302 int nTL; // number of TL entries in TD 303 unsigned char **TL; // array of size nTL, pointer into TD_blk. 304 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices 305 string_alloc_t *TD_keys; // Pooled keys for TD hash. 306 307 khash_t(map) *preservation_map; 308 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; 309 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; 310 311 struct cram_codec *codecs[DS_END]; 312 313 char *uncomp; // A single block of uncompressed data 314 size_t uncomp_size, uncomp_alloc; 315 316 unsigned int data_series; // See cram_fields enum below 317 } cram_block_compression_hdr; 318 319 typedef struct cram_map { 320 int key; /* 0xe0 + 3 bytes */ 321 enum cram_encoding encoding; 322 int offset; /* Offset into a single block of memory */ 323 int size; /* Size */ 324 struct cram_codec *codec; 325 struct cram_map *next; // for noddy internal hash 326 } cram_map; 327 328 typedef struct cram_tag_map { 329 struct cram_codec *codec; 330 cram_block *blk; 331 cram_metrics *m; 332 } cram_tag_map; 333 334 // Hash aux key (XX:i) to cram_tag_map 335 KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) 336 337 /* Mapped or unmapped slice header block */ 338 typedef struct cram_block_slice_hdr { 339 enum cram_content_type content_type; 340 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ 341 int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ 342 int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ 343 int32_t num_records; 344 int64_t record_counter; 345 int32_t num_blocks; 346 int32_t num_content_ids; 347 int32_t *block_content_ids; 348 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ 349 unsigned char md5[16]; 350 } cram_block_slice_hdr; 351 352 struct ref_entry; 353 354 /* 355 * Container. 356 * 357 * Conceptually a container is split into slices, and slices into blocks. 358 * However on disk it's just a list of blocks and we need to query the 359 * block types to identify the start/end points of the slices. 360 * 361 * OR... are landmarks the start/end points of slices? 362 */ 363 typedef struct cram_container { 364 int32_t length; 365 int32_t ref_seq_id; 366 int32_t ref_seq_start; 367 int32_t ref_seq_span; 368 int64_t record_counter; 369 int64_t num_bases; 370 int32_t num_records; 371 int32_t num_blocks; 372 int32_t num_landmarks; 373 int32_t *landmark; 374 375 /* Size of container header above */ 376 size_t offset; 377 378 /* Compression header is always the first block? */ 379 cram_block_compression_hdr *comp_hdr; 380 cram_block *comp_hdr_block; 381 382 /* For construction purposes */ 383 int max_slice, curr_slice; // maximum number of slices 384 int max_rec, curr_rec; // current and max recs per slice 385 int max_c_rec, curr_c_rec; // current and max recs per container 386 int slice_rec; // rec no. for start of this slice 387 int curr_ref; // current ref ID. -2 for no previous 388 int last_pos; // last record position 389 struct cram_slice **slices, *slice; 390 int pos_sorted; // boolean, 1=>position sorted data 391 int max_apos; // maximum position, used if pos_sorted==0 392 int last_slice; // number of reads in last slice (0 for 1st) 393 int multi_seq; // true if packing multi seqs per cont/slice 394 int unsorted; // true is AP_delta is 0. 395 396 /* Copied from fd before encoding, to allow multi-threading */ 397 int ref_start, first_base, last_base, ref_id, ref_end; 398 char *ref; 399 //struct ref_entry *ref; 400 401 /* For multi-threading */ 402 bam_seq_t **bams; 403 404 /* Statistics for encoding */ 405 cram_stats *stats[DS_END]; 406 407 khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map 408 int *refs_used; // array of frequency of ref seq IDs 409 410 uint32_t crc32; // CRC32 411 412 uint64_t s_num_bases; // number of bases in this slice 413 } cram_container; 414 415 /* 416 * A single cram record 417 */ 418 typedef struct cram_record { 419 struct cram_slice *s; // Filled out by cram_decode only 420 421 int32_t ref_id; // fixed for all recs in slice? 422 int32_t flags; // BF 423 int32_t cram_flags; // CF 424 int32_t len; // RL 425 int32_t apos; // AP 426 int32_t rg; // RG 427 int32_t name; // RN; idx to s->names_blk 428 int32_t name_len; 429 int32_t mate_line; // index to another cram_record 430 int32_t mate_ref_id; 431 int32_t mate_pos; // NP 432 int32_t tlen; // TS 433 434 // Auxiliary data 435 int32_t ntags; // TC 436 int32_t aux; // idx to s->aux_blk 437 int32_t aux_size; // total size of packed ntags in aux_blk 438 #ifndef TN_external 439 int32_t TN_idx; // TN; idx to s->TN; 440 #else 441 int32_t tn; // idx to s->tn_blk 442 #endif 443 int TL; 444 445 int32_t seq; // idx to s->seqs_blk 446 int32_t qual; // idx to s->qual_blk 447 int32_t cigar; // idx to s->cigar 448 int32_t ncigar; 449 int32_t aend; // alignment end 450 int32_t mqual; // MQ 451 452 int32_t feature; // idx to s->feature 453 int32_t nfeature; // number of features 454 int32_t mate_flags; // MF 455 } cram_record; 456 457 // Accessor macros as an analogue of the bam ones 458 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) 459 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) 460 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) 461 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) 462 #define cram_seqi(c,i) (cram_seq((c))[(i)]) 463 #define cram_name_len(c) ((c)->name_len) 464 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) 465 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) 466 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) 467 468 /* 469 * A feature is a base difference, used for the sequence reference encoding. 470 * (We generate these internally when writing CRAM.) 471 */ 472 typedef struct cram_feature { 473 union { 474 struct { 475 int pos; 476 int code; 477 int base; // substitution code 478 } X; 479 struct { 480 int pos; 481 int code; 482 int base; // actual base & qual 483 int qual; 484 } B; 485 struct { 486 int pos; 487 int code; 488 int seq_idx; // index to s->seqs_blk 489 int len; 490 } b; 491 struct { 492 int pos; 493 int code; 494 int qual; 495 } Q; 496 struct { 497 int pos; 498 int code; 499 int len; 500 int seq_idx; // soft-clip multiple bases 501 } S; 502 struct { 503 int pos; 504 int code; 505 int len; 506 int seq_idx; // insertion multiple bases 507 } I; 508 struct { 509 int pos; 510 int code; 511 int base; // insertion single base 512 } i; 513 struct { 514 int pos; 515 int code; 516 int len; 517 } D; 518 struct { 519 int pos; 520 int code; 521 int len; 522 } N; 523 struct { 524 int pos; 525 int code; 526 int len; 527 } P; 528 struct { 529 int pos; 530 int code; 531 int len; 532 } H; 533 }; 534 } cram_feature; 535 536 /* 537 * A slice is really just a set of blocks, but it 538 * is the logical unit for decoding a number of 539 * sequences. 540 */ 541 typedef struct cram_slice { 542 cram_block_slice_hdr *hdr; 543 cram_block *hdr_block; 544 cram_block **block; 545 cram_block **block_by_id; 546 547 /* State used during encoding/decoding */ 548 int last_apos, max_apos; 549 550 /* Array of decoded cram records */ 551 cram_record *crecs; 552 553 /* An dynamically growing buffers for data pointed 554 * to by crecs[] array. 555 */ 556 uint32_t *cigar; 557 uint32_t cigar_alloc; 558 uint32_t ncigar; 559 560 cram_feature *features; 561 int nfeatures; 562 int afeatures; // allocated size of features 563 564 #ifndef TN_external 565 // TN field (Tag Name) 566 uint32_t *TN; 567 int nTN, aTN; // used and allocated size for TN[] 568 #else 569 cram_block *tn_blk; 570 int tn_id; 571 #endif 572 573 // For variable sized elements which are always external blocks. 574 cram_block *name_blk; 575 cram_block *seqs_blk; 576 cram_block *qual_blk; 577 cram_block *base_blk; 578 cram_block *soft_blk; 579 cram_block *aux_blk; // BAM aux block, created while decoding CRAM 580 581 string_alloc_t *pair_keys; // Pooled keys for pair hash. 582 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. 583 584 char *ref; // slice of current reference 585 int ref_start; // start position of current reference; 586 int ref_end; // end position of current reference; 587 int ref_id; 588 589 // For going from BAM to CRAM; an array of auxiliary blocks per type 590 int naux_block; 591 cram_block **aux_block; 592 } cram_slice; 593 594 /*----------------------------------------------------------------------------- 595 * Consider moving reference handling to cram_refs.[ch] 596 */ 597 // from fa.fai / samtools faidx files 598 typedef struct ref_entry { 599 char *name; 600 char *fn; 601 int64_t length; 602 int64_t offset; 603 int bases_per_line; 604 int line_length; 605 int64_t count; // for shared references so we know to dealloc seq 606 char *seq; 607 mFILE *mf; 608 int is_md5; // Reference comes from a raw seq found by MD5 609 } ref_entry; 610 611 KHASH_MAP_INIT_STR(refs, ref_entry*) 612 613 // References structure. 614 typedef struct { 615 string_alloc_t *pool; // String pool for holding filenames and SN vals 616 617 khash_t(refs) *h_meta; // ref_entry*, index by name 618 ref_entry **ref_id; // ref_entry*, index by ID 619 int nref; // number of ref_entry 620 621 char *fn; // current file opened 622 BGZF *fp; // and the hFILE* to go with it. 623 624 int count; // how many cram_fd sharing this refs struct 625 626 pthread_mutex_t lock; // Mutex for multi-threaded updating 627 ref_entry *last; // Last queried sequence 628 int last_id; // Used in cram_ref_decr_locked to delay free 629 } refs_t; 630 631 /*----------------------------------------------------------------------------- 632 * CRAM index 633 * 634 * Detect format by number of entries per line. 635 * 5 => 1.0 (refid, start, nseq, C offset, slice) 636 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) 637 * 638 * Indices are stored in a nested containment list, which is trivial to set 639 * up as the indices are on sorted data so we're appending to the nclist 640 * in sorted order. Basically if a slice entirely fits within a previous 641 * slice then we append to that slices list. This is done recursively. 642 * 643 * Lists are sorted on two dimensions: ref id + slice coords. 644 */ 645 typedef struct cram_index { 646 int nslice, nalloc; // total number of slices 647 struct cram_index *e; // array of size nslice 648 649 int refid; // 1.0 1.1 650 int start; // 1.0 1.1 651 int end; // 1.1 652 int nseq; // 1.0 - undocumented 653 int slice; // 1.0 landmark index, 1.1 landmark value 654 int len; // 1.1 - size of slice in bytes 655 int64_t offset; // 1.0 1.1 656 } cram_index; 657 658 typedef struct { 659 int refid; 660 int start; 661 int end; 662 } cram_range; 663 664 /*----------------------------------------------------------------------------- 665 */ 666 /* CRAM File handle */ 667 668 typedef struct spare_bams { 669 bam_seq_t **bams; 670 struct spare_bams *next; 671 } spare_bams; 672 673 typedef struct cram_fd { 674 struct hFILE *fp; 675 int mode; // 'r' or 'w' 676 int version; 677 cram_file_def *file_def; 678 SAM_hdr *header; 679 680 char *prefix; 681 int64_t record_counter; 682 int err; 683 684 // Most recent compression header decoded 685 //cram_block_compression_hdr *comp_hdr; 686 //cram_block_slice_hdr *slice_hdr; 687 688 // Current container being processed. 689 cram_container *ctr; 690 691 // positions for encoding or decoding 692 int first_base, last_base; 693 694 // cached reference portion 695 refs_t *refs; // ref meta-data structure 696 char *ref, *ref_free; // current portion held in memory 697 int ref_id; 698 int ref_start; 699 int ref_end; 700 char *ref_fn; // reference fasta filename 701 702 // compression level and metrics 703 int level; 704 cram_metrics *m[DS_END]; 705 khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use. 706 707 // options 708 int decode_md; // Whether to export MD and NM tags 709 int seqs_per_slice; 710 int bases_per_slice; 711 int slices_per_container; 712 int embed_ref; 713 int no_ref; 714 int ignore_md5; 715 int use_bz2; 716 int use_rans; 717 int use_lzma; 718 int shared_ref; 719 unsigned int required_fields; 720 cram_range range; 721 722 // lookup tables, stored here so we can be trivially multi-threaded 723 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags 724 unsigned int cram_flag_swap[0x1000];// bam -> cram flags 725 unsigned char L1[256]; // ACGT{*} ->0123{4} 726 unsigned char L2[256]; // ACGTN{*}->01234{5} 727 char cram_sub_matrix[32][32]; // base substituion codes 728 729 int index_sz; 730 cram_index *index; // array, sizeof index_sz 731 off_t first_container; 732 int eof; 733 int last_slice; // number of recs encoded in last slice 734 int multi_seq; 735 int unsorted; 736 int empty_container; // Marker for EOF block 737 738 // thread pool 739 int own_pool; 740 hts_tpool *pool; 741 hts_tpool_process *rqueue; 742 pthread_mutex_t metrics_lock; 743 pthread_mutex_t ref_lock; 744 spare_bams *bl; 745 pthread_mutex_t bam_list_lock; 746 void *job_pending; 747 int ooc; // out of containers. 748 749 int lossy_read_names; // boolean 750 int tlen_approx; // max TLEN calculation offset. 751 int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) 752 } cram_fd; 753 754 // Translation of required fields to cram data series 755 enum cram_fields { 756 CRAM_BF = 0x00000001, 757 CRAM_AP = 0x00000002, 758 CRAM_FP = 0x00000004, 759 CRAM_RL = 0x00000008, 760 CRAM_DL = 0x00000010, 761 CRAM_NF = 0x00000020, 762 CRAM_BA = 0x00000040, 763 CRAM_QS = 0x00000080, 764 CRAM_FC = 0x00000100, 765 CRAM_FN = 0x00000200, 766 CRAM_BS = 0x00000400, 767 CRAM_IN = 0x00000800, 768 CRAM_RG = 0x00001000, 769 CRAM_MQ = 0x00002000, 770 CRAM_TL = 0x00004000, 771 CRAM_RN = 0x00008000, 772 CRAM_NS = 0x00010000, 773 CRAM_NP = 0x00020000, 774 CRAM_TS = 0x00040000, 775 CRAM_MF = 0x00080000, 776 CRAM_CF = 0x00100000, 777 CRAM_RI = 0x00200000, 778 CRAM_RS = 0x00400000, 779 CRAM_PD = 0x00800000, 780 CRAM_HC = 0x01000000, 781 CRAM_SC = 0x02000000, 782 CRAM_BB = 0x04000000, 783 CRAM_BB_len = 0x08000000, 784 CRAM_QQ = 0x10000000, 785 CRAM_QQ_len = 0x20000000, 786 CRAM_aux= 0x40000000, 787 CRAM_ALL= 0x7fffffff, 788 }; 789 790 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may 791 // encode a base difference, but we don't need to know what it is for CIGAR. 792 // If we have a soft-clip or insertion, we do need SC/IN though to know how 793 // long that array is. 794 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ 795 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) 796 797 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ 798 CRAM_RL | CRAM_AP | CRAM_BB) 799 800 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) 801 802 /* BF bitfields */ 803 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ 804 #define CRAM_FPAIRED 256 805 #define CRAM_FPROPER_PAIR 128 806 #define CRAM_FUNMAP 64 807 #define CRAM_FREVERSE 32 808 #define CRAM_FREAD1 16 809 #define CRAM_FREAD2 8 810 #define CRAM_FSECONDARY 4 811 #define CRAM_FQCFAIL 2 812 #define CRAM_FDUP 1 813 814 #define DS_aux_S "\001" 815 #define DS_aux_OQ_S "\002" 816 #define DS_aux_BQ_S "\003" 817 #define DS_aux_BD_S "\004" 818 #define DS_aux_BI_S "\005" 819 #define DS_aux_FZ_S "\006" 820 #define DS_aux_oq_S "\007" 821 #define DS_aux_os_S "\010" 822 #define DS_aux_oz_S "\011" 823 824 #define CRAM_M_REVERSE 1 825 #define CRAM_M_UNMAP 2 826 827 828 /* CF bitfields */ 829 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) 830 #define CRAM_FLAG_DETACHED (1<<1) 831 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) 832 #define CRAM_FLAG_NO_SEQ (1<<3) 833 #define CRAM_FLAG_MASK ((1<<4)-1) 834 835 /* Internal only */ 836 #define CRAM_FLAG_STATS_ADDED (1<<30) 837 #define CRAM_FLAG_DISCARD_NAME (1<<31) 838 839 #ifdef __cplusplus 840 } 841 #endif 842 843 #endif /* _CRAM_STRUCTS_H_ */ 844