1 /* 2 Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd. 3 Author: James Bonfield <jkb@sanger.ac.uk> 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 1. Redistributions of source code must retain the above copyright notice, 9 this list of conditions and the following disclaimer. 10 11 2. Redistributions in binary form must reproduce the above copyright notice, 12 this list of conditions and the following disclaimer in the documentation 13 and/or other materials provided with the distribution. 14 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 Institute nor the names of its contributors may be used to endorse or promote 17 products derived from this software without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #ifndef HTSLIB_CRAM_STRUCTS_H 32 #define HTSLIB_CRAM_STRUCTS_H 33 34 /* 35 * Defines in-memory structs for the basic file-format objects in the 36 * CRAM format. 37 * 38 * The basic file format is: 39 * File-def SAM-hdr Container Container ... 40 * 41 * Container: 42 * Service-block data-block data-block ... 43 * 44 * Multiple blocks in a container are grouped together as slices, 45 * also sometimes referred to as landmarks in the spec. 46 */ 47 48 49 #include <pthread.h> 50 #include <stdint.h> 51 #include <sys/types.h> 52 53 #include "../htslib/thread_pool.h" 54 #include "../htslib/cram.h" 55 #include "string_alloc.h" 56 #include "mFILE.h" 57 #include "../htslib/khash.h" 58 59 #ifdef __cplusplus 60 extern "C" { 61 #endif 62 63 // Generic hash-map integer -> integer 64 KHASH_MAP_INIT_INT64(m_i2i, int) 65 66 // Generic hash-set integer -> (existence) 67 KHASH_SET_INIT_INT(s_i2i) 68 69 // For brevity 70 typedef unsigned char uc; 71 72 /* 73 * A union for the preservation map. Required for khash. 74 */ 75 typedef union { 76 int i; 77 char *p; 78 } pmap_t; 79 80 // Generates static functions here which isn't ideal, but we have no way 81 // currently to declare the kh_map_t structure here without also declaring a 82 // duplicate in the .c files due to the nature of the KHASH macros. 83 KHASH_MAP_INIT_STR(map, pmap_t) 84 85 struct hFILE; 86 87 #define SEQS_PER_SLICE 10000 88 #define BASES_PER_SLICE (SEQS_PER_SLICE*500) 89 #define SLICE_PER_CNT 1 90 91 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" 92 93 #define MAX_STAT_VAL 1024 94 //#define MAX_STAT_VAL 16 95 typedef struct cram_stats { 96 int freqs[MAX_STAT_VAL]; 97 khash_t(m_i2i) *h; 98 int nsamp; // total number of values added 99 int nvals; // total number of unique values added 100 } cram_stats; 101 102 /* NB: matches java impl, not the spec */ 103 enum cram_encoding { 104 E_NULL = 0, 105 E_EXTERNAL = 1, 106 E_GOLOMB = 2, 107 E_HUFFMAN = 3, 108 E_BYTE_ARRAY_LEN = 4, 109 E_BYTE_ARRAY_STOP = 5, 110 E_BETA = 6, 111 E_SUBEXP = 7, 112 E_GOLOMB_RICE = 8, 113 E_GAMMA = 9, 114 E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ 115 }; 116 117 enum cram_external_type { 118 E_INT = 1, 119 E_LONG = 2, 120 E_BYTE = 3, 121 E_BYTE_ARRAY = 4, 122 E_BYTE_ARRAY_BLOCK = 5, 123 }; 124 125 /* External IDs used by this implementation (only assumed during writing) */ 126 enum cram_DS_ID { 127 DS_CORE = 0, 128 DS_aux = 1, // aux_blk 129 DS_aux_OQ = 2, 130 DS_aux_BQ = 3, 131 DS_aux_BD = 4, 132 DS_aux_BI = 5, 133 DS_aux_FZ = 6, // also ZM:B 134 DS_aux_oq = 7, // other qualities 135 DS_aux_os = 8, // other sequences 136 DS_aux_oz = 9, // other strings 137 DS_ref, 138 DS_RN, // name_blk 139 DS_QS, // qual_blk 140 DS_IN, // base_blk 141 DS_SC, // soft_blk 142 143 DS_BF, // start loop 144 DS_CF, 145 DS_AP, 146 DS_RG, 147 DS_MQ, 148 DS_NS, 149 DS_MF, 150 DS_TS, 151 DS_NP, 152 DS_NF, 153 DS_RL, 154 DS_FN, 155 DS_FC, 156 DS_FP, 157 DS_DL, 158 DS_BA, 159 DS_BS, 160 DS_TL, 161 DS_RI, 162 DS_RS, 163 DS_PD, 164 DS_HC, 165 DS_BB, 166 DS_QQ, 167 168 DS_TN, // end loop 169 170 DS_RN_len, 171 DS_SC_len, 172 DS_BB_len, 173 DS_QQ_len, 174 175 DS_TC, // CRAM v1.0 tags 176 DS_TM, // test 177 DS_TV, // test 178 179 DS_END, 180 }; 181 182 /* "File Definition Structure" */ 183 struct cram_file_def { 184 char magic[4]; 185 uint8_t major_version; 186 uint8_t minor_version; 187 char file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum 188 }; 189 190 #define CRAM_MAJOR_VERS(v) ((v) >> 8) 191 #define CRAM_MINOR_VERS(v) ((v) & 0xff) 192 193 struct cram_slice; 194 195 /* Now in htslib/cram.h 196 enum cram_block_method { 197 BM_ERROR = -1, 198 RAW = 0, 199 GZIP = 1, 200 BZIP2 = 2, 201 LZMA = 3, 202 RANS = 4, // Generic; either order 203 RANS0 = 4, 204 RANS1 = 10, // Not externalised; stored as RANS (generic) 205 GZIP_RLE = 11, // NB: not externalised in CRAM 206 }; 207 */ 208 209 /* Now in htslib/cram.h 210 enum cram_content_type { 211 CT_ERROR = -1, 212 FILE_HEADER = 0, 213 COMPRESSION_HEADER = 1, 214 MAPPED_SLICE = 2, 215 UNMAPPED_SLICE = 3, // CRAM V1.0 only 216 EXTERNAL = 4, 217 CORE = 5, 218 }; 219 */ 220 221 /* Compression metrics */ 222 struct cram_metrics { 223 // number of trials and time to next trial 224 int trial; 225 int next_trial; 226 227 // aggregate sizes during trials 228 int sz_gz_rle; 229 int sz_gz_def; 230 int sz_rans0; 231 int sz_rans1; 232 int sz_bzip2; 233 int sz_lzma; 234 235 // resultant method from trials 236 int method; 237 int strat; 238 239 // Revisions of method, to allow culling of continually failing ones. 240 int gz_rle_cnt; 241 int gz_def_cnt; 242 int rans0_cnt; 243 int rans1_cnt; 244 int bzip2_cnt; 245 int lzma_cnt; 246 int revised_method; 247 248 double gz_rle_extra; 249 double gz_def_extra; 250 double rans0_extra; 251 double rans1_extra; 252 double bzip2_extra; 253 double lzma_extra; 254 }; 255 256 // Hash aux key (XX:i) to cram_metrics 257 KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) 258 259 260 /* Block */ 261 struct cram_block { 262 enum cram_block_method method, orig_method; 263 enum cram_content_type content_type; 264 int32_t content_id; 265 int32_t comp_size; 266 int32_t uncomp_size; 267 uint32_t crc32; 268 int32_t idx; /* offset into data */ 269 unsigned char *data; 270 271 // For bit I/O 272 size_t alloc; 273 size_t byte; 274 int bit; 275 276 // To aid compression 277 cram_metrics *m; // used to track aux block compression only 278 279 int crc32_checked; 280 uint32_t crc_part; 281 }; 282 283 struct cram_codec; /* defined in cram_codecs.h */ 284 struct cram_map; 285 286 #define CRAM_MAP_HASH 32 287 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) 288 289 /* Compression header block */ 290 struct cram_block_compression_hdr { 291 int32_t ref_seq_id; 292 int64_t ref_seq_start; 293 int64_t ref_seq_span; 294 int32_t num_records; 295 int32_t num_landmarks; 296 int32_t *landmark; 297 298 /* Flags from preservation map */ 299 int read_names_included; 300 int AP_delta; 301 // indexed by ref-base and subst. code 302 char substitution_matrix[5][4]; 303 int no_ref; 304 305 // TD Dictionary as a concatenated block 306 cram_block *TD_blk; // Tag Dictionary 307 int nTL; // number of TL entries in TD 308 unsigned char **TL; // array of size nTL, pointer into TD_blk. 309 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices 310 string_alloc_t *TD_keys; // Pooled keys for TD hash. 311 312 khash_t(map) *preservation_map; 313 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; 314 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; 315 316 struct cram_codec *codecs[DS_END]; 317 318 char *uncomp; // A single block of uncompressed data 319 size_t uncomp_size, uncomp_alloc; 320 }; 321 322 typedef struct cram_map { 323 int key; /* 0xe0 + 3 bytes */ 324 enum cram_encoding encoding; 325 int offset; /* Offset into a single block of memory */ 326 int size; /* Size */ 327 struct cram_codec *codec; 328 struct cram_map *next; // for noddy internal hash 329 } cram_map; 330 331 typedef struct cram_tag_map { 332 struct cram_codec *codec; 333 cram_block *blk; 334 cram_metrics *m; 335 } cram_tag_map; 336 337 // Hash aux key (XX:i) to cram_tag_map 338 KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) 339 340 /* Mapped or unmapped slice header block */ 341 struct cram_block_slice_hdr { 342 enum cram_content_type content_type; 343 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ 344 int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ 345 int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ 346 int32_t num_records; 347 int64_t record_counter; 348 int32_t num_blocks; 349 int32_t num_content_ids; 350 int32_t *block_content_ids; 351 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ 352 unsigned char md5[16]; 353 }; 354 355 struct ref_entry; 356 357 /* 358 * Container. 359 * 360 * Conceptually a container is split into slices, and slices into blocks. 361 * However on disk it's just a list of blocks and we need to query the 362 * block types to identify the start/end points of the slices. 363 * 364 * OR... are landmarks the start/end points of slices? 365 */ 366 struct cram_container { 367 int32_t length; 368 int32_t ref_seq_id; 369 int64_t ref_seq_start; 370 int64_t ref_seq_span; 371 int64_t record_counter; 372 int64_t num_bases; 373 int32_t num_records; 374 int32_t num_blocks; 375 int32_t num_landmarks; 376 int32_t *landmark; 377 378 /* Size of container header above */ 379 size_t offset; 380 381 /* Compression header is always the first block? */ 382 cram_block_compression_hdr *comp_hdr; 383 cram_block *comp_hdr_block; 384 385 /* For construction purposes */ 386 int max_slice, curr_slice; // maximum number of slices 387 int curr_slice_mt; // Curr_slice when reading ahead (via threads) 388 int max_rec, curr_rec; // current and max recs per slice 389 int max_c_rec, curr_c_rec; // current and max recs per container 390 int slice_rec; // rec no. for start of this slice 391 int curr_ref; // current ref ID. -2 for no previous 392 int64_t last_pos; // last record position 393 struct cram_slice **slices, *slice; 394 int pos_sorted; // boolean, 1=>position sorted data 395 int64_t max_apos; // maximum position, used if pos_sorted==0 396 int last_slice; // number of reads in last slice (0 for 1st) 397 int multi_seq; // true if packing multi seqs per cont/slice 398 int unsorted; // true is AP_delta is 0. 399 400 /* Copied from fd before encoding, to allow multi-threading */ 401 int ref_start, first_base, last_base, ref_id, ref_end; 402 char *ref; 403 //struct ref_entry *ref; 404 405 /* For multi-threading */ 406 bam_seq_t **bams; 407 408 /* Statistics for encoding */ 409 cram_stats *stats[DS_END]; 410 411 khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map 412 int *refs_used; // array of frequency of ref seq IDs 413 414 uint32_t crc32; // CRC32 415 416 uint64_t s_num_bases; // number of bases in this slice 417 418 uint32_t n_mapped; // Number of mapped reads 419 }; 420 421 /* 422 * A single cram record 423 */ 424 typedef struct cram_record { 425 struct cram_slice *s; // Filled out by cram_decode only 426 427 int32_t ref_id; // fixed for all recs in slice? 428 int32_t flags; // BF 429 int32_t cram_flags; // CF 430 int32_t len; // RL 431 int64_t apos; // AP 432 int32_t rg; // RG 433 int32_t name; // RN; idx to s->names_blk 434 int32_t name_len; 435 int32_t mate_line; // index to another cram_record 436 int32_t mate_ref_id; 437 int64_t mate_pos; // NP 438 int64_t tlen; // TS 439 440 // Auxiliary data 441 int32_t ntags; // TC 442 int32_t aux; // idx to s->aux_blk 443 int32_t aux_size; // total size of packed ntags in aux_blk 444 #ifndef TN_external 445 int32_t TN_idx; // TN; idx to s->TN; 446 #else 447 int32_t tn; // idx to s->tn_blk 448 #endif 449 int TL; 450 451 int32_t seq; // idx to s->seqs_blk 452 int32_t qual; // idx to s->qual_blk 453 int32_t cigar; // idx to s->cigar 454 int32_t ncigar; 455 int64_t aend; // alignment end 456 int32_t mqual; // MQ 457 458 int32_t feature; // idx to s->feature 459 int32_t nfeature; // number of features 460 int32_t mate_flags; // MF 461 } cram_record; 462 463 // Accessor macros as an analogue of the bam ones 464 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) 465 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) 466 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) 467 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) 468 #define cram_seqi(c,i) (cram_seq((c))[(i)]) 469 #define cram_name_len(c) ((c)->name_len) 470 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) 471 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) 472 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) 473 474 /* 475 * A feature is a base difference, used for the sequence reference encoding. 476 * (We generate these internally when writing CRAM.) 477 */ 478 typedef union cram_feature { 479 struct { 480 int pos; 481 int code; 482 int base; // substitution code 483 } X; 484 struct { 485 int pos; 486 int code; 487 int base; // actual base & qual 488 int qual; 489 } B; 490 struct { 491 int pos; 492 int code; 493 int seq_idx; // index to s->seqs_blk 494 int len; 495 } b; 496 struct { 497 int pos; 498 int code; 499 int qual; 500 } Q; 501 struct { 502 int pos; 503 int code; 504 int len; 505 int seq_idx; // soft-clip multiple bases 506 } S; 507 struct { 508 int pos; 509 int code; 510 int len; 511 int seq_idx; // insertion multiple bases 512 } I; 513 struct { 514 int pos; 515 int code; 516 int base; // insertion single base 517 } i; 518 struct { 519 int pos; 520 int code; 521 int len; 522 } D; 523 struct { 524 int pos; 525 int code; 526 int len; 527 } N; 528 struct { 529 int pos; 530 int code; 531 int len; 532 } P; 533 struct { 534 int pos; 535 int code; 536 int len; 537 } H; 538 } cram_feature; 539 540 /* 541 * A slice is really just a set of blocks, but it 542 * is the logical unit for decoding a number of 543 * sequences. 544 */ 545 struct cram_slice { 546 cram_block_slice_hdr *hdr; 547 cram_block *hdr_block; 548 cram_block **block; 549 cram_block **block_by_id; 550 551 /* State used during encoding/decoding */ 552 int64_t last_apos, max_apos; 553 554 /* Array of decoded cram records */ 555 cram_record *crecs; 556 557 /* An dynamically growing buffers for data pointed 558 * to by crecs[] array. 559 */ 560 uint32_t *cigar; 561 uint32_t cigar_alloc; 562 uint32_t ncigar; 563 564 cram_feature *features; 565 int nfeatures; 566 int afeatures; // allocated size of features 567 568 #ifndef TN_external 569 // TN field (Tag Name) 570 uint32_t *TN; 571 int nTN, aTN; // used and allocated size for TN[] 572 #else 573 cram_block *tn_blk; 574 int tn_id; 575 #endif 576 577 // For variable sized elements which are always external blocks. 578 cram_block *name_blk; 579 cram_block *seqs_blk; 580 cram_block *qual_blk; 581 cram_block *base_blk; 582 cram_block *soft_blk; 583 cram_block *aux_blk; // BAM aux block, created while decoding CRAM 584 585 string_alloc_t *pair_keys; // Pooled keys for pair hash. 586 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. 587 588 char *ref; // slice of current reference 589 int ref_start; // start position of current reference; 590 int ref_end; // end position of current reference; 591 int ref_id; 592 593 // For going from BAM to CRAM; an array of auxiliary blocks per type 594 int naux_block; 595 cram_block **aux_block; 596 597 unsigned int data_series; // See cram_fields enum 598 int decode_md; 599 600 int max_rec, curr_rec; // current and max recs per slice 601 int slice_num; // To be copied into c->curr_slice in decode 602 }; 603 604 /*----------------------------------------------------------------------------- 605 * Consider moving reference handling to cram_refs.[ch] 606 */ 607 // from fa.fai / samtools faidx files 608 typedef struct ref_entry { 609 char *name; 610 char *fn; 611 int64_t length; 612 int64_t offset; 613 int bases_per_line; 614 int line_length; 615 int64_t count; // for shared references so we know to dealloc seq 616 char *seq; 617 mFILE *mf; 618 int is_md5; // Reference comes from a raw seq found by MD5 619 } ref_entry; 620 621 KHASH_MAP_INIT_STR(refs, ref_entry*) 622 623 // References structure. 624 struct refs_t { 625 string_alloc_t *pool; // String pool for holding filenames and SN vals 626 627 khash_t(refs) *h_meta; // ref_entry*, index by name 628 ref_entry **ref_id; // ref_entry*, index by ID 629 int nref; // number of ref_entry 630 631 char *fn; // current file opened 632 BGZF *fp; // and the hFILE* to go with it. 633 634 int count; // how many cram_fd sharing this refs struct 635 636 pthread_mutex_t lock; // Mutex for multi-threaded updating 637 ref_entry *last; // Last queried sequence 638 int last_id; // Used in cram_ref_decr_locked to delay free 639 }; 640 641 /*----------------------------------------------------------------------------- 642 * CRAM index 643 * 644 * Detect format by number of entries per line. 645 * 5 => 1.0 (refid, start, nseq, C offset, slice) 646 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) 647 * 648 * Indices are stored in a nested containment list, which is trivial to set 649 * up as the indices are on sorted data so we're appending to the nclist 650 * in sorted order. Basically if a slice entirely fits within a previous 651 * slice then we append to that slices list. This is done recursively. 652 * 653 * Lists are sorted on two dimensions: ref id + slice coords. 654 */ 655 typedef struct cram_index { 656 int nslice, nalloc; // total number of slices 657 struct cram_index *e; // array of size nslice 658 659 int refid; // 1.0 1.1 660 int start; // 1.0 1.1 661 int end; // 1.1 662 int nseq; // 1.0 - undocumented 663 int slice; // 1.0 landmark index, 1.1 landmark value 664 int len; // 1.1 - size of slice in bytes 665 int64_t offset; // 1.0 1.1 666 int64_t next; // derived: offset of next container. 667 } cram_index; 668 669 typedef struct { 670 int refid; 671 int64_t start; 672 int64_t end; 673 } cram_range; 674 675 /*----------------------------------------------------------------------------- 676 */ 677 /* CRAM File handle */ 678 679 typedef struct spare_bams { 680 bam_seq_t **bams; 681 struct spare_bams *next; 682 } spare_bams; 683 684 struct cram_fd { 685 struct hFILE *fp; 686 int mode; // 'r' or 'w' 687 int version; 688 cram_file_def *file_def; 689 sam_hdr_t *header; 690 691 char *prefix; 692 int64_t record_counter; 693 int err; 694 695 // Most recent compression header decoded 696 //cram_block_compression_hdr *comp_hdr; 697 //cram_block_slice_hdr *slice_hdr; 698 699 // Current container being processed 700 cram_container *ctr; 701 702 // Current container used for decoder threads 703 cram_container *ctr_mt; 704 705 // positions for encoding or decoding 706 int first_base, last_base; 707 708 // cached reference portion 709 refs_t *refs; // ref meta-data structure 710 char *ref, *ref_free; // current portion held in memory 711 int ref_id; 712 int ref_start; 713 int ref_end; 714 char *ref_fn; // reference fasta filename 715 716 // compression level and metrics 717 int level; 718 cram_metrics *m[DS_END]; 719 khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use. 720 721 // options 722 int decode_md; // Whether to export MD and NM tags 723 int seqs_per_slice; 724 int bases_per_slice; 725 int slices_per_container; 726 int embed_ref; 727 int no_ref; 728 int ignore_md5; 729 int use_bz2; 730 int use_rans; 731 int use_lzma; 732 int shared_ref; 733 unsigned int required_fields; 734 int store_md; 735 int store_nm; 736 cram_range range; 737 738 // lookup tables, stored here so we can be trivially multi-threaded 739 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags 740 unsigned int cram_flag_swap[0x1000];// bam -> cram flags 741 unsigned char L1[256]; // ACGT{*} ->0123{4} 742 unsigned char L2[256]; // ACGTN{*}->01234{5} 743 char cram_sub_matrix[32][32]; // base substitution codes 744 745 int index_sz; 746 cram_index *index; // array, sizeof index_sz 747 off_t first_container; 748 off_t curr_position; 749 int eof; 750 int last_slice; // number of recs encoded in last slice 751 int last_RI_count; // number of references encoded in last container 752 int multi_seq; // -1 is auto, 0 is one ref per container, 1 is multi... 753 int multi_seq_user; // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE) 754 int unsorted; 755 int last_mapped; // number of mapped reads in last container 756 int empty_container; // Marker for EOF block 757 758 // thread pool 759 int own_pool; 760 hts_tpool *pool; 761 hts_tpool_process *rqueue; 762 pthread_mutex_t metrics_lock; 763 pthread_mutex_t ref_lock; 764 pthread_mutex_t range_lock; 765 spare_bams *bl; 766 pthread_mutex_t bam_list_lock; 767 void *job_pending; 768 int ooc; // out of containers. 769 770 int lossy_read_names; // boolean 771 int tlen_approx; // max TLEN calculation offset. 772 int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) 773 774 BGZF *idxfp; // File pointer for on-the-fly index creation 775 }; 776 777 // Translation of required fields to cram data series 778 enum cram_fields { 779 CRAM_BF = 0x00000001, 780 CRAM_AP = 0x00000002, 781 CRAM_FP = 0x00000004, 782 CRAM_RL = 0x00000008, 783 CRAM_DL = 0x00000010, 784 CRAM_NF = 0x00000020, 785 CRAM_BA = 0x00000040, 786 CRAM_QS = 0x00000080, 787 CRAM_FC = 0x00000100, 788 CRAM_FN = 0x00000200, 789 CRAM_BS = 0x00000400, 790 CRAM_IN = 0x00000800, 791 CRAM_RG = 0x00001000, 792 CRAM_MQ = 0x00002000, 793 CRAM_TL = 0x00004000, 794 CRAM_RN = 0x00008000, 795 CRAM_NS = 0x00010000, 796 CRAM_NP = 0x00020000, 797 CRAM_TS = 0x00040000, 798 CRAM_MF = 0x00080000, 799 CRAM_CF = 0x00100000, 800 CRAM_RI = 0x00200000, 801 CRAM_RS = 0x00400000, 802 CRAM_PD = 0x00800000, 803 CRAM_HC = 0x01000000, 804 CRAM_SC = 0x02000000, 805 CRAM_BB = 0x04000000, 806 CRAM_BB_len = 0x08000000, 807 CRAM_QQ = 0x10000000, 808 CRAM_QQ_len = 0x20000000, 809 CRAM_aux= 0x40000000, 810 CRAM_ALL= 0x7fffffff, 811 }; 812 813 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may 814 // encode a base difference, but we don't need to know what it is for CIGAR. 815 // If we have a soft-clip or insertion, we do need SC/IN though to know how 816 // long that array is. 817 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ 818 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) 819 820 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ 821 CRAM_RL | CRAM_AP | CRAM_BB) 822 823 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) 824 825 /* BF bitfields */ 826 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ 827 #define CRAM_FPAIRED 256 828 #define CRAM_FPROPER_PAIR 128 829 #define CRAM_FUNMAP 64 830 #define CRAM_FREVERSE 32 831 #define CRAM_FREAD1 16 832 #define CRAM_FREAD2 8 833 #define CRAM_FSECONDARY 4 834 #define CRAM_FQCFAIL 2 835 #define CRAM_FDUP 1 836 837 #define DS_aux_S "\001" 838 #define DS_aux_OQ_S "\002" 839 #define DS_aux_BQ_S "\003" 840 #define DS_aux_BD_S "\004" 841 #define DS_aux_BI_S "\005" 842 #define DS_aux_FZ_S "\006" 843 #define DS_aux_oq_S "\007" 844 #define DS_aux_os_S "\010" 845 #define DS_aux_oz_S "\011" 846 847 #define CRAM_M_REVERSE 1 848 #define CRAM_M_UNMAP 2 849 850 851 /* CF bitfields */ 852 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) 853 #define CRAM_FLAG_DETACHED (1<<1) 854 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) 855 #define CRAM_FLAG_NO_SEQ (1<<3) 856 #define CRAM_FLAG_MASK ((1<<4)-1) 857 858 /* Internal only */ 859 #define CRAM_FLAG_STATS_ADDED (1<<30) 860 #define CRAM_FLAG_DISCARD_NAME (1U<<31) 861 862 #ifdef __cplusplus 863 } 864 #endif 865 866 #endif /* HTSLIB_CRAM_STRUCTS_H */ 867