1 /* 2 Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd. 3 Author: James Bonfield <jkb@sanger.ac.uk> 4 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions are met: 7 8 1. Redistributions of source code must retain the above copyright notice, 9 this list of conditions and the following disclaimer. 10 11 2. Redistributions in binary form must reproduce the above copyright notice, 12 this list of conditions and the following disclaimer in the documentation 13 and/or other materials provided with the distribution. 14 15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 16 Institute nor the names of its contributors may be used to endorse or promote 17 products derived from this software without specific prior written permission. 18 19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND 20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE 23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 #ifndef HTSLIB_CRAM_STRUCTS_H 32 #define HTSLIB_CRAM_STRUCTS_H 33 34 /* 35 * Defines in-memory structs for the basic file-format objects in the 36 * CRAM format. 37 * 38 * The basic file format is: 39 * File-def SAM-hdr Container Container ... 40 * 41 * Container: 42 * Service-block data-block data-block ... 43 * 44 * Multiple blocks in a container are grouped together as slices, 45 * also sometimes referred to as landmarks in the spec. 46 */ 47 48 49 #include <pthread.h> 50 #include <stdint.h> 51 #include <sys/types.h> 52 53 #include "../htslib/thread_pool.h" 54 #include "../htslib/cram.h" 55 #include "string_alloc.h" 56 #include "mFILE.h" 57 #include "../htslib/khash.h" 58 59 #ifdef __cplusplus 60 extern "C" { 61 #endif 62 63 // Generic hash-map integer -> integer 64 KHASH_MAP_INIT_INT64(m_i2i, int) 65 66 // Generic hash-set integer -> (existence) 67 KHASH_SET_INIT_INT(s_i2i) 68 69 // For brevity 70 typedef unsigned char uc; 71 72 /* 73 * A union for the preservation map. Required for khash. 74 */ 75 typedef union { 76 int i; 77 char *p; 78 } pmap_t; 79 80 // Generates static functions here which isn't ideal, but we have no way 81 // currently to declare the kh_map_t structure here without also declaring a 82 // duplicate in the .c files due to the nature of the KHASH macros. 83 KHASH_MAP_INIT_STR(map, pmap_t) 84 85 struct hFILE; 86 87 #define SEQS_PER_SLICE 10000 88 #define BASES_PER_SLICE (SEQS_PER_SLICE*500) 89 #define SLICE_PER_CNT 1 90 91 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" 92 93 #define MAX_STAT_VAL 1024 94 //#define MAX_STAT_VAL 16 95 typedef struct cram_stats { 96 int freqs[MAX_STAT_VAL]; 97 khash_t(m_i2i) *h; 98 int nsamp; // total number of values added 99 int nvals; // total number of unique values added 100 int64_t min_val, max_val; 101 } cram_stats; 102 103 /* NB: matches java impl, not the spec */ 104 enum cram_encoding { 105 E_NULL = 0, 106 E_EXTERNAL = 1, // Only for BYTE type in CRAM 4 107 E_GOLOMB = 2, // Not in CRAM 4 108 E_HUFFMAN = 3, // Not in CRAM 4 109 E_BYTE_ARRAY_LEN = 4, 110 E_BYTE_ARRAY_STOP = 5, 111 E_BETA = 6, // Not in CRAM 4 112 E_SUBEXP = 7, // Not in CRAM 4 113 E_GOLOMB_RICE = 8, // Not in CRAM 4 114 E_GAMMA = 9, // Not in CRAM 4 115 116 // CRAM 4 specific codecs 117 E_VARINT_UNSIGNED = 41, // Specialisation of EXTERNAL 118 E_VARINT_SIGNED = 42, // Specialisation of EXTERNAL 119 E_CONST_BYTE = 43, // Alternative to HUFFMAN with 1 symbol 120 E_CONST_INT = 44, // Alternative to HUFFMAN with 1 symbol 121 122 // More experimental ideas, not documented in spec yet 123 E_XHUFFMAN = 50, // To external block 124 E_XPACK = 51, // Transform to sub-codec 125 E_XRLE = 52, // Transform to sub-codec 126 E_XDELTA = 53, // Transform to sub-codec 127 128 // Total number of codecs, not a real one. 129 E_NUM_CODECS, 130 }; 131 132 enum cram_external_type { 133 E_INT = 1, 134 E_LONG = 2, 135 E_BYTE = 3, 136 E_BYTE_ARRAY = 4, 137 E_BYTE_ARRAY_BLOCK = 5, 138 E_SINT = 6, // signed INT 139 E_SLONG = 7, // signed LONG 140 }; 141 142 /* External IDs used by this implementation (only assumed during writing) */ 143 enum cram_DS_ID { 144 DS_CORE = 0, 145 DS_aux = 1, // aux_blk 146 DS_aux_OQ = 2, 147 DS_aux_BQ = 3, 148 DS_aux_BD = 4, 149 DS_aux_BI = 5, 150 DS_aux_FZ = 6, // also ZM:B 151 DS_aux_oq = 7, // other qualities 152 DS_aux_os = 8, // other sequences 153 DS_aux_oz = 9, // other strings 154 DS_ref, 155 DS_RN, // name_blk 156 DS_QS, // qual_blk 157 DS_IN, // base_blk 158 DS_SC, // soft_blk 159 160 DS_BF, // start loop 161 DS_CF, 162 DS_AP, 163 DS_RG, 164 DS_MQ, 165 DS_NS, 166 DS_MF, 167 DS_TS, 168 DS_NP, 169 DS_NF, 170 DS_RL, 171 DS_FN, 172 DS_FC, 173 DS_FP, 174 DS_DL, 175 DS_BA, 176 DS_BS, 177 DS_TL, 178 DS_RI, 179 DS_RS, 180 DS_PD, 181 DS_HC, 182 DS_BB, 183 DS_QQ, 184 185 DS_TN, // end loop 186 187 DS_RN_len, 188 DS_SC_len, 189 DS_BB_len, 190 DS_QQ_len, 191 192 DS_TC, // CRAM v1.0 tags 193 DS_TM, // test 194 DS_TV, // test 195 196 DS_END, 197 }; 198 199 /* "File Definition Structure" */ 200 struct cram_file_def { 201 char magic[4]; 202 uint8_t major_version; 203 uint8_t minor_version; 204 char file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum 205 }; 206 207 #define CRAM_MAJOR_VERS(v) ((v) >> 8) 208 #define CRAM_MINOR_VERS(v) ((v) & 0xff) 209 210 struct cram_slice; 211 212 // Internal version of htslib/cram.h enum. 213 // Note these have to match the laout of methmap and methcost in 214 // cram_io.c:cram_compress_block2 215 enum cram_block_method_int { 216 // Public methods as defined in the CRAM spec. 217 BM_ERROR = -1, 218 219 // CRAM 2.x and 3.0 220 RAW = 0, 221 GZIP = 1, 222 BZIP2 = 2, 223 LZMA = 3, 224 RANS = 4, RANS0 = RANS, 225 226 // CRAM 3.1 onwards 227 RANSPR = 5, RANS_PR0 = RANSPR, 228 ARITH = 6, ARITH_PR0 = ARITH, 229 FQZ = 7, 230 TOK3 = 8, 231 // BSC = 9, ZSTD = 10 232 233 // Methods not externalised, but used in metrics. 234 // Externally they become one of the above methods. 235 GZIP_RLE = 11, 236 GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM 237 238 FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods 239 240 //RANS0, // Order 0 241 RANS1, 242 243 //RANS_PR0, // Order 0 244 RANS_PR1, // Order 1 245 RANS_PR64, // O0 + RLE 246 RANS_PR9, // O1 + X4 247 RANS_PR128, // O0 + Pack 248 RANS_PR129, // O1 + Pack 249 RANS_PR192, // O0 + RLE + pack 250 RANS_PR193, // O1 + RLE + pack 251 252 //TOK3, // tok+rans 253 TOKA, // tok+arith 254 255 //ARITH_PR0, // Order 0 256 ARITH_PR1, // Order 1 257 ARITH_PR64, // O0 + RLE 258 ARITH_PR9, // O1 + X4 259 ARITH_PR128, // O0 + Pack 260 ARITH_PR129, // O1 + Pack 261 ARITH_PR192, // O0 + RLE + pack 262 ARITH_PR193, // O1 + RLE + pack 263 264 // NB: must end on no more than 31 unless we change to a 265 // 64-bit method type. 266 }; 267 268 /* Now in htslib/cram.h 269 enum cram_content_type { 270 CT_ERROR = -1, 271 FILE_HEADER = 0, 272 COMPRESSION_HEADER = 1, 273 MAPPED_SLICE = 2, 274 UNMAPPED_SLICE = 3, // CRAM V1.0 only 275 EXTERNAL = 4, 276 CORE = 5, 277 }; 278 */ 279 280 /* Maximum simultaneous codecs allowed, 1 per bit */ 281 #define CRAM_MAX_METHOD 32 282 283 /* Compression metrics */ 284 struct cram_metrics { 285 // number of trials and time to next trial 286 int trial; 287 int next_trial; 288 int consistency; 289 290 // aggregate sizes during trials 291 int sz[CRAM_MAX_METHOD]; 292 int input_avg_sz, input_avg_delta; 293 294 // resultant method from trials 295 int method, revised_method; 296 int strat; 297 298 // Revisions of method, to allow culling of continually failing ones. 299 int cnt[CRAM_MAX_METHOD]; 300 301 double extra[CRAM_MAX_METHOD]; 302 303 // Not amenable to rANS bit-packing techniques; cardinality > 16 304 int unpackable; 305 }; 306 307 // Hash aux key (XX:i) to cram_metrics 308 KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) 309 310 311 /* Block */ 312 struct cram_block { 313 enum cram_block_method_int method, orig_method; 314 enum cram_content_type content_type; 315 int32_t content_id; 316 int32_t comp_size; 317 int32_t uncomp_size; 318 uint32_t crc32; 319 int32_t idx; /* offset into data */ 320 unsigned char *data; 321 322 // For bit I/O 323 size_t alloc; 324 size_t byte; 325 int bit; 326 327 // To aid compression 328 cram_metrics *m; // used to track aux block compression only 329 330 int crc32_checked; 331 uint32_t crc_part; 332 }; 333 334 struct cram_codec; /* defined in cram_codecs.h */ 335 struct cram_map; 336 337 #define CRAM_MAP_HASH 32 338 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) 339 340 /* Compression header block */ 341 struct cram_block_compression_hdr { 342 int32_t ref_seq_id; 343 int64_t ref_seq_start; 344 int64_t ref_seq_span; 345 int32_t num_records; 346 int32_t num_landmarks; 347 int32_t *landmark; 348 349 /* Flags from preservation map */ 350 int read_names_included; 351 int AP_delta; 352 // indexed by ref-base and subst. code 353 char substitution_matrix[5][4]; 354 int no_ref; 355 int qs_seq_orient; // 1 => same as seq. 0 => original orientation 356 357 // TD Dictionary as a concatenated block 358 cram_block *TD_blk; // Tag Dictionary 359 int nTL; // number of TL entries in TD 360 unsigned char **TL; // array of size nTL, pointer into TD_blk. 361 khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices 362 string_alloc_t *TD_keys; // Pooled keys for TD hash. 363 364 khash_t(map) *preservation_map; 365 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; 366 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; 367 368 struct cram_codec *codecs[DS_END]; 369 370 char *uncomp; // A single block of uncompressed data 371 size_t uncomp_size, uncomp_alloc; 372 373 // Total codec count, used for index to block_by_id for transforms 374 int ncodecs; 375 }; 376 377 typedef struct cram_map { 378 int key; /* 0xe0 + 3 bytes */ 379 enum cram_encoding encoding; 380 int offset; /* Offset into a single block of memory */ 381 int size; /* Size */ 382 struct cram_codec *codec; 383 struct cram_map *next; // for noddy internal hash 384 } cram_map; 385 386 typedef struct cram_tag_map { 387 struct cram_codec *codec; 388 cram_block *blk; 389 cram_block *blk2; 390 cram_metrics *m; 391 } cram_tag_map; 392 393 // Hash aux key (XX:i) to cram_tag_map 394 KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) 395 396 /* Mapped or unmapped slice header block */ 397 struct cram_block_slice_hdr { 398 enum cram_content_type content_type; 399 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ 400 int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ 401 int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ 402 int32_t num_records; 403 int64_t record_counter; 404 int32_t num_blocks; 405 int32_t num_content_ids; 406 int32_t *block_content_ids; 407 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ 408 unsigned char md5[16]; 409 }; 410 411 struct ref_entry; 412 413 /* 414 * Container. 415 * 416 * Conceptually a container is split into slices, and slices into blocks. 417 * However on disk it's just a list of blocks and we need to query the 418 * block types to identify the start/end points of the slices. 419 * 420 * OR... are landmarks the start/end points of slices? 421 */ 422 struct cram_container { 423 int32_t length; 424 int32_t ref_seq_id; 425 int64_t ref_seq_start; 426 int64_t ref_seq_span; 427 int64_t record_counter; 428 int64_t num_bases; 429 int32_t num_records; 430 int32_t num_blocks; 431 int32_t num_landmarks; 432 int32_t *landmark; 433 434 /* Size of container header above */ 435 size_t offset; 436 437 /* Compression header is always the first block? */ 438 cram_block_compression_hdr *comp_hdr; 439 cram_block *comp_hdr_block; 440 441 /* For construction purposes */ 442 int max_slice, curr_slice; // maximum number of slices 443 int curr_slice_mt; // Curr_slice when reading ahead (via threads) 444 int max_rec, curr_rec; // current and max recs per slice 445 int max_c_rec, curr_c_rec; // current and max recs per container 446 int slice_rec; // rec no. for start of this slice 447 int curr_ref; // current ref ID. -2 for no previous 448 int64_t last_pos; // last record position 449 struct cram_slice **slices, *slice; 450 int pos_sorted; // boolean, 1=>position sorted data 451 int64_t max_apos; // maximum position, used if pos_sorted==0 452 int last_slice; // number of reads in last slice (0 for 1st) 453 int multi_seq; // true if packing multi seqs per cont/slice 454 int unsorted; // true is AP_delta is 0. 455 int qs_seq_orient; // 1 => same as seq. 0 => original orientation 456 457 /* Copied from fd before encoding, to allow multi-threading */ 458 int ref_start, first_base, last_base, ref_id, ref_end; 459 char *ref; 460 //struct ref_entry *ref; 461 462 /* For multi-threading */ 463 bam_seq_t **bams; 464 465 /* Statistics for encoding */ 466 cram_stats *stats[DS_END]; 467 468 khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map 469 int *refs_used; // array of frequency of ref seq IDs 470 471 uint32_t crc32; // CRC32 472 473 uint64_t s_num_bases; // number of bases in this slice 474 475 uint32_t n_mapped; // Number of mapped reads 476 }; 477 478 /* 479 * A single cram record 480 */ 481 typedef struct cram_record { 482 struct cram_slice *s; // Filled out by cram_decode only 483 484 int32_t ref_id; // fixed for all recs in slice? 485 int32_t flags; // BF 486 int32_t cram_flags; // CF 487 int32_t len; // RL 488 int64_t apos; // AP 489 int32_t rg; // RG 490 int32_t name; // RN; idx to s->names_blk 491 int32_t name_len; 492 int32_t mate_line; // index to another cram_record 493 int32_t mate_ref_id; 494 int64_t mate_pos; // NP 495 int64_t tlen; // TS 496 int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing 497 498 // Auxiliary data 499 int32_t ntags; // TC 500 int32_t aux; // idx to s->aux_blk 501 int32_t aux_size; // total size of packed ntags in aux_blk 502 #ifndef TN_external 503 int32_t TN_idx; // TN; idx to s->TN; 504 #else 505 int32_t tn; // idx to s->tn_blk 506 #endif 507 int TL; 508 509 int32_t seq; // idx to s->seqs_blk 510 int32_t qual; // idx to s->qual_blk 511 int32_t cigar; // idx to s->cigar 512 int32_t ncigar; 513 int64_t aend; // alignment end 514 int32_t mqual; // MQ 515 516 int32_t feature; // idx to s->feature 517 int32_t nfeature; // number of features 518 int32_t mate_flags; // MF 519 } cram_record; 520 521 // Accessor macros as an analogue of the bam ones 522 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) 523 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) 524 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) 525 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) 526 #define cram_seqi(c,i) (cram_seq((c))[(i)]) 527 #define cram_name_len(c) ((c)->name_len) 528 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) 529 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) 530 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) 531 532 /* 533 * A feature is a base difference, used for the sequence reference encoding. 534 * (We generate these internally when writing CRAM.) 535 */ 536 typedef union cram_feature { 537 struct { 538 int pos; 539 int code; 540 int base; // substitution code 541 } X; 542 struct { 543 int pos; 544 int code; 545 int base; // actual base & qual 546 int qual; 547 } B; 548 struct { 549 int pos; 550 int code; 551 int seq_idx; // index to s->seqs_blk 552 int len; 553 } b; 554 struct { 555 int pos; 556 int code; 557 int qual; 558 } Q; 559 struct { 560 int pos; 561 int code; 562 int len; 563 int seq_idx; // soft-clip multiple bases 564 } S; 565 struct { 566 int pos; 567 int code; 568 int len; 569 int seq_idx; // insertion multiple bases 570 } I; 571 struct { 572 int pos; 573 int code; 574 int base; // insertion single base 575 } i; 576 struct { 577 int pos; 578 int code; 579 int len; 580 } D; 581 struct { 582 int pos; 583 int code; 584 int len; 585 } N; 586 struct { 587 int pos; 588 int code; 589 int len; 590 } P; 591 struct { 592 int pos; 593 int code; 594 int len; 595 } H; 596 } cram_feature; 597 598 /* 599 * A slice is really just a set of blocks, but it 600 * is the logical unit for decoding a number of 601 * sequences. 602 */ 603 struct cram_slice { 604 cram_block_slice_hdr *hdr; 605 cram_block *hdr_block; 606 cram_block **block; 607 cram_block **block_by_id; 608 609 /* State used during encoding/decoding */ 610 int64_t last_apos, max_apos; 611 612 /* Array of decoded cram records */ 613 cram_record *crecs; 614 615 /* An dynamically growing buffers for data pointed 616 * to by crecs[] array. 617 */ 618 uint32_t *cigar; 619 uint32_t cigar_alloc; 620 uint32_t ncigar; 621 622 cram_feature *features; 623 int nfeatures; 624 int afeatures; // allocated size of features 625 626 #ifndef TN_external 627 // TN field (Tag Name) 628 uint32_t *TN; 629 int nTN, aTN; // used and allocated size for TN[] 630 #else 631 cram_block *tn_blk; 632 int tn_id; 633 #endif 634 635 // For variable sized elements which are always external blocks. 636 cram_block *name_blk; 637 cram_block *seqs_blk; 638 cram_block *qual_blk; 639 cram_block *base_blk; 640 cram_block *soft_blk; 641 cram_block *aux_blk; // BAM aux block, created while decoding CRAM 642 643 string_alloc_t *pair_keys; // Pooled keys for pair hash. 644 khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. 645 646 char *ref; // slice of current reference 647 int ref_start; // start position of current reference; 648 int ref_end; // end position of current reference; 649 int ref_id; 650 651 // For going from BAM to CRAM; an array of auxiliary blocks per type 652 int naux_block; 653 cram_block **aux_block; 654 655 unsigned int data_series; // See cram_fields enum 656 int decode_md; 657 658 int max_rec, curr_rec; // current and max recs per slice 659 int slice_num; // To be copied into c->curr_slice in decode 660 }; 661 662 /*----------------------------------------------------------------------------- 663 * Consider moving reference handling to cram_refs.[ch] 664 */ 665 // from fa.fai / samtools faidx files 666 typedef struct ref_entry { 667 char *name; 668 char *fn; 669 int64_t length; 670 int64_t offset; 671 int bases_per_line; 672 int line_length; 673 int64_t count; // for shared references so we know to dealloc seq 674 char *seq; 675 mFILE *mf; 676 int is_md5; // Reference comes from a raw seq found by MD5 677 } ref_entry; 678 679 KHASH_MAP_INIT_STR(refs, ref_entry*) 680 681 // References structure. 682 struct refs_t { 683 string_alloc_t *pool; // String pool for holding filenames and SN vals 684 685 khash_t(refs) *h_meta; // ref_entry*, index by name 686 ref_entry **ref_id; // ref_entry*, index by ID 687 int nref; // number of ref_entry 688 689 char *fn; // current file opened 690 BGZF *fp; // and the hFILE* to go with it. 691 692 int count; // how many cram_fd sharing this refs struct 693 694 pthread_mutex_t lock; // Mutex for multi-threaded updating 695 ref_entry *last; // Last queried sequence 696 int last_id; // Used in cram_ref_decr_locked to delay free 697 }; 698 699 /*----------------------------------------------------------------------------- 700 * CRAM index 701 * 702 * Detect format by number of entries per line. 703 * 5 => 1.0 (refid, start, nseq, C offset, slice) 704 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) 705 * 706 * Indices are stored in a nested containment list, which is trivial to set 707 * up as the indices are on sorted data so we're appending to the nclist 708 * in sorted order. Basically if a slice entirely fits within a previous 709 * slice then we append to that slices list. This is done recursively. 710 * 711 * Lists are sorted on two dimensions: ref id + slice coords. 712 */ 713 typedef struct cram_index { 714 int nslice, nalloc; // total number of slices 715 struct cram_index *e; // array of size nslice 716 717 int refid; // 1.0 1.1 718 int start; // 1.0 1.1 719 int end; // 1.1 720 int nseq; // 1.0 - undocumented 721 int slice; // 1.0 landmark index, 1.1 landmark value 722 int len; // 1.1 - size of slice in bytes 723 int64_t offset; // 1.0 1.1 724 int64_t next; // derived: offset of next container. 725 } cram_index; 726 727 typedef struct { 728 int refid; 729 int64_t start; 730 int64_t end; 731 } cram_range; 732 733 /*----------------------------------------------------------------------------- 734 */ 735 /* CRAM File handle */ 736 737 typedef struct spare_bams { 738 bam_seq_t **bams; 739 struct spare_bams *next; 740 } spare_bams; 741 742 struct cram_fd; 743 typedef struct varint_vec { 744 // Returns number of bytes decoded from fd, 0 on error 745 int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); 746 int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); 747 int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); 748 749 // Returns the value and increments *cp. Sets err to 1 iff an error occurs. 750 // NOTE: Does not set err to 0 on success. 751 int64_t (*varint_get32) (char **cp, const char *endp, int *err); 752 int64_t (*varint_get32s)(char **cp, const char *endp, int *err); 753 int64_t (*varint_get64) (char **cp, const char *endp, int *err); 754 int64_t (*varint_get64s)(char **cp, const char *endp, int *err); 755 756 // Returns the number of bytes written, <= 0 on error. 757 int (*varint_put32) (char *cp, char *endp, int32_t val_p); 758 int (*varint_put32s)(char *cp, char *endp, int32_t val_p); 759 int (*varint_put64) (char *cp, char *endp, int64_t val_p); 760 int (*varint_put64s)(char *cp, char *endp, int64_t val_p); 761 762 // Returns the number of bytes written, <= 0 on error. 763 int (*varint_put32_blk) (cram_block *blk, int32_t val_p); 764 int (*varint_put32s_blk)(cram_block *blk, int32_t val_p); 765 int (*varint_put64_blk) (cram_block *blk, int64_t val_p); 766 int (*varint_put64s_blk)(cram_block *blk, int64_t val_p); 767 768 // Returns number of bytes needed to encode 'val' 769 int (*varint_size)(int64_t val); 770 } varint_vec; 771 772 struct cram_fd { 773 struct hFILE *fp; 774 int mode; // 'r' or 'w' 775 int version; 776 cram_file_def *file_def; 777 sam_hdr_t *header; 778 779 char *prefix; 780 int64_t record_counter; 781 int err; 782 783 // Most recent compression header decoded 784 //cram_block_compression_hdr *comp_hdr; 785 //cram_block_slice_hdr *slice_hdr; 786 787 // Current container being processed 788 cram_container *ctr; 789 790 // Current container used for decoder threads 791 cram_container *ctr_mt; 792 793 // positions for encoding or decoding 794 int first_base, last_base; 795 796 // cached reference portion 797 refs_t *refs; // ref meta-data structure 798 char *ref, *ref_free; // current portion held in memory 799 int ref_id; 800 int ref_start; 801 int ref_end; 802 char *ref_fn; // reference fasta filename 803 804 // compression level and metrics 805 int level; 806 cram_metrics *m[DS_END]; 807 khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use. 808 809 // options 810 int decode_md; // Whether to export MD and NM tags 811 int seqs_per_slice; 812 int bases_per_slice; 813 int slices_per_container; 814 int embed_ref; 815 int no_ref; 816 int ignore_md5; 817 int use_bz2; 818 int use_rans; 819 int use_lzma; 820 int use_fqz; 821 int use_tok; 822 int use_arith; 823 int shared_ref; 824 unsigned int required_fields; 825 int store_md; 826 int store_nm; 827 cram_range range; 828 829 // lookup tables, stored here so we can be trivially multi-threaded 830 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags 831 unsigned int cram_flag_swap[0x1000];// bam -> cram flags 832 unsigned char L1[256]; // ACGT{*} ->0123{4} 833 unsigned char L2[256]; // ACGTN{*}->01234{5} 834 char cram_sub_matrix[32][32]; // base substitution codes 835 836 int index_sz; 837 cram_index *index; // array, sizeof index_sz 838 off_t first_container; 839 off_t curr_position; 840 int eof; 841 int last_slice; // number of recs encoded in last slice 842 int last_RI_count; // number of references encoded in last container 843 int multi_seq; // -1 is auto, 0 is one ref per container, 1 is multi... 844 int multi_seq_user; // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE) 845 int unsorted; 846 int last_mapped; // number of mapped reads in last container 847 int empty_container; // Marker for EOF block 848 849 // thread pool 850 int own_pool; 851 hts_tpool *pool; 852 hts_tpool_process *rqueue; 853 pthread_mutex_t metrics_lock; 854 pthread_mutex_t ref_lock; 855 pthread_mutex_t range_lock; 856 spare_bams *bl; 857 pthread_mutex_t bam_list_lock; 858 void *job_pending; 859 int ooc; // out of containers. 860 861 int lossy_read_names; // boolean 862 int tlen_approx; // max TLEN calculation offset. 863 int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) 864 865 BGZF *idxfp; // File pointer for on-the-fly index creation 866 867 // variable integer decoding callbacks. 868 // This changed in CRAM4.0 to a data-size agnostic encoding. 869 varint_vec vv; 870 871 // Force AP delta even on non positional sorted data. 872 // This can be beneficial for pairs where pairs are nearby each other. 873 // We suffer with delta to unrelated things (previous pair), but gain 874 // in delta between them. (Ideal would be a per read setting.) 875 int ap_delta; 876 }; 877 878 // Translation of required fields to cram data series 879 enum cram_fields { 880 CRAM_BF = 0x00000001, 881 CRAM_AP = 0x00000002, 882 CRAM_FP = 0x00000004, 883 CRAM_RL = 0x00000008, 884 CRAM_DL = 0x00000010, 885 CRAM_NF = 0x00000020, 886 CRAM_BA = 0x00000040, 887 CRAM_QS = 0x00000080, 888 CRAM_FC = 0x00000100, 889 CRAM_FN = 0x00000200, 890 CRAM_BS = 0x00000400, 891 CRAM_IN = 0x00000800, 892 CRAM_RG = 0x00001000, 893 CRAM_MQ = 0x00002000, 894 CRAM_TL = 0x00004000, 895 CRAM_RN = 0x00008000, 896 CRAM_NS = 0x00010000, 897 CRAM_NP = 0x00020000, 898 CRAM_TS = 0x00040000, 899 CRAM_MF = 0x00080000, 900 CRAM_CF = 0x00100000, 901 CRAM_RI = 0x00200000, 902 CRAM_RS = 0x00400000, 903 CRAM_PD = 0x00800000, 904 CRAM_HC = 0x01000000, 905 CRAM_SC = 0x02000000, 906 CRAM_BB = 0x04000000, 907 CRAM_BB_len = 0x08000000, 908 CRAM_QQ = 0x10000000, 909 CRAM_QQ_len = 0x20000000, 910 CRAM_aux= 0x40000000, 911 CRAM_ALL= 0x7fffffff, 912 }; 913 914 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may 915 // encode a base difference, but we don't need to know what it is for CIGAR. 916 // If we have a soft-clip or insertion, we do need SC/IN though to know how 917 // long that array is. 918 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ 919 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) 920 921 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ 922 CRAM_RL | CRAM_AP | CRAM_BB) 923 924 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) 925 926 /* BF bitfields */ 927 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ 928 #define CRAM_FPAIRED 256 929 #define CRAM_FPROPER_PAIR 128 930 #define CRAM_FUNMAP 64 931 #define CRAM_FREVERSE 32 932 #define CRAM_FREAD1 16 933 #define CRAM_FREAD2 8 934 #define CRAM_FSECONDARY 4 935 #define CRAM_FQCFAIL 2 936 #define CRAM_FDUP 1 937 938 #define DS_aux_S "\001" 939 #define DS_aux_OQ_S "\002" 940 #define DS_aux_BQ_S "\003" 941 #define DS_aux_BD_S "\004" 942 #define DS_aux_BI_S "\005" 943 #define DS_aux_FZ_S "\006" 944 #define DS_aux_oq_S "\007" 945 #define DS_aux_os_S "\010" 946 #define DS_aux_oz_S "\011" 947 948 #define CRAM_M_REVERSE 1 949 #define CRAM_M_UNMAP 2 950 951 952 /* CF bitfields */ 953 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) 954 #define CRAM_FLAG_DETACHED (1<<1) 955 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) 956 #define CRAM_FLAG_NO_SEQ (1<<3) 957 #define CRAM_FLAG_EXPLICIT_TLEN (1<<4) 958 #define CRAM_FLAG_MASK ((1<<5)-1) 959 960 /* Internal only */ 961 #define CRAM_FLAG_STATS_ADDED (1<<30) 962 #define CRAM_FLAG_DISCARD_NAME (1U<<31) 963 964 #ifdef __cplusplus 965 } 966 #endif 967 968 #endif /* HTSLIB_CRAM_STRUCTS_H */ 969