1 /* 2 * Copyright (c) 2013 Genome Research Ltd. 3 * Author(s): James Bonfield 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following 13 * disclaimer in the documentation and/or other materials provided 14 * with the distribution. 15 * 16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger 17 * Institute nor the names of its contributors may be used to endorse 18 * or promote products derived from this software without specific 19 * prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS 22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH 25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifndef _CRAM_STRUCTS_H_ 35 #define _CRAM_STRUCTS_H_ 36 37 #ifdef __cplusplus 38 extern "C" { 39 #endif 40 41 /* 42 * Defines in-memory structs for the basic file-format objects in the 43 * CRAM format. 44 * 45 * The basic file format is: 46 * File-def SAM-hdr Container Container ... 47 * 48 * Container: 49 * Service-block data-block data-block ... 50 * 51 * Multiple blocks in a container are grouped together as slices, 52 * also sometimes referred to as landmarks in the spec. 53 */ 54 55 56 #include <stdint.h> 57 58 #include "io_lib/hash_table.h" // From io_lib aka staden-read 59 #include "io_lib/thread_pool.h" 60 #include "io_lib/mFILE.h" 61 #include "io_lib/bgzip.h" 62 63 #ifdef SAMTOOLS 64 // From within samtools/HTSlib 65 # include "io_lib/string_alloc.h" 66 #else 67 // From within io_lib 68 # include "io_lib/bam.h" // For BAM header parsing 69 #endif 70 71 #define SEQS_PER_SLICE 10000 72 #define BASES_PER_SLICE (SEQS_PER_SLICE*500) 73 #define SLICE_PER_CNT 1 74 75 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" 76 77 // TN only in Cram v1 78 //#define TN_external 79 80 #define MAX_STAT_VAL 1024 81 //#define MAX_STAT_VAL 16 82 typedef struct { 83 int freqs[MAX_STAT_VAL]; 84 HashTable *h; 85 int nsamp; // total number of values added 86 int nvals; // total number of unique values added 87 } cram_stats; 88 89 /* NB: matches java impl, not the spec */ 90 enum cram_encoding { 91 E_NULL = 0, 92 E_EXTERNAL = 1, 93 E_GOLOMB = 2, 94 E_HUFFMAN = 3, 95 E_BYTE_ARRAY_LEN = 4, 96 E_BYTE_ARRAY_STOP = 5, 97 E_BETA = 6, 98 E_SUBEXP = 7, 99 E_GOLOMB_RICE = 8, 100 E_GAMMA = 9, 101 E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ 102 }; 103 104 enum cram_external_type { 105 E_INT = 1, 106 E_LONG = 2, 107 E_BYTE = 3, 108 E_BYTE_ARRAY = 4, 109 E_BYTE_ARRAY_BLOCK = 5, 110 }; 111 112 /* External IDs used by this implementation (only assumed during writing) */ 113 enum cram_DS_ID { 114 DS_CORE = 0, 115 DS_aux = 1, // aux_blk 116 DS_aux_OQ = 2, 117 DS_aux_BQ = 3, 118 DS_aux_BD = 4, 119 DS_aux_BI = 5, 120 DS_aux_FZ = 6, // also ZM:B 121 DS_aux_oq = 7, // other qualities 122 DS_aux_os = 8, // other sequences 123 DS_aux_oz = 9, // other strings 124 DS_ref, 125 DS_RN, // name_blk 126 DS_QS, // qual_blk 127 DS_IN, // base_blk 128 DS_SC, // soft_blk 129 130 DS_BF, // start loop 131 DS_CF, 132 DS_AP, 133 DS_RG, 134 DS_MQ, 135 DS_NS, 136 DS_MF, 137 DS_TS, 138 DS_NP, 139 DS_NF, 140 DS_RL, 141 DS_FN, 142 DS_FC, 143 DS_FP, 144 DS_DL, 145 DS_BA, 146 DS_BS, 147 DS_TL, 148 DS_RI, 149 DS_RS, 150 DS_PD, 151 DS_HC, 152 DS_BB, 153 DS_QQ, 154 155 DS_TN, // end loop 156 157 DS_RN_len, 158 DS_SC_len, 159 DS_BB_len, 160 DS_QQ_len, 161 162 DS_TC, // CRAM v1.0 tags 163 DS_TM, // test 164 DS_TV, // test 165 166 DS_END, 167 }; 168 169 /* "File Definition Structure" */ 170 typedef struct { 171 char magic[4]; 172 uint8_t major_version; 173 uint8_t minor_version; 174 char file_id[20]; // Filename or SHA1 checksum 175 } cram_file_def; 176 177 #define CRAM_MAJOR_VERS(v) ((v) >> 8) 178 #define CRAM_MINOR_VERS(v) ((v) & 0xff) 179 #define IS_CRAM_1_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)==1) 180 #define IS_CRAM_2_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)==2) 181 #define IS_CRAM_3_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)>=3) 182 183 struct cram_slice; 184 185 enum cram_block_method { 186 BM_ERROR = -1, 187 RAW = 0, 188 GZIP = 1, // Z_FILTERED 189 BZIP2 = 2, 190 LZMA = 3, 191 RANS0 = 4, 192 BSC = 5, 193 FQZ = 6, 194 RANS1 = 10, // Not externalised; stored as RANS (generic) 195 GZIP_RLE = 11, // Z_RLE, NB: not externalised in CRAM 196 GZIP_1 = 12, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM 197 198 RANS_PR0 = 13, // Parameterised r4x16pr rANS codecs. 199 RANS_PR1 = 14, 200 RANS_PR64 = 15, 201 RANS_PR65 = 16, 202 RANS_PR128 = 17, 203 RANS_PR129 = 18, 204 RANS_PR192 = 19, 205 RANS_PR193 = 20, 206 207 NAME_TOK3 = 21, 208 209 BLOCK_METHOD_END = 31 210 }; 211 212 enum cram_content_type { 213 CT_ERROR = -1, 214 FILE_HEADER = 0, 215 COMPRESSION_HEADER = 1, 216 MAPPED_SLICE = 2, 217 UNMAPPED_SLICE = 3, // CRAM_1_VERS only 218 EXTERNAL = 4, 219 CORE = 5, 220 }; 221 222 /* Maximum simultaneous codecs allowed, 1 per bit */ 223 #define CRAM_MAX_METHOD 32 224 225 /* Compression metrics */ 226 typedef struct { 227 // number of trials and time to next trial 228 int trial; 229 int next_trial; 230 int consistency; 231 232 // aggregate sizes during trials 233 int sz[CRAM_MAX_METHOD]; 234 235 // resultant method from trials 236 int method; 237 int strat; 238 239 // Revisions of method, to allow culling of continually failing ones. 240 int cnt[CRAM_MAX_METHOD]; 241 int revised_method; 242 243 double extra[CRAM_MAX_METHOD]; 244 245 cram_stats *stats; 246 } cram_metrics; 247 248 /* Block */ 249 typedef struct { 250 enum cram_block_method method, orig_method; 251 enum cram_content_type content_type; 252 int32_t content_id; 253 int32_t comp_size; 254 int32_t uncomp_size; 255 uint32_t crc32; 256 int32_t idx; /* offset into data */ 257 unsigned char *data; 258 259 // For bit I/O 260 size_t alloc; 261 size_t byte; 262 int bit; 263 264 // To aid compression 265 cram_metrics *m; // used to track aux block compression only 266 267 int crc32_checked; 268 uint32_t crc_part; 269 } cram_block; 270 271 struct cram_codec; /* defined in cram_codecs.h */ 272 struct cram_map; 273 274 #define CRAM_MAP_HASH 32 275 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) 276 277 /* Compression header block */ 278 typedef struct { 279 int32_t ref_seq_id; 280 int64_t ref_seq_start; 281 int64_t ref_seq_span; 282 int32_t num_records; 283 int32_t num_landmarks; 284 int32_t *landmark; 285 286 /* Flags from preservation map */ 287 int mapped_qs_included; 288 int unmapped_qs_included; 289 int unmapped_placed; 290 int qs_included; 291 int read_names_included; 292 int AP_delta; 293 // indexed by ref-base and subst. code 294 char substitution_matrix[5][4]; 295 int no_ref; 296 297 // TD Dictionary as a concatenated block 298 cram_block *TD_blk; // Tag Dictionary 299 int nTL; // number of TL entries in TD 300 unsigned char **TL; // array of size nTL, pointer into TD_blk. 301 HashTable *TD; // for encoding, keyed on TD entries 302 303 HashTable *preservation_map; 304 struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; 305 struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; 306 307 struct cram_codec *codecs[DS_END]; 308 309 char *uncomp; // A single block of uncompressed data 310 size_t uncomp_size, uncomp_alloc; 311 } cram_block_compression_hdr; 312 313 typedef struct cram_map { 314 int key; /* 0xe0 + 3 bytes */ 315 enum cram_encoding encoding; 316 int offset; /* Offset into a single block of memory */ 317 int size; /* Size */ 318 struct cram_codec *codec; 319 struct cram_map *next; // for noddy internal hash 320 } cram_map; 321 322 typedef struct { 323 struct cram_codec *codec; 324 cram_block *blk; 325 cram_metrics *m; 326 } cram_tag_map; 327 328 /* Mapped or unmapped slice header block */ 329 typedef struct { 330 enum cram_content_type content_type; 331 int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ 332 int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ 333 int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ 334 int32_t num_records; 335 int64_t record_counter; 336 int32_t num_blocks; 337 int32_t num_content_ids; 338 int32_t *block_content_ids; 339 int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ 340 unsigned char md5[16]; 341 HashTable *tags; /* hash of optional tags */ 342 uint32_t BD_crc; /* base call digest */ 343 uint32_t SD_crc; /* quality score digest */ 344 } cram_block_slice_hdr; 345 346 struct ref_entry; 347 348 /* 349 * Container. 350 * 351 * Conceptually a container is split into slices, and slices into blocks. 352 * However on disk it's just a list of blocks and we need to query the 353 * block types to identify the start/end points of the slices. 354 * 355 * OR... are landmarks the start/end points of slices? 356 */ 357 typedef struct { 358 int32_t length; 359 int32_t ref_seq_id; 360 int64_t ref_seq_start; 361 int64_t ref_seq_span; 362 int64_t record_counter; 363 int64_t num_bases; 364 int32_t num_records; 365 int32_t num_blocks; 366 int32_t num_landmarks; 367 int32_t *landmark; 368 369 /* Size of container header above */ 370 size_t offset; 371 372 /* Compression header is always the first block? */ 373 cram_block_compression_hdr *comp_hdr; 374 cram_block *comp_hdr_block; 375 376 /* For construction purposes */ 377 int max_slice, curr_slice; // maximum number of slices 378 int curr_slice_mt; // Curr slice when reading ahead (via threads) 379 int max_rec, curr_rec; // current and max recs per slice 380 int max_c_rec, curr_c_rec; // current and max recs per container 381 int slice_rec; // rec no. for start of this slice 382 int curr_ref; // current ref ID. -2 for no previous 383 int64_t last_pos; // last record position 384 struct cram_slice **slices, *slice; 385 int pos_sorted; // boolean, 1=>position sorted data 386 int64_t max_apos; // maximum position, used if pos_sorted==0 387 int last_slice; // number of reads in last slice (0 for 1st) 388 int multi_seq; // true if packing multi seqs per cont/slice 389 int unsorted; // true is AP_delta is 0. 390 391 /* Copied from fd before encoding, to allow multi-threading */ 392 int64_t ref_start, first_base, last_base, ref_id, ref_end; 393 char *ref; 394 //struct ref_entry *ref; 395 396 /* For multi-threading */ 397 bam_seq_t **bams; 398 399 /* Statistics for encoding */ 400 cram_stats *stats[DS_END]; 401 402 HashTable *tags_used; // cram_tag_map[], per tag types in use. 403 404 int *refs_used; // array of frequency of ref seq IDs 405 406 // For experimental name delta 407 char *last_name; 408 409 uint32_t crc32; // Raw container bytes CRC 410 411 uint64_t s_num_bases; // number of bases in this slice 412 } cram_container; 413 414 /* 415 * A single cram record 416 */ 417 typedef struct { 418 struct cram_slice *s; // Filled out by cram_decode only 419 420 int32_t ref_id; // fixed for all recs in slice? 421 int32_t flags; // BF 422 int32_t cram_flags; // CF 423 int32_t len; // RL 424 int64_t apos; // AP 425 int32_t rg; // RG 426 int32_t name; // RN; idx to s->names_blk 427 int32_t name_len; 428 int32_t mate_line; // index to another cram_record 429 int32_t mate_ref_id; 430 int64_t mate_pos; // NP 431 int64_t tlen; // TS 432 433 // Auxiliary data 434 int32_t ntags; // TC 435 int32_t aux; // idx to s->aux_blk 436 int32_t aux_size; // total size of packed ntags in aux_blk 437 #ifndef TN_external 438 int32_t TN_idx; // TN; idx to s->TN; 439 #else 440 int32_t tn; // idx to s->tn_blk 441 #endif 442 int TL; 443 444 int32_t seq; // idx to s->seqs_blk 445 int32_t qual; // idx to s->qual_blk 446 int32_t cigar; // idx to s->cigar 447 int32_t ncigar; 448 int64_t aend; // alignment end 449 int32_t mqual; // MQ 450 451 int32_t feature; // idx to s->feature 452 int32_t nfeature; // number of features 453 int32_t mate_flags; // MF 454 } cram_record; 455 456 // Accessor macros as an analogue of the bam ones 457 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) 458 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) 459 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) 460 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) 461 #define cram_seqi(c,i) (cram_seq((c))[(i)]) 462 #define cram_name_len(c) ((c)->name_len) 463 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) 464 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) 465 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) 466 467 /* 468 * A feature is a base difference, used for the sequence reference encoding. 469 * (We generate these internally when writing CRAM.) 470 */ 471 typedef struct { 472 union { 473 struct { 474 int pos; 475 int code; 476 int base; // substitution code 477 } X; 478 struct { 479 int pos; 480 int code; 481 int base; // actual base & qual 482 int qual; 483 } B; 484 struct { 485 int pos; 486 int code; 487 int seq_idx; // index to s->seqs_blk 488 int len; 489 } b; 490 struct { 491 int pos; 492 int code; 493 int qual; 494 } Q; 495 struct { 496 int pos; 497 int code; 498 int len; 499 int seq_idx; // soft-clip multiple bases 500 } S; 501 struct { 502 int pos; 503 int code; 504 int len; 505 int seq_idx; // insertion multiple bases 506 } I; 507 struct { 508 int pos; 509 int code; 510 int base; // insertion single base 511 } i; 512 struct { 513 int pos; 514 int code; 515 int len; 516 } D; 517 struct { 518 int pos; 519 int code; 520 int len; 521 } N; 522 struct { 523 int pos; 524 int code; 525 int len; 526 } P; 527 struct { 528 int pos; 529 int code; 530 int len; 531 } H; 532 }; 533 } cram_feature; 534 535 //// Turns [A-Z][A-Z] into an integer from 0 to 32*32 536 //#define ID(a) ((((a)[0]-'A')<<5)+(a)[1]-'A') 537 538 /* 539 * A slice is really just a set of blocks, but it 540 * is the logical unit for decoding a number of 541 * sequences. 542 */ 543 typedef struct cram_slice { 544 cram_block_slice_hdr *hdr; 545 cram_block *hdr_block; 546 cram_block **block; 547 cram_block **block_by_id; 548 549 /* State used during encoding/decoding */ 550 int64_t last_apos, max_apos; 551 552 /* Array of decoded cram records */ 553 cram_record *crecs; 554 555 /* An dynamically growing buffers for data pointed 556 * to by crecs[] array. 557 */ 558 uint32_t *cigar; 559 uint32_t cigar_alloc; 560 uint32_t ncigar; 561 562 cram_feature *features; 563 int nfeatures; 564 int afeatures; // allocated size of features 565 566 #ifndef TN_external 567 // TN field (Tag Name) 568 uint32_t *TN; 569 int nTN, aTN; // used and allocated size for TN[] 570 #else 571 cram_block *tn_blk; 572 int tn_id; 573 #endif 574 575 // For variable sized elements which are always external blocks. 576 cram_block *name_blk; 577 cram_block *seqs_blk; 578 cram_block *qual_blk; 579 cram_block *base_blk; 580 cram_block *soft_blk; 581 cram_block *aux_blk; // BAM aux block, used when going from CRAM to BAM 582 583 HashTable *pair[2]; // for identifying read-pairs in this slice. 584 585 char *ref; // slice of current reference 586 int ref_start; // start position of current reference; 587 int ref_end; // end position of current reference; 588 int ref_id; 589 590 uint32_t BD_crc; // base call digest 591 uint32_t SD_crc; // quality score digest 592 593 // For going from BAM to CRAM; an array of auxiliary blocks per type 594 int naux_block; 595 cram_block **aux_block; 596 597 unsigned int data_series; // See cram_fields enum below 598 int decode_md; 599 600 // Caching of block ID to block ptr for some blocks. 601 cram_block *id2blk[256]; 602 603 int max_rec, curr_rec; // current and max recs per slice 604 int slice_num; // To be copied into c->curr_slice in decode 605 606 // Cache of converted BAM structs 607 bam_seq_t **bl; 608 } cram_slice; 609 610 /*----------------------------------------------------------------------------- 611 * Consider moving reference handling to cram_refs.[ch] 612 */ 613 // from fa.fai / samtools faidx files 614 typedef struct ref_entry { 615 char *name; 616 char *fn; 617 int64_t length; 618 int64_t offset; 619 int bases_per_line; 620 int line_length; 621 int64_t count; // for shared references so we know to dealloc seq 622 char *seq; 623 mFILE *mf; 624 } ref_entry; 625 626 // References structure. 627 typedef struct { 628 string_alloc_t *pool; // String pool for holding filenames and SN vals 629 630 HashTable *h_meta; // ref_entry*, index by name 631 ref_entry **ref_id; // ref_entry*, index by ID 632 int nref; // number of ref_entry 633 634 char *fn; // current file opened 635 bzi_FILE *fp; // and the bzi_FILE* to go with it. 636 637 int count; // how many cram_fd sharing this refs struct 638 639 pthread_mutex_t lock; // Mutex for multi-threaded updating 640 ref_entry *last; // Last queried sequence 641 int last_id; // Used in cram_ref_decr_locked to delay free 642 } refs_t; 643 644 /*----------------------------------------------------------------------------- 645 * CRAM index 646 * 647 * Detect format by number of entries per line. 648 * 5 => 1.0 (refid, start, nseq, C offset, slice) 649 * 6 => 1.1 (refid, start, span, C offset, S offset, S size) 650 * 651 * Indices are stored in a nested containment list, which is trivial to set 652 * up as the indices are on sorted data so we're appending to the nclist 653 * in sorted order. Basically if a slice entirely fits within a previous 654 * slice then we append to that slices list. This is done recursively. 655 * 656 * Lists are sorted on two dimensions: ref id + slice coords. 657 */ 658 typedef struct cram_index { 659 int nslice, nalloc; // total number of slices 660 struct cram_index *e; // array of size nslice 661 662 int refid; // 1.0 1.1 663 int start; // 1.0 1.1 664 int end; // 1.1 665 int nseq; // 1.0 - undocumented 666 int slice; // 1.0 landmark index, 1.1 landmark value 667 int len; // 1.1 - size of slice in bytes 668 int64_t offset; // 1.0 1.1 669 } cram_index; 670 671 typedef struct { 672 int refid; 673 int64_t start; 674 int64_t end; 675 } cram_range; 676 677 /*----------------------------------------------------------------------------- 678 */ 679 /* CRAM File handle */ 680 681 typedef struct spare_bams { 682 bam_seq_t **bams; 683 struct spare_bams *next; 684 } spare_bams; 685 686 #if defined(CRAM_IO_CUSTOM_BUFFERING) 687 typedef size_t (*cram_io_C_FILE_fread_t)(void *ptr, size_t size, size_t nmemb, void *stream); 688 typedef size_t (*cram_io_C_FILE_fwrite_t)(void *ptr, size_t size, size_t nmemb, void *stream); 689 typedef int (*cram_io_C_FILE_fseek_t)(void * fd, off_t offset, int whence); 690 typedef off_t (*cram_io_C_FILE_ftell_t)(void * fd); 691 692 typedef struct { 693 void *user_data; 694 cram_io_C_FILE_fread_t fread_callback; 695 cram_io_C_FILE_fseek_t fseek_callback; 696 cram_io_C_FILE_ftell_t ftell_callback; 697 } cram_io_input_t; 698 699 typedef struct { 700 void *user_data; 701 cram_io_C_FILE_fwrite_t fwrite_callback; 702 cram_io_C_FILE_ftell_t ftell_callback; 703 } cram_io_output_t; 704 705 typedef cram_io_input_t * (*cram_io_allocate_read_input_t)(char const * filename, int const decompress); 706 typedef cram_io_input_t * (*cram_io_deallocate_read_input_t)(cram_io_input_t * obj); 707 708 typedef cram_io_output_t * (*cram_io_allocate_write_output_t)(char const * filename); 709 typedef cram_io_output_t * (*cram_io_deallocate_write_output_t)(cram_io_output_t * obj); 710 711 712 713 // FIXME: make cram_fd_input_buffer and cram_fd_input_buffer the same thing. 714 // Ie cram_fd_io_buffer and internals fp_io_*. 715 716 typedef struct { 717 /* input buffer size */ 718 size_t fp_in_buf_size; 719 /* input buffer base pointer */ 720 char *fp_in_buffer; 721 /* position of buffer start in file */ 722 uint64_t fp_in_buf_start; 723 /* start of window pointer; same as fp_in_buffer */ 724 char *fp_in_buf_pa; 725 /* window current pointer */ 726 char *fp_in_buf_pc; 727 /* window end pointer; same as fp_in_buffer + fp_in_buf_size (no seeks) */ 728 char *fp_in_buf_pe; 729 } cram_fd_input_buffer; 730 731 typedef struct { 732 /* output buffer size */ 733 size_t fp_out_buf_size; 734 /* output buffer base pointer */ 735 char *fp_out_buffer; 736 /* position of buffer start in file */ 737 uint64_t fp_out_buf_start; 738 /* start of window pointer; same as fp_out_buffer */ 739 char *fp_out_buf_pa; 740 /* window current pointer */ 741 char *fp_out_buf_pc; 742 /* window end pointer */ 743 char *fp_out_buf_pe; 744 } cram_fd_output_buffer; 745 #endif 746 747 typedef struct { 748 FILE *fp_in; 749 #if defined(CRAM_IO_CUSTOM_BUFFERING) 750 cram_fd_input_buffer *fp_in_buffer; 751 cram_io_input_t *fp_in_callbacks; 752 cram_io_allocate_read_input_t fp_in_callback_allocate_function; 753 cram_io_deallocate_read_input_t fp_in_callback_deallocate_function; 754 755 cram_fd_output_buffer *fp_out_buffer; 756 cram_io_output_t *fp_out_callbacks; 757 cram_io_allocate_write_output_t fp_out_callback_allocate_function; 758 cram_io_deallocate_write_output_t fp_out_callback_deallocate_function; 759 #endif 760 761 FILE *fp_out; 762 int mode; // 'r' or 'w' 763 int version; 764 cram_file_def *file_def; 765 SAM_hdr *header; 766 767 char *prefix; 768 int64_t record_counter; 769 int err; 770 771 // Most recent compression header decoded 772 //cram_block_compression_hdr *comp_hdr; 773 //cram_block_slice_hdr *slice_hdr; 774 775 // Current container being processed. 776 cram_container *ctr; 777 778 // Current container used for decoder threads 779 cram_container *ctr_mt; 780 781 // positions for encoding or decoding 782 int64_t first_base, last_base; 783 784 // cached reference portion 785 refs_t *refs; // ref meta-data structure 786 char *ref, *ref_free; // current portion held in memory 787 int ref_id; 788 int ref_start; 789 int ref_end; 790 char *ref_fn; // reference fasta filename 791 792 // compression level and metrics 793 int level; 794 cram_metrics *m[DS_END]; 795 HashTable *tags_used; // cram_metrics[], per tag types in use. 796 797 // options 798 int decode_md; // Whether to export MD and NM tags 799 int verbose; 800 int seqs_per_slice; 801 int bases_per_slice; 802 int slices_per_container; 803 int embed_ref; 804 int no_ref; 805 int ignore_md5; 806 int use_bz2; 807 int use_rans; 808 int use_lzma; 809 int use_bsc; 810 int use_fqz; 811 int shared_ref; 812 enum quality_binning binning; 813 unsigned int required_fields; 814 cram_range range; 815 816 // lookup tables, stored here so we can be trivially multi-threaded 817 unsigned int bam_flag_swap[0x1000]; // cram -> bam flags 818 unsigned int cram_flag_swap[0x1000];// bam -> cram flags 819 unsigned char L1[256]; // ACGT{*} ->0123{4} 820 unsigned char L2[256]; // ACGTN{*}->01234{5} 821 char cram_sub_matrix[32][32]; // base substituion codes 822 823 int index_sz; 824 cram_index *index; // array, sizeof index_sz 825 off_t first_container; 826 int eof; 827 int last_slice; // number of recs encoded in last slice 828 int multi_seq; 829 int unsorted; 830 int empty_container; // Marker for EOF block 831 832 // thread pool 833 int own_pool; 834 t_pool *pool; 835 t_results_queue *rqueue; 836 pthread_mutex_t *metrics_lock; 837 pthread_mutex_t *ref_lock; 838 spare_bams *bl; 839 pthread_mutex_t *bam_list_lock; 840 void *job_pending; 841 842 int ooc; // out of containers. 843 int ignore_chksum; 844 int lossy_read_names; 845 int preserve_aux_order; // if set implies emitting RG, MD and NM 846 int preserve_aux_size; // does not replace 'i' with 'c' etc in aux. 847 } cram_fd; 848 849 #if defined(CRAM_IO_CUSTOM_BUFFERING) 850 extern size_t cram_io_input_buffer_read(void *ptr, size_t size, size_t nmemb, cram_fd * fd); 851 extern int cram_io_input_buffer_seek(cram_fd * fd, off_t offset, int whence); 852 extern int cram_io_input_buffer_underflow(cram_fd * fd); 853 extern char * cram_io_input_buffer_fgets(char * s, int size, cram_fd * fd); 854 extern int cram_io_flush_output_buffer(cram_fd *fd); 855 #endif 856 857 #if defined(CRAM_IO_CUSTOM_BUFFERING) 858 #define CRAM_IO_GETC(fd) ((fd->fp_in_buffer->fp_in_buf_pc!=fd->fp_in_buffer->fp_in_buf_pe) ? ((int)((unsigned char)(*(fd->fp_in_buffer->fp_in_buf_pc++)))) : cram_io_input_buffer_underflow(fd)) 859 #define CRAM_IO_READ(ptr, size, nmemb, fd) cram_io_input_buffer_read(ptr,size,nmemb,fd) 860 #define CRAM_IO_SEEK(fd, offset, whence) cram_io_input_buffer_seek(fd, offset, whence) 861 #define CRAM_IO_TELLO(fd) (fd->fp_in_buffer->fp_in_buf_start +(fd->fp_in_buffer->fp_in_buf_pc-fd->fp_in_buffer->fp_in_buf_pa)) 862 #define CRAM_IO_FGETS(s,size,fd) cram_io_input_buffer_fgets(s,size,fd) 863 #define CRAM_IO_PUTC(c,fd) cram_io_output_buffer_putc(c,fd) 864 #define CRAM_IO_WRITE(ptr, size, nmemb, fd) cram_io_output_buffer_write(ptr,size,nmemb,fd) 865 #define CRAM_IO_FLUSH(fd) cram_io_flush_output_buffer((fd)) 866 867 #else // ! CRAM_IO_CUSTOM_BUFFERING 868 #define CRAM_IO_GETC(fd) getc(fd->fp_in) 869 #define CRAM_IO_READ(ptr, size, nmemb, fd) fread(ptr,size,nmemb,fd->fp_in) 870 #define CRAM_IO_TELLO(fd) ftello(fd->fp_in) 871 #define CRAM_IO_SEEK(fd, offset, whence) fseeko(fd->fp_in,offset,whence) 872 #define CRAM_IO_FGETS(s,size,fd) fgets(s,size,fd->fp_in) 873 #define CRAM_IO_PUTC(c,fd) putc(c,fd->fp_out) 874 #define CRAM_IO_WRITE(ptr, size, nmemb, fd) fwrite(ptr,size,nmemb,fd->fp_out) 875 #define CRAM_IO_FLUSH(fd) (fd->fp_out ? fflush(fd->fp_out) : 0) 876 877 #endif // end CRAM_IO_CUSTOM_BUFFERING 878 879 // REQUIRED_FIELDS 880 enum sam_fields { 881 SAM_QNAME = 0x00000001, 882 SAM_FLAG = 0x00000002, 883 SAM_RNAME = 0x00000004, 884 SAM_POS = 0x00000008, 885 SAM_MAPQ = 0x00000010, 886 SAM_CIGAR = 0x00000020, 887 SAM_RNEXT = 0x00000040, 888 SAM_PNEXT = 0x00000080, 889 SAM_TLEN = 0x00000100, 890 SAM_SEQ = 0x00000200, 891 SAM_QUAL = 0x00000400, 892 SAM_AUX = 0x00000800, 893 SAM_RGAUX = 0x00001000, 894 }; 895 896 // Translation of required fields to cram data series 897 enum cram_fields { 898 CRAM_BF = 0x00000001, 899 CRAM_AP = 0x00000002, 900 CRAM_FP = 0x00000004, 901 CRAM_RL = 0x00000008, 902 CRAM_DL = 0x00000010, 903 CRAM_NF = 0x00000020, 904 CRAM_BA = 0x00000040, 905 CRAM_QS = 0x00000080, 906 CRAM_FC = 0x00000100, 907 CRAM_FN = 0x00000200, 908 CRAM_BS = 0x00000400, 909 CRAM_IN = 0x00000800, 910 CRAM_RG = 0x00001000, 911 CRAM_MQ = 0x00002000, 912 CRAM_TL = 0x00004000, 913 CRAM_RN = 0x00008000, 914 CRAM_NS = 0x00010000, 915 CRAM_NP = 0x00020000, 916 CRAM_TS = 0x00040000, 917 CRAM_MF = 0x00080000, 918 CRAM_CF = 0x00100000, 919 CRAM_RI = 0x00200000, 920 CRAM_RS = 0x00400000, 921 CRAM_PD = 0x00800000, 922 CRAM_HC = 0x01000000, 923 CRAM_SC = 0x02000000, 924 CRAM_BB = 0x04000000, 925 CRAM_BB_len = 0x08000000, 926 CRAM_QQ = 0x10000000, 927 CRAM_QQ_len = 0x20000000, 928 CRAM_aux= 0x40000000, 929 CRAM_ALL= 0x7fffffff, 930 }; 931 932 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may 933 // encode a base difference, but we don't need to know what it is for CIGAR. 934 // If we have a soft-clip or insertion, we do need SC/IN though to know how 935 // long that array is. 936 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ 937 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) 938 939 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ 940 CRAM_RL | CRAM_AP | CRAM_BB) 941 942 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) 943 944 enum cram_option { 945 CRAM_OPT_DECODE_MD, 946 CRAM_OPT_PREFIX, 947 CRAM_OPT_VERBOSITY, 948 CRAM_OPT_SEQS_PER_SLICE, 949 CRAM_OPT_SLICES_PER_CONTAINER, 950 CRAM_OPT_RANGE, 951 CRAM_OPT_VERSION, 952 CRAM_OPT_EMBED_REF, 953 CRAM_OPT_IGNORE_MD5, 954 CRAM_OPT_REFERENCE, 955 CRAM_OPT_MULTI_SEQ_PER_SLICE, 956 CRAM_OPT_NO_REF, 957 CRAM_OPT_USE_BZIP2, 958 CRAM_OPT_SHARED_REF, 959 CRAM_OPT_NTHREADS, 960 CRAM_OPT_THREAD_POOL, 961 CRAM_OPT_BINNING, 962 CRAM_OPT_USE_ARITH, 963 CRAM_OPT_USE_LZMA, 964 CRAM_OPT_REQUIRED_FIELDS, 965 CRAM_OPT_USE_RANS, 966 CRAM_OPT_IGNORE_CHKSUM, 967 CRAM_OPT_BASES_PER_SLICE, 968 CRAM_OPT_LOSSY_READ_NAMES, 969 CRAM_OPT_PRESERVE_AUX_ORDER, 970 CRAM_OPT_PRESERVE_AUX_SIZE, 971 CRAM_OPT_WITH_BGZIP_INDEX, 972 CRAM_OPT_OUTPUT_BGZIP_IDX, 973 CRAM_OPT_USE_BSC, 974 CRAM_OPT_USE_FQZ, 975 }; 976 977 /* BF bitfields */ 978 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ 979 #define CRAM_FPAIRED 256 980 #define CRAM_FPROPER_PAIR 128 981 #define CRAM_FUNMAP 64 982 #define CRAM_FREVERSE 32 983 #define CRAM_FREAD1 16 984 #define CRAM_FREAD2 8 985 #define CRAM_FSECONDARY 4 986 #define CRAM_FQCFAIL 2 987 #define CRAM_FDUP 1 988 989 #define DS_aux_S "\001" 990 #define DS_aux_OQ_S "\002" 991 #define DS_aux_BQ_S "\003" 992 #define DS_aux_BD_S "\004" 993 #define DS_aux_BI_S "\005" 994 #define DS_aux_FZ_S "\006" 995 #define DS_aux_oq_S "\007" 996 #define DS_aux_os_S "\010" 997 #define DS_aux_oz_S "\011" 998 999 #define CRAM_M_REVERSE 1 1000 #define CRAM_M_UNMAP 2 1001 1002 1003 /* CF bitfields */ 1004 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) 1005 #define CRAM_FLAG_DETACHED (1<<1) 1006 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) 1007 #define CRAM_FLAG_NO_SEQ (1<<3) 1008 #define CRAM_FLAG_MASK ((1<<4)-1) 1009 1010 /* Internal only */ 1011 #define CRAM_FLAG_STATS_ADDED (1<<30) 1012 #define CRAM_FLAG_DISCARD_NAME (1<<31) 1013 1014 #ifdef __cplusplus 1015 } 1016 #endif 1017 1018 #endif /* _CRAM_STRUCTS_H_ */ 1019