1 /*
2 Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifndef HTSLIB_CRAM_STRUCTS_H
32 #define HTSLIB_CRAM_STRUCTS_H
33 
34 /*
35  * Defines in-memory structs for the basic file-format objects in the
36  * CRAM format.
37  *
38  * The basic file format is:
39  *     File-def SAM-hdr Container Container ...
40  *
41  * Container:
42  *     Service-block data-block data-block ...
43  *
44  * Multiple blocks in a container are grouped together as slices,
45  * also sometimes referred to as landmarks in the spec.
46  */
47 
48 
49 #include <pthread.h>
50 #include <stdint.h>
51 #include <sys/types.h>
52 
53 #include "../htslib/thread_pool.h"
54 #include "../htslib/cram.h"
55 #include "string_alloc.h"
56 #include "mFILE.h"
57 #include "../htslib/khash.h"
58 
59 #ifdef __cplusplus
60 extern "C" {
61 #endif
62 
63 // Generic hash-map integer -> integer
64 KHASH_MAP_INIT_INT64(m_i2i, int)
65 
66 // Generic hash-set integer -> (existence)
67 KHASH_SET_INIT_INT(s_i2i)
68 
69 // For brevity
70 typedef unsigned char uc;
71 
72 /*
73  * A union for the preservation map. Required for khash.
74  */
75 typedef union {
76     int i;
77     char *p;
78 } pmap_t;
79 
80 // Generates static functions here which isn't ideal, but we have no way
81 // currently to declare the kh_map_t structure here without also declaring a
82 // duplicate in the .c files due to the nature of the KHASH macros.
83 KHASH_MAP_INIT_STR(map, pmap_t)
84 
85 struct hFILE;
86 
87 #define SEQS_PER_SLICE 10000
88 #define BASES_PER_SLICE (SEQS_PER_SLICE*500)
89 #define SLICE_PER_CNT  1
90 
91 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
92 
93 #define MAX_STAT_VAL 1024
94 //#define MAX_STAT_VAL 16
95 typedef struct cram_stats {
96     int freqs[MAX_STAT_VAL];
97     khash_t(m_i2i) *h;
98     int nsamp; // total number of values added
99     int nvals; // total number of unique values added
100     int64_t min_val, max_val;
101 } cram_stats;
102 
103 /* NB: matches java impl, not the spec */
104 enum cram_encoding {
105     E_NULL               = 0,
106     E_EXTERNAL           = 1,  // Only for BYTE type in CRAM 4
107     E_GOLOMB             = 2,  // Not in CRAM 4
108     E_HUFFMAN            = 3,  // Not in CRAM 4
109     E_BYTE_ARRAY_LEN     = 4,
110     E_BYTE_ARRAY_STOP    = 5,
111     E_BETA               = 6,  // Not in CRAM 4
112     E_SUBEXP             = 7,  // Not in CRAM 4
113     E_GOLOMB_RICE        = 8,  // Not in CRAM 4
114     E_GAMMA              = 9,  // Not in CRAM 4
115 
116     // CRAM 4 specific codecs
117     E_VARINT_UNSIGNED    = 41, // Specialisation of EXTERNAL
118     E_VARINT_SIGNED      = 42, // Specialisation of EXTERNAL
119     E_CONST_BYTE         = 43, // Alternative to HUFFMAN with 1 symbol
120     E_CONST_INT          = 44, // Alternative to HUFFMAN with 1 symbol
121 
122     // More experimental ideas, not documented in spec yet
123     E_XHUFFMAN           = 50, // To external block
124     E_XPACK              = 51, // Transform to sub-codec
125     E_XRLE               = 52, // Transform to sub-codec
126     E_XDELTA             = 53, // Transform to sub-codec
127 
128     // Total number of codecs, not a real one.
129     E_NUM_CODECS,
130 };
131 
132 enum cram_external_type {
133     E_INT                = 1,
134     E_LONG               = 2,
135     E_BYTE               = 3,
136     E_BYTE_ARRAY         = 4,
137     E_BYTE_ARRAY_BLOCK   = 5,
138     E_SINT               = 6, // signed INT
139     E_SLONG              = 7, // signed LONG
140 };
141 
142 /* External IDs used by this implementation (only assumed during writing) */
143 enum cram_DS_ID {
144     DS_CORE   = 0,
145     DS_aux    = 1, // aux_blk
146     DS_aux_OQ = 2,
147     DS_aux_BQ = 3,
148     DS_aux_BD = 4,
149     DS_aux_BI = 5,
150     DS_aux_FZ = 6, // also ZM:B
151     DS_aux_oq = 7, // other qualities
152     DS_aux_os = 8, // other sequences
153     DS_aux_oz = 9, // other strings
154     DS_ref,
155     DS_RN, // name_blk
156     DS_QS, // qual_blk
157     DS_IN, // base_blk
158     DS_SC, // soft_blk
159 
160     DS_BF, // start loop
161     DS_CF,
162     DS_AP,
163     DS_RG,
164     DS_MQ,
165     DS_NS,
166     DS_MF,
167     DS_TS,
168     DS_NP,
169     DS_NF,
170     DS_RL,
171     DS_FN,
172     DS_FC,
173     DS_FP,
174     DS_DL,
175     DS_BA,
176     DS_BS,
177     DS_TL,
178     DS_RI,
179     DS_RS,
180     DS_PD,
181     DS_HC,
182     DS_BB,
183     DS_QQ,
184 
185     DS_TN, // end loop
186 
187     DS_RN_len,
188     DS_SC_len,
189     DS_BB_len,
190     DS_QQ_len,
191 
192     DS_TC, // CRAM v1.0 tags
193     DS_TM, // test
194     DS_TV, // test
195 
196     DS_END,
197 };
198 
199 /* "File Definition Structure" */
200 struct cram_file_def {
201     char    magic[4];
202     uint8_t major_version;
203     uint8_t minor_version;
204     char    file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum
205 };
206 
207 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
208 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
209 
210 struct cram_slice;
211 
212 // Internal version of htslib/cram.h enum.
213 // Note these have to match the laout of methmap and methcost in
214 // cram_io.c:cram_compress_block2
215 enum cram_block_method_int {
216     // Public methods as defined in the CRAM spec.
217     BM_ERROR = -1,
218 
219     // CRAM 2.x and 3.0
220     RAW      = 0,
221     GZIP     = 1,
222     BZIP2    = 2,
223     LZMA     = 3,
224     RANS     = 4, RANS0 = RANS,
225 
226     // CRAM 3.1 onwards
227     RANSPR   = 5, RANS_PR0  = RANSPR,
228     ARITH    = 6, ARITH_PR0 = ARITH,
229     FQZ      = 7,
230     TOK3     = 8,
231     // BSC = 9, ZSTD = 10
232 
233     // Methods not externalised, but used in metrics.
234     // Externally they become one of the above methods.
235     GZIP_RLE = 11,
236     GZIP_1,      // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM
237 
238     FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods
239 
240   //RANS0,       // Order 0
241     RANS1,
242 
243   //RANS_PR0,    // Order 0
244     RANS_PR1,    // Order 1
245     RANS_PR64,   // O0 + RLE
246     RANS_PR9,    // O1 + X4
247     RANS_PR128,  // O0 + Pack
248     RANS_PR129,  // O1 + Pack
249     RANS_PR192,  // O0 + RLE + pack
250     RANS_PR193,  // O1 + RLE + pack
251 
252   //TOK3,   // tok+rans
253     TOKA,   // tok+arith
254 
255   //ARITH_PR0,   // Order 0
256     ARITH_PR1,   // Order 1
257     ARITH_PR64,  // O0 + RLE
258     ARITH_PR9,   // O1 + X4
259     ARITH_PR128, // O0 + Pack
260     ARITH_PR129, // O1 + Pack
261     ARITH_PR192, // O0 + RLE + pack
262     ARITH_PR193, // O1 + RLE + pack
263 
264     // NB: must end on no more than 31 unless we change to a
265     // 64-bit method type.
266 };
267 
268 /* Now in htslib/cram.h
269 enum cram_content_type {
270     CT_ERROR           = -1,
271     FILE_HEADER        = 0,
272     COMPRESSION_HEADER = 1,
273     MAPPED_SLICE       = 2,
274     UNMAPPED_SLICE     = 3, // CRAM V1.0 only
275     EXTERNAL           = 4,
276     CORE               = 5,
277 };
278 */
279 
280 /* Maximum simultaneous codecs allowed, 1 per bit */
281 #define CRAM_MAX_METHOD 32
282 
283 /* Compression metrics */
284 struct cram_metrics {
285     // number of trials and time to next trial
286     int trial;
287     int next_trial;
288     int consistency;
289 
290     // aggregate sizes during trials
291     int sz[CRAM_MAX_METHOD];
292     int input_avg_sz, input_avg_delta;
293 
294     // resultant method from trials
295     int method, revised_method;
296     int strat;
297 
298     // Revisions of method, to allow culling of continually failing ones.
299     int cnt[CRAM_MAX_METHOD];
300 
301     double extra[CRAM_MAX_METHOD];
302 
303     // Not amenable to rANS bit-packing techniques; cardinality > 16
304     int unpackable;
305 };
306 
307 // Hash aux key (XX:i) to cram_metrics
308 KHASH_MAP_INIT_INT(m_metrics, cram_metrics*)
309 
310 
311 /* Block */
312 struct cram_block {
313     enum cram_block_method_int  method, orig_method;
314     enum cram_content_type  content_type;
315     int32_t  content_id;
316     int32_t  comp_size;
317     int32_t  uncomp_size;
318     uint32_t crc32;
319     int32_t  idx; /* offset into data */
320     unsigned char    *data;
321 
322     // For bit I/O
323     size_t alloc;
324     size_t byte;
325     int bit;
326 
327     // To aid compression
328     cram_metrics *m; // used to track aux block compression only
329 
330     int crc32_checked;
331     uint32_t crc_part;
332 };
333 
334 struct cram_codec; /* defined in cram_codecs.h */
335 struct cram_map;
336 
337 #define CRAM_MAP_HASH 32
338 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
339 
340 /* Compression header block */
341 struct cram_block_compression_hdr {
342     int32_t ref_seq_id;
343     int64_t ref_seq_start;
344     int64_t ref_seq_span;
345     int32_t num_records;
346     int32_t num_landmarks;
347     int32_t *landmark;
348 
349     /* Flags from preservation map */
350     int read_names_included;
351     int AP_delta;
352     // indexed by ref-base and subst. code
353     char substitution_matrix[5][4];
354     int no_ref;
355     int qs_seq_orient; // 1 => same as seq. 0 => original orientation
356 
357     // TD Dictionary as a concatenated block
358     cram_block *TD_blk;          // Tag Dictionary
359     int nTL;                     // number of TL entries in TD
360     unsigned char **TL;          // array of size nTL, pointer into TD_blk.
361     khash_t(m_s2i) *TD_hash;     // Keyed on TD strings, map to TL[] indices
362     string_alloc_t *TD_keys;     // Pooled keys for TD hash.
363 
364     khash_t(map) *preservation_map;
365     struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
366     struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
367 
368     struct cram_codec *codecs[DS_END];
369 
370     char *uncomp; // A single block of uncompressed data
371     size_t uncomp_size, uncomp_alloc;
372 
373     // Total codec count, used for index to block_by_id for transforms
374     int ncodecs;
375 };
376 
377 typedef struct cram_map {
378     int key;    /* 0xe0 + 3 bytes */
379     enum cram_encoding encoding;
380     int offset; /* Offset into a single block of memory */
381     int size;   /* Size */
382     struct cram_codec *codec;
383     struct cram_map *next; // for noddy internal hash
384 } cram_map;
385 
386 typedef struct cram_tag_map {
387     struct cram_codec *codec;
388     cram_block *blk;
389     cram_block *blk2;
390     cram_metrics *m;
391 } cram_tag_map;
392 
393 // Hash aux key (XX:i) to cram_tag_map
394 KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*)
395 
396 /* Mapped or unmapped slice header block */
397 struct cram_block_slice_hdr {
398     enum cram_content_type content_type;
399     int32_t ref_seq_id;     /* if content_type == MAPPED_SLICE */
400     int64_t ref_seq_start;  /* if content_type == MAPPED_SLICE */
401     int64_t ref_seq_span;   /* if content_type == MAPPED_SLICE */
402     int32_t num_records;
403     int64_t record_counter;
404     int32_t num_blocks;
405     int32_t num_content_ids;
406     int32_t *block_content_ids;
407     int32_t ref_base_id;    /* if content_type == MAPPED_SLICE */
408     unsigned char md5[16];
409 };
410 
411 struct ref_entry;
412 
413 /*
414  * Container.
415  *
416  * Conceptually a container is split into slices, and slices into blocks.
417  * However on disk it's just a list of blocks and we need to query the
418  * block types to identify the start/end points of the slices.
419  *
420  * OR... are landmarks the start/end points of slices?
421  */
422 struct cram_container {
423     int32_t  length;
424     int32_t  ref_seq_id;
425     int64_t  ref_seq_start;
426     int64_t  ref_seq_span;
427     int64_t  record_counter;
428     int64_t  num_bases;
429     int32_t  num_records;
430     int32_t  num_blocks;
431     int32_t  num_landmarks;
432     int32_t *landmark;
433 
434     /* Size of container header above */
435     size_t   offset;
436 
437     /* Compression header is always the first block? */
438     cram_block_compression_hdr *comp_hdr;
439     cram_block *comp_hdr_block;
440 
441     /* For construction purposes */
442     int max_slice, curr_slice;   // maximum number of slices
443     int curr_slice_mt;           // Curr_slice when reading ahead (via threads)
444     int max_rec, curr_rec;       // current and max recs per slice
445     int max_c_rec, curr_c_rec;   // current and max recs per container
446     int slice_rec;               // rec no. for start of this slice
447     int curr_ref;                // current ref ID. -2 for no previous
448     int64_t last_pos;                // last record position
449     struct cram_slice **slices, *slice;
450     int pos_sorted;              // boolean, 1=>position sorted data
451     int64_t max_apos;                // maximum position, used if pos_sorted==0
452     int last_slice;              // number of reads in last slice (0 for 1st)
453     int multi_seq;               // true if packing multi seqs per cont/slice
454     int unsorted;                // true is AP_delta is 0.
455     int qs_seq_orient;           // 1 => same as seq. 0 => original orientation
456 
457     /* Copied from fd before encoding, to allow multi-threading */
458     int ref_start, first_base, last_base, ref_id, ref_end;
459     char *ref;
460     //struct ref_entry *ref;
461 
462     /* For multi-threading */
463     bam_seq_t **bams;
464 
465     /* Statistics for encoding */
466     cram_stats *stats[DS_END];
467 
468     khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map
469     int *refs_used;       // array of frequency of ref seq IDs
470 
471     uint32_t crc32;       // CRC32
472 
473     uint64_t s_num_bases; // number of bases in this slice
474 
475     uint32_t n_mapped;    // Number of mapped reads
476 };
477 
478 /*
479  * A single cram record
480  */
481 typedef struct cram_record {
482     struct cram_slice *s; // Filled out by cram_decode only
483 
484     int32_t ref_id;       // fixed for all recs in slice?
485     int32_t flags;        // BF
486     int32_t cram_flags;   // CF
487     int32_t len;          // RL
488     int64_t apos;         // AP
489     int32_t rg;           // RG
490     int32_t name;         // RN; idx to s->names_blk
491     int32_t name_len;
492     int32_t mate_line;    // index to another cram_record
493     int32_t mate_ref_id;
494     int64_t mate_pos;     // NP
495     int64_t tlen;         // TS
496     int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing
497 
498     // Auxiliary data
499     int32_t ntags;        // TC
500     int32_t aux;          // idx to s->aux_blk
501     int32_t aux_size;     // total size of packed ntags in aux_blk
502 #ifndef TN_external
503     int32_t TN_idx;       // TN; idx to s->TN;
504 #else
505     int32_t tn;           // idx to s->tn_blk
506 #endif
507     int     TL;
508 
509     int32_t seq;          // idx to s->seqs_blk
510     int32_t qual;         // idx to s->qual_blk
511     int32_t cigar;        // idx to s->cigar
512     int32_t ncigar;
513     int64_t aend;         // alignment end
514     int32_t mqual;        // MQ
515 
516     int32_t feature;      // idx to s->feature
517     int32_t nfeature;     // number of features
518     int32_t mate_flags;   // MF
519 } cram_record;
520 
521 // Accessor macros as an analogue of the bam ones
522 #define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
523 #define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
524 #define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
525 #define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
526 #define cram_seqi(c,i)   (cram_seq((c))[(i)])
527 #define cram_name_len(c) ((c)->name_len)
528 #define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
529 #define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
530 #define cram_cigar(c)    (&((cr)->s->cigar)[(c)->cigar])
531 
532 /*
533  * A feature is a base difference, used for the sequence reference encoding.
534  * (We generate these internally when writing CRAM.)
535  */
536 typedef union cram_feature {
537     struct {
538         int pos;
539         int code;
540         int base;    // substitution code
541     } X;
542     struct {
543         int pos;
544         int code;
545         int base;    // actual base & qual
546         int qual;
547     } B;
548     struct {
549         int pos;
550         int code;
551         int seq_idx; // index to s->seqs_blk
552         int len;
553     } b;
554     struct {
555         int pos;
556         int code;
557         int qual;
558     } Q;
559     struct {
560         int pos;
561         int code;
562         int len;
563         int seq_idx; // soft-clip multiple bases
564     } S;
565     struct {
566         int pos;
567         int code;
568         int len;
569         int seq_idx; // insertion multiple bases
570     } I;
571     struct {
572         int pos;
573         int code;
574         int base; // insertion single base
575     } i;
576     struct {
577         int pos;
578         int code;
579         int len;
580     } D;
581     struct {
582         int pos;
583         int code;
584         int len;
585     } N;
586     struct {
587         int pos;
588         int code;
589         int len;
590     } P;
591     struct {
592         int pos;
593         int code;
594         int len;
595     } H;
596 } cram_feature;
597 
598 /*
599  * A slice is really just a set of blocks, but it
600  * is the logical unit for decoding a number of
601  * sequences.
602  */
603 struct cram_slice {
604     cram_block_slice_hdr *hdr;
605     cram_block *hdr_block;
606     cram_block **block;
607     cram_block **block_by_id;
608 
609     /* State used during encoding/decoding */
610     int64_t last_apos, max_apos;
611 
612     /* Array of decoded cram records */
613     cram_record *crecs;
614 
615     /* An dynamically growing buffers for data pointed
616      * to by crecs[] array.
617      */
618     uint32_t  *cigar;
619     uint32_t   cigar_alloc;
620     uint32_t   ncigar;
621 
622     cram_feature *features;
623     int           nfeatures;
624     int           afeatures; // allocated size of features
625 
626 #ifndef TN_external
627     // TN field (Tag Name)
628     uint32_t      *TN;
629     int           nTN, aTN;  // used and allocated size for TN[]
630 #else
631     cram_block *tn_blk;
632     int tn_id;
633 #endif
634 
635     // For variable sized elements which are always external blocks.
636     cram_block *name_blk;
637     cram_block *seqs_blk;
638     cram_block *qual_blk;
639     cram_block *base_blk;
640     cram_block *soft_blk;
641     cram_block *aux_blk;       // BAM aux block, created while decoding CRAM
642 
643     string_alloc_t *pair_keys; // Pooled keys for pair hash.
644     khash_t(m_s2i) *pair[2];   // for identifying read-pairs in this slice.
645 
646     char *ref;                 // slice of current reference
647     int ref_start;             // start position of current reference;
648     int ref_end;               // end position of current reference;
649     int ref_id;
650 
651     // For going from BAM to CRAM; an array of auxiliary blocks per type
652     int naux_block;
653     cram_block **aux_block;
654 
655     unsigned int data_series; // See cram_fields enum
656     int decode_md;
657 
658     int max_rec, curr_rec;       // current and max recs per slice
659     int slice_num;               // To be copied into c->curr_slice in decode
660 };
661 
662 /*-----------------------------------------------------------------------------
663  * Consider moving reference handling to cram_refs.[ch]
664  */
665 // from fa.fai / samtools faidx files
666 typedef struct ref_entry {
667     char *name;
668     char *fn;
669     int64_t length;
670     int64_t offset;
671     int bases_per_line;
672     int line_length;
673     int64_t count;         // for shared references so we know to dealloc seq
674     char *seq;
675     mFILE *mf;
676     int is_md5;            // Reference comes from a raw seq found by MD5
677 } ref_entry;
678 
679 KHASH_MAP_INIT_STR(refs, ref_entry*)
680 
681 // References structure.
682 struct refs_t {
683     string_alloc_t *pool;  // String pool for holding filenames and SN vals
684 
685     khash_t(refs) *h_meta; // ref_entry*, index by name
686     ref_entry **ref_id;    // ref_entry*, index by ID
687     int nref;              // number of ref_entry
688 
689     char *fn;              // current file opened
690     BGZF *fp;              // and the hFILE* to go with it.
691 
692     int count;             // how many cram_fd sharing this refs struct
693 
694     pthread_mutex_t lock;  // Mutex for multi-threaded updating
695     ref_entry *last;       // Last queried sequence
696     int last_id;           // Used in cram_ref_decr_locked to delay free
697 };
698 
699 /*-----------------------------------------------------------------------------
700  * CRAM index
701  *
702  * Detect format by number of entries per line.
703  * 5 => 1.0 (refid, start, nseq, C offset, slice)
704  * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
705  *
706  * Indices are stored in a nested containment list, which is trivial to set
707  * up as the indices are on sorted data so we're appending to the nclist
708  * in sorted order. Basically if a slice entirely fits within a previous
709  * slice then we append to that slices list. This is done recursively.
710  *
711  * Lists are sorted on two dimensions: ref id + slice coords.
712  */
713 typedef struct cram_index {
714     int nslice, nalloc;   // total number of slices
715     struct cram_index *e; // array of size nslice
716 
717     int     refid;  // 1.0                 1.1
718     int     start;  // 1.0                 1.1
719     int     end;    //                     1.1
720     int     nseq;   // 1.0 - undocumented
721     int     slice;  // 1.0 landmark index, 1.1 landmark value
722     int     len;    //                     1.1 - size of slice in bytes
723     int64_t offset; // 1.0                 1.1
724     int64_t next;   // derived: offset of next container.
725 } cram_index;
726 
727 typedef struct {
728     int refid;
729     int64_t start;
730     int64_t end;
731 } cram_range;
732 
733 /*-----------------------------------------------------------------------------
734  */
735 /* CRAM File handle */
736 
737 typedef struct spare_bams {
738     bam_seq_t **bams;
739     struct spare_bams *next;
740 } spare_bams;
741 
742 struct cram_fd;
743 typedef struct varint_vec {
744     // Returns number of bytes decoded from fd, 0 on error
745     int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc);
746     int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc);
747     int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc);
748 
749     // Returns the value and increments *cp.  Sets err to 1 iff an error occurs.
750     // NOTE: Does not set err to 0 on success.
751     int64_t (*varint_get32) (char **cp, const char *endp, int *err);
752     int64_t (*varint_get32s)(char **cp, const char *endp, int *err);
753     int64_t (*varint_get64) (char **cp, const char *endp, int *err);
754     int64_t (*varint_get64s)(char **cp, const char *endp, int *err);
755 
756     // Returns the number of bytes written, <= 0 on error.
757     int (*varint_put32) (char *cp, char *endp, int32_t val_p);
758     int (*varint_put32s)(char *cp, char *endp, int32_t val_p);
759     int (*varint_put64) (char *cp, char *endp, int64_t val_p);
760     int (*varint_put64s)(char *cp, char *endp, int64_t val_p);
761 
762     // Returns the number of bytes written, <= 0 on error.
763     int (*varint_put32_blk) (cram_block *blk, int32_t val_p);
764     int (*varint_put32s_blk)(cram_block *blk, int32_t val_p);
765     int (*varint_put64_blk) (cram_block *blk, int64_t val_p);
766     int (*varint_put64s_blk)(cram_block *blk, int64_t val_p);
767 
768     // Returns number of bytes needed to encode 'val'
769     int (*varint_size)(int64_t val);
770 } varint_vec;
771 
772 struct cram_fd {
773     struct hFILE  *fp;
774     int            mode;     // 'r' or 'w'
775     int            version;
776     cram_file_def *file_def;
777     sam_hdr_t     *header;
778 
779     char          *prefix;
780     int64_t        record_counter;
781     int            err;
782 
783     // Most recent compression header decoded
784     //cram_block_compression_hdr *comp_hdr;
785     //cram_block_slice_hdr       *slice_hdr;
786 
787     // Current container being processed
788     cram_container *ctr;
789 
790     // Current container used for decoder threads
791     cram_container *ctr_mt;
792 
793     // positions for encoding or decoding
794     int first_base, last_base;
795 
796     // cached reference portion
797     refs_t *refs;              // ref meta-data structure
798     char *ref, *ref_free;      // current portion held in memory
799     int   ref_id;
800     int   ref_start;
801     int   ref_end;
802     char *ref_fn;   // reference fasta filename
803 
804     // compression level and metrics
805     int level;
806     cram_metrics *m[DS_END];
807     khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use.
808 
809     // options
810     int decode_md; // Whether to export MD and NM tags
811     int seqs_per_slice;
812     int bases_per_slice;
813     int slices_per_container;
814     int embed_ref;
815     int no_ref;
816     int ignore_md5;
817     int use_bz2;
818     int use_rans;
819     int use_lzma;
820     int use_fqz;
821     int use_tok;
822     int use_arith;
823     int shared_ref;
824     unsigned int required_fields;
825     int store_md;
826     int store_nm;
827     cram_range range;
828 
829     // lookup tables, stored here so we can be trivially multi-threaded
830     unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
831     unsigned int cram_flag_swap[0x1000];// bam -> cram flags
832     unsigned char L1[256];              // ACGT{*} ->0123{4}
833     unsigned char L2[256];              // ACGTN{*}->01234{5}
834     char cram_sub_matrix[32][32];       // base substitution codes
835 
836     int         index_sz;
837     cram_index *index;                  // array, sizeof index_sz
838     off_t first_container;
839     off_t curr_position;
840     int eof;
841     int last_slice;                     // number of recs encoded in last slice
842     int last_RI_count;                  // number of references encoded in last container
843     int multi_seq;                      // -1 is auto, 0 is one ref per container, 1 is multi...
844     int multi_seq_user;                 // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE)
845     int unsorted;
846     int last_mapped;                    // number of mapped reads in last container
847     int empty_container;                // Marker for EOF block
848 
849     // thread pool
850     int own_pool;
851     hts_tpool *pool;
852     hts_tpool_process *rqueue;
853     pthread_mutex_t metrics_lock;
854     pthread_mutex_t ref_lock;
855     pthread_mutex_t range_lock;
856     spare_bams *bl;
857     pthread_mutex_t bam_list_lock;
858     void *job_pending;
859     int ooc;                            // out of containers.
860 
861     int lossy_read_names;               // boolean
862     int tlen_approx;                    // max TLEN calculation offset.
863     int tlen_zero;                      // If true, permit tlen 0 (=> tlen calculated)
864 
865     BGZF *idxfp;                        // File pointer for on-the-fly index creation
866 
867     // variable integer decoding callbacks.
868     // This changed in CRAM4.0 to a data-size agnostic encoding.
869     varint_vec vv;
870 
871     // Force AP delta even on non positional sorted data.
872     // This can be beneficial for pairs where pairs are nearby each other.
873     // We suffer with delta to unrelated things (previous pair), but gain
874     // in delta between them.  (Ideal would be a per read setting.)
875     int ap_delta;
876 };
877 
878 // Translation of required fields to cram data series
879 enum cram_fields {
880     CRAM_BF = 0x00000001,
881     CRAM_AP = 0x00000002,
882     CRAM_FP = 0x00000004,
883     CRAM_RL = 0x00000008,
884     CRAM_DL = 0x00000010,
885     CRAM_NF = 0x00000020,
886     CRAM_BA = 0x00000040,
887     CRAM_QS = 0x00000080,
888     CRAM_FC = 0x00000100,
889     CRAM_FN = 0x00000200,
890     CRAM_BS = 0x00000400,
891     CRAM_IN = 0x00000800,
892     CRAM_RG = 0x00001000,
893     CRAM_MQ = 0x00002000,
894     CRAM_TL = 0x00004000,
895     CRAM_RN = 0x00008000,
896     CRAM_NS = 0x00010000,
897     CRAM_NP = 0x00020000,
898     CRAM_TS = 0x00040000,
899     CRAM_MF = 0x00080000,
900     CRAM_CF = 0x00100000,
901     CRAM_RI = 0x00200000,
902     CRAM_RS = 0x00400000,
903     CRAM_PD = 0x00800000,
904     CRAM_HC = 0x01000000,
905     CRAM_SC = 0x02000000,
906     CRAM_BB = 0x04000000,
907     CRAM_BB_len = 0x08000000,
908     CRAM_QQ = 0x10000000,
909     CRAM_QQ_len = 0x20000000,
910     CRAM_aux= 0x40000000,
911     CRAM_ALL= 0x7fffffff,
912 };
913 
914 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
915 // encode a base difference, but we don't need to know what it is for CIGAR.
916 // If we have a soft-clip or insertion, we do need SC/IN though to know how
917 // long that array is.
918 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
919                     CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
920 
921 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
922                   CRAM_RL    | CRAM_AP | CRAM_BB)
923 
924 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
925 
926 /* BF bitfields */
927 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
928 #define CRAM_FPAIRED      256
929 #define CRAM_FPROPER_PAIR 128
930 #define CRAM_FUNMAP        64
931 #define CRAM_FREVERSE      32
932 #define CRAM_FREAD1        16
933 #define CRAM_FREAD2         8
934 #define CRAM_FSECONDARY     4
935 #define CRAM_FQCFAIL        2
936 #define CRAM_FDUP           1
937 
938 #define DS_aux_S "\001"
939 #define DS_aux_OQ_S "\002"
940 #define DS_aux_BQ_S "\003"
941 #define DS_aux_BD_S "\004"
942 #define DS_aux_BI_S "\005"
943 #define DS_aux_FZ_S "\006"
944 #define DS_aux_oq_S "\007"
945 #define DS_aux_os_S "\010"
946 #define DS_aux_oz_S "\011"
947 
948 #define CRAM_M_REVERSE  1
949 #define CRAM_M_UNMAP    2
950 
951 
952 /* CF bitfields */
953 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
954 #define CRAM_FLAG_DETACHED             (1<<1)
955 #define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
956 #define CRAM_FLAG_NO_SEQ               (1<<3)
957 #define CRAM_FLAG_EXPLICIT_TLEN        (1<<4)
958 #define CRAM_FLAG_MASK                 ((1<<5)-1)
959 
960 /* Internal only */
961 #define CRAM_FLAG_STATS_ADDED          (1<<30)
962 #define CRAM_FLAG_DISCARD_NAME         (1U<<31)
963 
964 #ifdef __cplusplus
965 }
966 #endif
967 
968 #endif /* HTSLIB_CRAM_STRUCTS_H */
969