1 /*
2 Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifndef HTSLIB_CRAM_STRUCTS_H
32 #define HTSLIB_CRAM_STRUCTS_H
33 
34 /*
35  * Defines in-memory structs for the basic file-format objects in the
36  * CRAM format.
37  *
38  * The basic file format is:
39  *     File-def SAM-hdr Container Container ...
40  *
41  * Container:
42  *     Service-block data-block data-block ...
43  *
44  * Multiple blocks in a container are grouped together as slices,
45  * also sometimes referred to as landmarks in the spec.
46  */
47 
48 
49 #include <pthread.h>
50 #include <stdint.h>
51 #include <sys/types.h>
52 
53 #include "../htslib/thread_pool.h"
54 #include "../htslib/cram.h"
55 #include "string_alloc.h"
56 #include "mFILE.h"
57 #include "../htslib/khash.h"
58 
59 #ifdef __cplusplus
60 extern "C" {
61 #endif
62 
63 // Generic hash-map integer -> integer
64 KHASH_MAP_INIT_INT64(m_i2i, int)
65 
66 // Generic hash-set integer -> (existence)
67 KHASH_SET_INIT_INT(s_i2i)
68 
69 // For brevity
70 typedef unsigned char uc;
71 
72 /*
73  * A union for the preservation map. Required for khash.
74  */
75 typedef union {
76     int i;
77     char *p;
78 } pmap_t;
79 
80 // Generates static functions here which isn't ideal, but we have no way
81 // currently to declare the kh_map_t structure here without also declaring a
82 // duplicate in the .c files due to the nature of the KHASH macros.
83 KHASH_MAP_INIT_STR(map, pmap_t)
84 
85 struct hFILE;
86 
87 #define SEQS_PER_SLICE 10000
88 #define BASES_PER_SLICE (SEQS_PER_SLICE*500)
89 #define SLICE_PER_CNT  1
90 
91 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
92 
93 #define MAX_STAT_VAL 1024
94 //#define MAX_STAT_VAL 16
95 typedef struct cram_stats {
96     int freqs[MAX_STAT_VAL];
97     khash_t(m_i2i) *h;
98     int nsamp; // total number of values added
99     int nvals; // total number of unique values added
100 } cram_stats;
101 
102 /* NB: matches java impl, not the spec */
103 enum cram_encoding {
104     E_NULL               = 0,
105     E_EXTERNAL           = 1,
106     E_GOLOMB             = 2,
107     E_HUFFMAN            = 3,
108     E_BYTE_ARRAY_LEN     = 4,
109     E_BYTE_ARRAY_STOP    = 5,
110     E_BETA               = 6,
111     E_SUBEXP             = 7,
112     E_GOLOMB_RICE        = 8,
113     E_GAMMA              = 9,
114     E_NUM_CODECS         = 10, /* Number of codecs, not a real one. */
115 };
116 
117 enum cram_external_type {
118     E_INT                = 1,
119     E_LONG               = 2,
120     E_BYTE               = 3,
121     E_BYTE_ARRAY         = 4,
122     E_BYTE_ARRAY_BLOCK   = 5,
123 };
124 
125 /* External IDs used by this implementation (only assumed during writing) */
126 enum cram_DS_ID {
127     DS_CORE   = 0,
128     DS_aux    = 1, // aux_blk
129     DS_aux_OQ = 2,
130     DS_aux_BQ = 3,
131     DS_aux_BD = 4,
132     DS_aux_BI = 5,
133     DS_aux_FZ = 6, // also ZM:B
134     DS_aux_oq = 7, // other qualities
135     DS_aux_os = 8, // other sequences
136     DS_aux_oz = 9, // other strings
137     DS_ref,
138     DS_RN, // name_blk
139     DS_QS, // qual_blk
140     DS_IN, // base_blk
141     DS_SC, // soft_blk
142 
143     DS_BF, // start loop
144     DS_CF,
145     DS_AP,
146     DS_RG,
147     DS_MQ,
148     DS_NS,
149     DS_MF,
150     DS_TS,
151     DS_NP,
152     DS_NF,
153     DS_RL,
154     DS_FN,
155     DS_FC,
156     DS_FP,
157     DS_DL,
158     DS_BA,
159     DS_BS,
160     DS_TL,
161     DS_RI,
162     DS_RS,
163     DS_PD,
164     DS_HC,
165     DS_BB,
166     DS_QQ,
167 
168     DS_TN, // end loop
169 
170     DS_RN_len,
171     DS_SC_len,
172     DS_BB_len,
173     DS_QQ_len,
174 
175     DS_TC, // CRAM v1.0 tags
176     DS_TM, // test
177     DS_TV, // test
178 
179     DS_END,
180 };
181 
182 /* "File Definition Structure" */
183 struct cram_file_def {
184     char    magic[4];
185     uint8_t major_version;
186     uint8_t minor_version;
187     char    file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum
188 };
189 
190 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
191 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
192 
193 struct cram_slice;
194 
195 /* Now in htslib/cram.h
196 enum cram_block_method {
197     BM_ERROR = -1,
198     RAW      = 0,
199     GZIP     = 1,
200     BZIP2    = 2,
201     LZMA     = 3,
202     RANS     = 4,  // Generic; either order
203     RANS0    = 4,
204     RANS1    = 10, // Not externalised; stored as RANS (generic)
205     GZIP_RLE = 11, // NB: not externalised in CRAM
206 };
207 */
208 
209 /* Now in htslib/cram.h
210 enum cram_content_type {
211     CT_ERROR           = -1,
212     FILE_HEADER        = 0,
213     COMPRESSION_HEADER = 1,
214     MAPPED_SLICE       = 2,
215     UNMAPPED_SLICE     = 3, // CRAM V1.0 only
216     EXTERNAL           = 4,
217     CORE               = 5,
218 };
219 */
220 
221 /* Compression metrics */
222 struct cram_metrics {
223     // number of trials and time to next trial
224     int trial;
225     int next_trial;
226 
227     // aggregate sizes during trials
228     int sz_gz_rle;
229     int sz_gz_def;
230     int sz_rans0;
231     int sz_rans1;
232     int sz_bzip2;
233     int sz_lzma;
234 
235     // resultant method from trials
236     int method;
237     int strat;
238 
239     // Revisions of method, to allow culling of continually failing ones.
240     int gz_rle_cnt;
241     int gz_def_cnt;
242     int rans0_cnt;
243     int rans1_cnt;
244     int bzip2_cnt;
245     int lzma_cnt;
246     int revised_method;
247 
248     double gz_rle_extra;
249     double gz_def_extra;
250     double rans0_extra;
251     double rans1_extra;
252     double bzip2_extra;
253     double lzma_extra;
254 };
255 
256 // Hash aux key (XX:i) to cram_metrics
257 KHASH_MAP_INIT_INT(m_metrics, cram_metrics*)
258 
259 
260 /* Block */
261 struct cram_block {
262     enum cram_block_method  method, orig_method;
263     enum cram_content_type  content_type;
264     int32_t  content_id;
265     int32_t  comp_size;
266     int32_t  uncomp_size;
267     uint32_t crc32;
268     int32_t  idx; /* offset into data */
269     unsigned char    *data;
270 
271     // For bit I/O
272     size_t alloc;
273     size_t byte;
274     int bit;
275 
276     // To aid compression
277     cram_metrics *m; // used to track aux block compression only
278 
279     int crc32_checked;
280     uint32_t crc_part;
281 };
282 
283 struct cram_codec; /* defined in cram_codecs.h */
284 struct cram_map;
285 
286 #define CRAM_MAP_HASH 32
287 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
288 
289 /* Compression header block */
290 struct cram_block_compression_hdr {
291     int32_t ref_seq_id;
292     int64_t ref_seq_start;
293     int64_t ref_seq_span;
294     int32_t num_records;
295     int32_t num_landmarks;
296     int32_t *landmark;
297 
298     /* Flags from preservation map */
299     int read_names_included;
300     int AP_delta;
301     // indexed by ref-base and subst. code
302     char substitution_matrix[5][4];
303     int no_ref;
304 
305     // TD Dictionary as a concatenated block
306     cram_block *TD_blk;          // Tag Dictionary
307     int nTL;                     // number of TL entries in TD
308     unsigned char **TL;          // array of size nTL, pointer into TD_blk.
309     khash_t(m_s2i) *TD_hash;     // Keyed on TD strings, map to TL[] indices
310     string_alloc_t *TD_keys;     // Pooled keys for TD hash.
311 
312     khash_t(map) *preservation_map;
313     struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
314     struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
315 
316     struct cram_codec *codecs[DS_END];
317 
318     char *uncomp; // A single block of uncompressed data
319     size_t uncomp_size, uncomp_alloc;
320 };
321 
322 typedef struct cram_map {
323     int key;    /* 0xe0 + 3 bytes */
324     enum cram_encoding encoding;
325     int offset; /* Offset into a single block of memory */
326     int size;   /* Size */
327     struct cram_codec *codec;
328     struct cram_map *next; // for noddy internal hash
329 } cram_map;
330 
331 typedef struct cram_tag_map {
332     struct cram_codec *codec;
333     cram_block *blk;
334     cram_metrics *m;
335 } cram_tag_map;
336 
337 // Hash aux key (XX:i) to cram_tag_map
338 KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*)
339 
340 /* Mapped or unmapped slice header block */
341 struct cram_block_slice_hdr {
342     enum cram_content_type content_type;
343     int32_t ref_seq_id;     /* if content_type == MAPPED_SLICE */
344     int64_t ref_seq_start;  /* if content_type == MAPPED_SLICE */
345     int64_t ref_seq_span;   /* if content_type == MAPPED_SLICE */
346     int32_t num_records;
347     int64_t record_counter;
348     int32_t num_blocks;
349     int32_t num_content_ids;
350     int32_t *block_content_ids;
351     int32_t ref_base_id;    /* if content_type == MAPPED_SLICE */
352     unsigned char md5[16];
353 };
354 
355 struct ref_entry;
356 
357 /*
358  * Container.
359  *
360  * Conceptually a container is split into slices, and slices into blocks.
361  * However on disk it's just a list of blocks and we need to query the
362  * block types to identify the start/end points of the slices.
363  *
364  * OR... are landmarks the start/end points of slices?
365  */
366 struct cram_container {
367     int32_t  length;
368     int32_t  ref_seq_id;
369     int64_t  ref_seq_start;
370     int64_t  ref_seq_span;
371     int64_t  record_counter;
372     int64_t  num_bases;
373     int32_t  num_records;
374     int32_t  num_blocks;
375     int32_t  num_landmarks;
376     int32_t *landmark;
377 
378     /* Size of container header above */
379     size_t   offset;
380 
381     /* Compression header is always the first block? */
382     cram_block_compression_hdr *comp_hdr;
383     cram_block *comp_hdr_block;
384 
385     /* For construction purposes */
386     int max_slice, curr_slice;   // maximum number of slices
387     int curr_slice_mt;           // Curr_slice when reading ahead (via threads)
388     int max_rec, curr_rec;       // current and max recs per slice
389     int max_c_rec, curr_c_rec;   // current and max recs per container
390     int slice_rec;               // rec no. for start of this slice
391     int curr_ref;                // current ref ID. -2 for no previous
392     int64_t last_pos;                // last record position
393     struct cram_slice **slices, *slice;
394     int pos_sorted;              // boolean, 1=>position sorted data
395     int64_t max_apos;                // maximum position, used if pos_sorted==0
396     int last_slice;              // number of reads in last slice (0 for 1st)
397     int multi_seq;               // true if packing multi seqs per cont/slice
398     int unsorted;                // true is AP_delta is 0.
399 
400     /* Copied from fd before encoding, to allow multi-threading */
401     int ref_start, first_base, last_base, ref_id, ref_end;
402     char *ref;
403     //struct ref_entry *ref;
404 
405     /* For multi-threading */
406     bam_seq_t **bams;
407 
408     /* Statistics for encoding */
409     cram_stats *stats[DS_END];
410 
411     khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map
412     int *refs_used;       // array of frequency of ref seq IDs
413 
414     uint32_t crc32;       // CRC32
415 
416     uint64_t s_num_bases; // number of bases in this slice
417 
418     uint32_t n_mapped;    // Number of mapped reads
419 };
420 
421 /*
422  * A single cram record
423  */
424 typedef struct cram_record {
425     struct cram_slice *s; // Filled out by cram_decode only
426 
427     int32_t ref_id;       // fixed for all recs in slice?
428     int32_t flags;        // BF
429     int32_t cram_flags;   // CF
430     int32_t len;          // RL
431     int64_t apos;         // AP
432     int32_t rg;           // RG
433     int32_t name;         // RN; idx to s->names_blk
434     int32_t name_len;
435     int32_t mate_line;    // index to another cram_record
436     int32_t mate_ref_id;
437     int64_t mate_pos;     // NP
438     int64_t tlen;         // TS
439 
440     // Auxiliary data
441     int32_t ntags;        // TC
442     int32_t aux;          // idx to s->aux_blk
443     int32_t aux_size;     // total size of packed ntags in aux_blk
444 #ifndef TN_external
445     int32_t TN_idx;       // TN; idx to s->TN;
446 #else
447     int32_t tn;           // idx to s->tn_blk
448 #endif
449     int     TL;
450 
451     int32_t seq;          // idx to s->seqs_blk
452     int32_t qual;         // idx to s->qual_blk
453     int32_t cigar;        // idx to s->cigar
454     int32_t ncigar;
455     int64_t aend;         // alignment end
456     int32_t mqual;        // MQ
457 
458     int32_t feature;      // idx to s->feature
459     int32_t nfeature;     // number of features
460     int32_t mate_flags;   // MF
461 } cram_record;
462 
463 // Accessor macros as an analogue of the bam ones
464 #define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
465 #define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
466 #define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
467 #define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
468 #define cram_seqi(c,i)   (cram_seq((c))[(i)])
469 #define cram_name_len(c) ((c)->name_len)
470 #define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
471 #define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
472 #define cram_cigar(c)    (&((cr)->s->cigar)[(c)->cigar])
473 
474 /*
475  * A feature is a base difference, used for the sequence reference encoding.
476  * (We generate these internally when writing CRAM.)
477  */
478 typedef union cram_feature {
479     struct {
480         int pos;
481         int code;
482         int base;    // substitution code
483     } X;
484     struct {
485         int pos;
486         int code;
487         int base;    // actual base & qual
488         int qual;
489     } B;
490     struct {
491         int pos;
492         int code;
493         int seq_idx; // index to s->seqs_blk
494         int len;
495     } b;
496     struct {
497         int pos;
498         int code;
499         int qual;
500     } Q;
501     struct {
502         int pos;
503         int code;
504         int len;
505         int seq_idx; // soft-clip multiple bases
506     } S;
507     struct {
508         int pos;
509         int code;
510         int len;
511         int seq_idx; // insertion multiple bases
512     } I;
513     struct {
514         int pos;
515         int code;
516         int base; // insertion single base
517     } i;
518     struct {
519         int pos;
520         int code;
521         int len;
522     } D;
523     struct {
524         int pos;
525         int code;
526         int len;
527     } N;
528     struct {
529         int pos;
530         int code;
531         int len;
532     } P;
533     struct {
534         int pos;
535         int code;
536         int len;
537     } H;
538 } cram_feature;
539 
540 /*
541  * A slice is really just a set of blocks, but it
542  * is the logical unit for decoding a number of
543  * sequences.
544  */
545 struct cram_slice {
546     cram_block_slice_hdr *hdr;
547     cram_block *hdr_block;
548     cram_block **block;
549     cram_block **block_by_id;
550 
551     /* State used during encoding/decoding */
552     int64_t last_apos, max_apos;
553 
554     /* Array of decoded cram records */
555     cram_record *crecs;
556 
557     /* An dynamically growing buffers for data pointed
558      * to by crecs[] array.
559      */
560     uint32_t  *cigar;
561     uint32_t   cigar_alloc;
562     uint32_t   ncigar;
563 
564     cram_feature *features;
565     int           nfeatures;
566     int           afeatures; // allocated size of features
567 
568 #ifndef TN_external
569     // TN field (Tag Name)
570     uint32_t      *TN;
571     int           nTN, aTN;  // used and allocated size for TN[]
572 #else
573     cram_block *tn_blk;
574     int tn_id;
575 #endif
576 
577     // For variable sized elements which are always external blocks.
578     cram_block *name_blk;
579     cram_block *seqs_blk;
580     cram_block *qual_blk;
581     cram_block *base_blk;
582     cram_block *soft_blk;
583     cram_block *aux_blk;       // BAM aux block, created while decoding CRAM
584 
585     string_alloc_t *pair_keys; // Pooled keys for pair hash.
586     khash_t(m_s2i) *pair[2];   // for identifying read-pairs in this slice.
587 
588     char *ref;                 // slice of current reference
589     int ref_start;             // start position of current reference;
590     int ref_end;               // end position of current reference;
591     int ref_id;
592 
593     // For going from BAM to CRAM; an array of auxiliary blocks per type
594     int naux_block;
595     cram_block **aux_block;
596 
597     unsigned int data_series; // See cram_fields enum
598     int decode_md;
599 
600     int max_rec, curr_rec;       // current and max recs per slice
601     int slice_num;               // To be copied into c->curr_slice in decode
602 };
603 
604 /*-----------------------------------------------------------------------------
605  * Consider moving reference handling to cram_refs.[ch]
606  */
607 // from fa.fai / samtools faidx files
608 typedef struct ref_entry {
609     char *name;
610     char *fn;
611     int64_t length;
612     int64_t offset;
613     int bases_per_line;
614     int line_length;
615     int64_t count;         // for shared references so we know to dealloc seq
616     char *seq;
617     mFILE *mf;
618     int is_md5;            // Reference comes from a raw seq found by MD5
619 } ref_entry;
620 
621 KHASH_MAP_INIT_STR(refs, ref_entry*)
622 
623 // References structure.
624 struct refs_t {
625     string_alloc_t *pool;  // String pool for holding filenames and SN vals
626 
627     khash_t(refs) *h_meta; // ref_entry*, index by name
628     ref_entry **ref_id;    // ref_entry*, index by ID
629     int nref;              // number of ref_entry
630 
631     char *fn;              // current file opened
632     BGZF *fp;              // and the hFILE* to go with it.
633 
634     int count;             // how many cram_fd sharing this refs struct
635 
636     pthread_mutex_t lock;  // Mutex for multi-threaded updating
637     ref_entry *last;       // Last queried sequence
638     int last_id;           // Used in cram_ref_decr_locked to delay free
639 };
640 
641 /*-----------------------------------------------------------------------------
642  * CRAM index
643  *
644  * Detect format by number of entries per line.
645  * 5 => 1.0 (refid, start, nseq, C offset, slice)
646  * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
647  *
648  * Indices are stored in a nested containment list, which is trivial to set
649  * up as the indices are on sorted data so we're appending to the nclist
650  * in sorted order. Basically if a slice entirely fits within a previous
651  * slice then we append to that slices list. This is done recursively.
652  *
653  * Lists are sorted on two dimensions: ref id + slice coords.
654  */
655 typedef struct cram_index {
656     int nslice, nalloc;   // total number of slices
657     struct cram_index *e; // array of size nslice
658 
659     int     refid;  // 1.0                 1.1
660     int     start;  // 1.0                 1.1
661     int     end;    //                     1.1
662     int     nseq;   // 1.0 - undocumented
663     int     slice;  // 1.0 landmark index, 1.1 landmark value
664     int     len;    //                     1.1 - size of slice in bytes
665     int64_t offset; // 1.0                 1.1
666     int64_t next;   // derived: offset of next container.
667 } cram_index;
668 
669 typedef struct {
670     int refid;
671     int64_t start;
672     int64_t end;
673 } cram_range;
674 
675 /*-----------------------------------------------------------------------------
676  */
677 /* CRAM File handle */
678 
679 typedef struct spare_bams {
680     bam_seq_t **bams;
681     struct spare_bams *next;
682 } spare_bams;
683 
684 struct cram_fd {
685     struct hFILE  *fp;
686     int            mode;     // 'r' or 'w'
687     int            version;
688     cram_file_def *file_def;
689     sam_hdr_t     *header;
690 
691     char          *prefix;
692     int64_t        record_counter;
693     int            err;
694 
695     // Most recent compression header decoded
696     //cram_block_compression_hdr *comp_hdr;
697     //cram_block_slice_hdr       *slice_hdr;
698 
699     // Current container being processed
700     cram_container *ctr;
701 
702     // Current container used for decoder threads
703     cram_container *ctr_mt;
704 
705     // positions for encoding or decoding
706     int first_base, last_base;
707 
708     // cached reference portion
709     refs_t *refs;              // ref meta-data structure
710     char *ref, *ref_free;      // current portion held in memory
711     int   ref_id;
712     int   ref_start;
713     int   ref_end;
714     char *ref_fn;   // reference fasta filename
715 
716     // compression level and metrics
717     int level;
718     cram_metrics *m[DS_END];
719     khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use.
720 
721     // options
722     int decode_md; // Whether to export MD and NM tags
723     int seqs_per_slice;
724     int bases_per_slice;
725     int slices_per_container;
726     int embed_ref;
727     int no_ref;
728     int ignore_md5;
729     int use_bz2;
730     int use_rans;
731     int use_lzma;
732     int shared_ref;
733     unsigned int required_fields;
734     int store_md;
735     int store_nm;
736     cram_range range;
737 
738     // lookup tables, stored here so we can be trivially multi-threaded
739     unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
740     unsigned int cram_flag_swap[0x1000];// bam -> cram flags
741     unsigned char L1[256];              // ACGT{*} ->0123{4}
742     unsigned char L2[256];              // ACGTN{*}->01234{5}
743     char cram_sub_matrix[32][32];       // base substitution codes
744 
745     int         index_sz;
746     cram_index *index;                  // array, sizeof index_sz
747     off_t first_container;
748     off_t curr_position;
749     int eof;
750     int last_slice;                     // number of recs encoded in last slice
751     int last_RI_count;                  // number of references encoded in last container
752     int multi_seq;                      // -1 is auto, 0 is one ref per container, 1 is multi...
753     int multi_seq_user;                 // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE)
754     int unsorted;
755     int last_mapped;                    // number of mapped reads in last container
756     int empty_container;                // Marker for EOF block
757 
758     // thread pool
759     int own_pool;
760     hts_tpool *pool;
761     hts_tpool_process *rqueue;
762     pthread_mutex_t metrics_lock;
763     pthread_mutex_t ref_lock;
764     pthread_mutex_t range_lock;
765     spare_bams *bl;
766     pthread_mutex_t bam_list_lock;
767     void *job_pending;
768     int ooc;                            // out of containers.
769 
770     int lossy_read_names;               // boolean
771     int tlen_approx;                    // max TLEN calculation offset.
772     int tlen_zero;                      // If true, permit tlen 0 (=> tlen calculated)
773 
774     BGZF *idxfp;                        // File pointer for on-the-fly index creation
775 };
776 
777 // Translation of required fields to cram data series
778 enum cram_fields {
779     CRAM_BF = 0x00000001,
780     CRAM_AP = 0x00000002,
781     CRAM_FP = 0x00000004,
782     CRAM_RL = 0x00000008,
783     CRAM_DL = 0x00000010,
784     CRAM_NF = 0x00000020,
785     CRAM_BA = 0x00000040,
786     CRAM_QS = 0x00000080,
787     CRAM_FC = 0x00000100,
788     CRAM_FN = 0x00000200,
789     CRAM_BS = 0x00000400,
790     CRAM_IN = 0x00000800,
791     CRAM_RG = 0x00001000,
792     CRAM_MQ = 0x00002000,
793     CRAM_TL = 0x00004000,
794     CRAM_RN = 0x00008000,
795     CRAM_NS = 0x00010000,
796     CRAM_NP = 0x00020000,
797     CRAM_TS = 0x00040000,
798     CRAM_MF = 0x00080000,
799     CRAM_CF = 0x00100000,
800     CRAM_RI = 0x00200000,
801     CRAM_RS = 0x00400000,
802     CRAM_PD = 0x00800000,
803     CRAM_HC = 0x01000000,
804     CRAM_SC = 0x02000000,
805     CRAM_BB = 0x04000000,
806     CRAM_BB_len = 0x08000000,
807     CRAM_QQ = 0x10000000,
808     CRAM_QQ_len = 0x20000000,
809     CRAM_aux= 0x40000000,
810     CRAM_ALL= 0x7fffffff,
811 };
812 
813 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
814 // encode a base difference, but we don't need to know what it is for CIGAR.
815 // If we have a soft-clip or insertion, we do need SC/IN though to know how
816 // long that array is.
817 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
818                     CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
819 
820 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
821                   CRAM_RL    | CRAM_AP | CRAM_BB)
822 
823 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
824 
825 /* BF bitfields */
826 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
827 #define CRAM_FPAIRED      256
828 #define CRAM_FPROPER_PAIR 128
829 #define CRAM_FUNMAP        64
830 #define CRAM_FREVERSE      32
831 #define CRAM_FREAD1        16
832 #define CRAM_FREAD2         8
833 #define CRAM_FSECONDARY     4
834 #define CRAM_FQCFAIL        2
835 #define CRAM_FDUP           1
836 
837 #define DS_aux_S "\001"
838 #define DS_aux_OQ_S "\002"
839 #define DS_aux_BQ_S "\003"
840 #define DS_aux_BD_S "\004"
841 #define DS_aux_BI_S "\005"
842 #define DS_aux_FZ_S "\006"
843 #define DS_aux_oq_S "\007"
844 #define DS_aux_os_S "\010"
845 #define DS_aux_oz_S "\011"
846 
847 #define CRAM_M_REVERSE  1
848 #define CRAM_M_UNMAP    2
849 
850 
851 /* CF bitfields */
852 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
853 #define CRAM_FLAG_DETACHED             (1<<1)
854 #define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
855 #define CRAM_FLAG_NO_SEQ               (1<<3)
856 #define CRAM_FLAG_MASK                 ((1<<4)-1)
857 
858 /* Internal only */
859 #define CRAM_FLAG_STATS_ADDED          (1<<30)
860 #define CRAM_FLAG_DISCARD_NAME         (1U<<31)
861 
862 #ifdef __cplusplus
863 }
864 #endif
865 
866 #endif /* HTSLIB_CRAM_STRUCTS_H */
867