1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #ifndef _CRAM_STRUCTS_H_
32 #define _CRAM_STRUCTS_H_
33 
34 /*
35  * Defines in-memory structs for the basic file-format objects in the
36  * CRAM format.
37  *
38  * The basic file format is:
39  *     File-def SAM-hdr Container Container ...
40  *
41  * Container:
42  *     Service-block data-block data-block ...
43  *
44  * Multiple blocks in a container are grouped together as slices,
45  * also sometimes referred to as landmarks in the spec.
46  */
47 
48 
49 #include <pthread.h>
50 #include <stdint.h>
51 #include <sys/types.h>
52 
53 #include "htslib/thread_pool.h"
54 #include "cram/string_alloc.h"
55 #include "cram/mFILE.h"
56 #include "htslib/khash.h"
57 
58 #ifdef __cplusplus
59 extern "C" {
60 #endif
61 
// Generic hash-map integer -> integer
KHASH_MAP_INIT_INT(m_i2i, int)

// Generic hash-set integer -> (existence)
KHASH_SET_INIT_INT(s_i2i)
67 
// For brevity: shorthand alias for unsigned char.
typedef unsigned char uc;
70 
/*
 * A union for the preservation map. Required for khash.
 *
 * It is the value type of the string-keyed "map" hash; a value is held
 * either as an integer (i) or as a character pointer (p).
 */
typedef union {
    int i;
    char *p;
} pmap_t;
78 
// Generates static functions here which isn't ideal, but we have no way
// currently to declare the kh_map_t structure here without also declaring a
// duplicate in the .c files due to the nature of the KHASH macros.
//
// Instantiates a string-keyed hash named "map" whose values are pmap_t.
KHASH_MAP_INIT_STR(map, pmap_t)
83 
// Forward declaration; this header only uses pointers to it.
struct hFILE;

// Slice/container sizing constants.
// NOTE(review): presumably the defaults for cram_fd.seqs_per_slice,
// bases_per_slice and slices_per_container -- confirm where they are assigned.
#define SEQS_PER_SLICE 10000
#define BASES_PER_SLICE (SEQS_PER_SLICE*500)
#define SLICE_PER_CNT  1

// 20 characters, i.e. 5 groups of 4.
// NOTE(review): looks like the initial contents of a 5x4 substitution
// matrix (one group per reference base) -- confirm against its users.
#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
91 
#define MAX_STAT_VAL 1024
//#define MAX_STAT_VAL 16
/*
 * Value statistics: a fixed array of MAX_STAT_VAL counters plus an
 * integer -> integer hash.
 *
 * NOTE(review): presumably values below MAX_STAT_VAL are counted in freqs[]
 * and everything else goes in h -- confirm against the code that fills it.
 */
typedef struct cram_stats {
    int freqs[MAX_STAT_VAL];
    khash_t(m_i2i) *h;
    int nsamp; // total number of values added
    int nvals; // total number of unique values added
} cram_stats;
100 
/*
 * Codec (encoding) identifiers.
 * NB: matches java impl, not the spec
 */
enum cram_encoding {
    E_NULL               = 0,
    E_EXTERNAL           = 1,
    E_GOLOMB             = 2,
    E_HUFFMAN            = 3,
    E_BYTE_ARRAY_LEN     = 4,
    E_BYTE_ARRAY_STOP    = 5,
    E_BETA               = 6,
    E_SUBEXP             = 7,
    E_GOLOMB_RICE        = 8,
    E_GAMMA              = 9,
    E_NUM_CODECS         = 10, /* Number of codecs, not a real one. */
};
115 
/*
 * Data types a value can be handled as.
 * NOTE(review): presumably passed to the codecs to select the value type
 * being encoded/decoded -- confirm against cram_codecs.
 */
enum cram_external_type {
    E_INT                = 1,
    E_LONG               = 2,
    E_BYTE               = 3,
    E_BYTE_ARRAY         = 4,
    E_BYTE_ARRAY_BLOCK   = 5,
};
123 
/*
 * External IDs used by this implementation.
 * They are only assumed during writing.
 *
 * DS_END is one past the last ID and is used to size per-data-series arrays.
 */
enum cram_DS_ID {
    DS_CORE   = 0,

    /* Auxiliary blocks; the DS_aux*_S string macros mirror IDs 1 to 9. */
    DS_aux    = 1,  /* aux_blk */
    DS_aux_OQ = 2,
    DS_aux_BQ = 3,
    DS_aux_BD = 4,
    DS_aux_BI = 5,
    DS_aux_FZ = 6,  /* also ZM:B */
    DS_aux_oq = 7,  /* other qualities */
    DS_aux_os = 8,  /* other sequences */
    DS_aux_oz = 9,  /* other strings */

    DS_ref    = 10,
    DS_RN,          /* name_blk */
    DS_QS,          /* qual_blk */
    DS_IN,          /* base_blk */
    DS_SC,          /* soft_blk */

    /* start loop */
    DS_BF,
    DS_CF,
    DS_AP,
    DS_RG,
    DS_MQ,
    DS_NS,
    DS_MF,
    DS_TS,
    DS_NP,
    DS_NF,
    DS_RL,
    DS_FN,
    DS_FC,
    DS_FP,
    DS_DL,
    DS_BA,
    DS_BS,
    DS_TL,
    DS_RI,
    DS_RS,
    DS_PD,
    DS_HC,
    DS_BB,
    DS_QQ,
    DS_TN,
    /* end loop */

    DS_RN_len,
    DS_SC_len,
    DS_BB_len,
    DS_QQ_len,

    DS_TC,          /* CRAM v1.0 tags */
    DS_TM,          /* test */
    DS_TV,          /* test */

    DS_END,
};
180 
/*
 * "File Definition Structure"
 *
 * Magic bytes, a major/minor version pair and a 20-byte file identifier.
 */
typedef struct cram_file_def {
    char    magic[4];
    uint8_t major_version;
    uint8_t minor_version;
    char    file_id[20];      // Filename or SHA1 checksum
} cram_file_def;
188 
// Split a combined version number v: the major part is everything above the
// low 8 bits, the minor part is the low 8 bits.
#define CRAM_MAJOR_VERS(v) ((v) >> 8)
#define CRAM_MINOR_VERS(v) ((v) & 0xff)

// Forward declaration; the full definition appears later in this header.
struct cram_slice;
193 
/*
 * Block compression methods.
 *
 * RANS and RANS0 share one value. RANS1 and GZIP_RLE are not externalised:
 * they are internal distinctions only.
 */
enum cram_block_method {
    BM_ERROR = -1,
    RAW      = 0,
    GZIP     = 1,
    BZIP2    = 2,
    LZMA     = 3,
    RANS     = 4,     /* Generic; either order */
    RANS0    = RANS,  /* Same value as the generic RANS */
    RANS1    = 10,    /* Not externalised; stored as RANS (generic) */
    GZIP_RLE = 11,    /* NB: not externalised in CRAM */
};
205 
/* Block content types. CT_ERROR is the only negative value. */
enum cram_content_type {
    CT_ERROR           = -1,
    FILE_HEADER        = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE       = 2,
    UNMAPPED_SLICE     = 3, // CRAM V1.0 only
    EXTERNAL           = 4,
    CORE               = 5,
};
215 
/*
 * Compression metrics
 *
 * Bookkeeping for picking a block compression method by trial. There is one
 * size, one counter and one "extra" value per candidate method: gzip with
 * RLE, default gzip, rANS order 0, rANS order 1, bzip2 and lzma.
 */
typedef struct {
    // number of trials and time to next trial
    int trial;
    int next_trial;

    // aggregate sizes during trials
    int sz_gz_rle;
    int sz_gz_def;
    int sz_rans0;
    int sz_rans1;
    int sz_bzip2;
    int sz_lzma;

    // resultant method from trials
    int method;
    int strat;

    // Revisions of method, to allow culling of continually failing ones.
    int gz_rle_cnt;
    int gz_def_cnt;
    int rans0_cnt;
    int rans1_cnt;
    int bzip2_cnt;
    int lzma_cnt;
    int revised_method;

    // NOTE(review): per-method adjustment values; their exact meaning is
    // not visible in this header -- confirm against the code that uses them.
    double gz_rle_extra;
    double gz_def_extra;
    double rans0_extra;
    double rans1_extra;
    double bzip2_extra;
    double lzma_extra;
} cram_metrics;

// Hash aux key (XX:i) to cram_metrics
KHASH_MAP_INIT_INT(m_metrics, cram_metrics*)
253 
254 
255 /* Block */
256 typedef struct cram_block {
257     enum cram_block_method  method, orig_method;
258     enum cram_content_type  content_type;
259     int32_t  content_id;
260     int32_t  comp_size;
261     int32_t  uncomp_size;
262     uint32_t crc32;
263     int32_t  idx; /* offset into data */
264     unsigned char    *data;
265 
266     // For bit I/O
267     size_t alloc;
268     size_t byte;
269     int bit;
270 
271     // To aid compression
272     cram_metrics *m; // used to track aux block compression only
273 } cram_block;
274 
struct cram_codec; /* defined in cram_codecs.h */
struct cram_map;

// Number of buckets in the small internal hash tables. CRAM_MAP masks with
// (CRAM_MAP_HASH-1), so this needs to stay a power of two.
#define CRAM_MAP_HASH 32
// Bucket index, in the range 0..CRAM_MAP_HASH-1, for the pair (a, b).
#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
280 
281 /* Compression header block */
282 typedef struct cram_block_compression_hdr {
283     int32_t ref_seq_id;
284     int32_t ref_seq_start;
285     int32_t ref_seq_span;
286     int32_t num_records;
287     int32_t num_landmarks;
288     int32_t *landmark;
289 
290     /* Flags from preservation map */
291     int mapped_qs_included;
292     int unmapped_qs_included;
293     int unmapped_placed;
294     int qs_included;
295     int read_names_included;
296     int AP_delta;
297     // indexed by ref-base and subst. code
298     char substitution_matrix[5][4];
299 
300     // TD Dictionary as a concatenated block
301     cram_block *TD_blk;          // Tag Dictionary
302     int nTL;		         // number of TL entries in TD
303     unsigned char **TL;          // array of size nTL, pointer into TD_blk.
304     khash_t(m_s2i) *TD_hash;     // Keyed on TD strings, map to TL[] indices
305     string_alloc_t *TD_keys;     // Pooled keys for TD hash.
306 
307     khash_t(map) *preservation_map;
308     struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
309     struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
310 
311     struct cram_codec *codecs[DS_END];
312 
313     char *uncomp; // A single block of uncompressed data
314     size_t uncomp_size, uncomp_alloc;
315 
316     unsigned int data_series; // See cram_fields enum below
317 } cram_block_compression_hdr;
318 
/*
 * One entry of the small internal encoding-map hash; entries sharing a
 * bucket are chained through next.
 */
typedef struct cram_map {
    int key;    /* 0xe0 + 3 bytes */
    enum cram_encoding encoding;
    int offset; /* Offset into a single block of memory */
    int size;   /* Size */
    struct cram_codec *codec;
    struct cram_map *next; // for noddy internal hash
} cram_map;
327 
/* Per-tag state: the codec, block and compression metrics for one aux tag. */
typedef struct cram_tag_map {
    struct cram_codec *codec;
    cram_block *blk;
    cram_metrics *m;
} cram_tag_map;

// Hash aux key (XX:i) to cram_tag_map
KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*)
336 
/*
 * Mapped or unmapped slice header block
 *
 * The ref_* members are only meaningful when content_type is MAPPED_SLICE.
 */
typedef struct cram_block_slice_hdr {
    enum cram_content_type content_type;
    int32_t ref_seq_id;     /* if content_type == MAPPED_SLICE */
    int32_t ref_seq_start;  /* if content_type == MAPPED_SLICE */
    int32_t ref_seq_span;   /* if content_type == MAPPED_SLICE */
    int32_t num_records;
    int64_t record_counter;
    int32_t num_blocks;
    int32_t num_content_ids;
    int32_t *block_content_ids;
    int32_t ref_base_id;    /* if content_type == MAPPED_SLICE */
    unsigned char md5[16];  /* 16-byte MD5 digest */
} cram_block_slice_hdr;
351 
352 struct ref_entry;
353 
354 /*
355  * Container.
356  *
357  * Conceptually a container is split into slices, and slices into blocks.
358  * However on disk it's just a list of blocks and we need to query the
359  * block types to identify the start/end points of the slices.
360  *
361  * OR... are landmarks the start/end points of slices?
362  */
363 typedef struct cram_container {
364     int32_t  length;
365     int32_t  ref_seq_id;
366     int32_t  ref_seq_start;
367     int32_t  ref_seq_span;
368     int64_t  record_counter;
369     int64_t  num_bases;
370     int32_t  num_records;
371     int32_t  num_blocks;
372     int32_t  num_landmarks;
373     int32_t *landmark;
374 
375     /* Size of container header above */
376     size_t   offset;
377 
378     /* Compression header is always the first block? */
379     cram_block_compression_hdr *comp_hdr;
380     cram_block *comp_hdr_block;
381 
382     /* For construction purposes */
383     int max_slice, curr_slice;   // maximum number of slices
384     int max_rec, curr_rec;       // current and max recs per slice
385     int max_c_rec, curr_c_rec;   // current and max recs per container
386     int slice_rec;               // rec no. for start of this slice
387     int curr_ref;                // current ref ID. -2 for no previous
388     int last_pos;                // last record position
389     struct cram_slice **slices, *slice;
390     int pos_sorted;              // boolean, 1=>position sorted data
391     int max_apos;                // maximum position, used if pos_sorted==0
392     int last_slice;              // number of reads in last slice (0 for 1st)
393     int multi_seq;               // true if packing multi seqs per cont/slice
394     int unsorted;		 // true is AP_delta is 0.
395 
396     /* Copied from fd before encoding, to allow multi-threading */
397     int ref_start, first_base, last_base, ref_id, ref_end;
398     char *ref;
399     //struct ref_entry *ref;
400 
401     /* For multi-threading */
402     bam_seq_t **bams;
403 
404     /* Statistics for encoding */
405     cram_stats *stats[DS_END];
406 
407     khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map
408     int *refs_used;       // array of frequency of ref seq IDs
409 
410     uint32_t crc32;       // CRC32
411 
412     uint64_t s_num_bases; // number of bases in this slice
413 } cram_container;
414 
/*
 * A single cram record
 *
 * Variable-length data is not stored inline: members documented as "idx to"
 * are offsets into buffers owned by the slice (s), and the two-letter codes
 * in the comments name the CRAM data series the member corresponds to.
 */
typedef struct cram_record {
    struct cram_slice *s; // Filled out by cram_decode only

    int32_t ref_id;       // fixed for all recs in slice?
    int32_t flags;        // BF
    int32_t cram_flags;   // CF
    int32_t len;          // RL
    int32_t apos;         // AP
    int32_t rg;           // RG
    int32_t name;         // RN; idx to s->names_blk
    int32_t name_len;
    int32_t mate_line;    // index to another cram_record
    int32_t mate_ref_id;
    int32_t mate_pos;     // NP
    int32_t tlen;         // TS

    // Auxiliary data
    int32_t ntags;        // TC
    int32_t aux;          // idx to s->aux_blk
    int32_t aux_size;     // total size of packed ntags in aux_blk
    // TN_external is a compile-time switch selecting which member exists.
#ifndef TN_external
    int32_t TN_idx;       // TN; idx to s->TN;
#else
    int32_t tn;           // idx to s->tn_blk
#endif
    int     TL;           // NOTE(review): presumably the TL data-series value -- confirm

    int32_t seq;          // idx to s->seqs_blk
    int32_t qual;         // idx to s->qual_blk
    int32_t cigar;        // idx to s->cigar
    int32_t ncigar;
    int32_t aend;         // alignment end
    int32_t mqual;        // MQ

    int32_t feature;      // idx to s->feature
    int32_t nfeature;     // number of features
    int32_t mate_flags;   // MF
} cram_record;
456 
/*
 * Accessor macros as an analogue of the bam ones.
 *
 * Each takes a pointer to a cram_record (c). The pointer-returning macros
 * resolve one of the record's indices against the matching buffer of the
 * owning slice (c->s), so c->s must be set before they are used.
 *
 * The argument may be evaluated more than once; pass an expression without
 * side effects.
 */
#define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
#define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
#define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
#define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
#define cram_seqi(c,i)   (cram_seq((c))[(i)])
#define cram_name_len(c) ((c)->name_len)
#define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
#define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
/* Uses the macro argument throughout; previously referenced an undeclared 'cr'. */
#define cram_cigar(c)    (&((c)->s->cigar)[(c)->cigar])
467 
/*
 * A feature is a base difference, used for the sequence reference encoding.
 * (We generate these internally when writing CRAM.)
 *
 * Every variant of the union starts with the same two int members, pos and
 * code, followed by variant-specific members.
 * NOTE(review): code presumably identifies which variant is in use --
 * confirm against the encoder/decoder.
 */
typedef struct cram_feature {
    union {
	struct {
	    int pos;
	    int code;
	    int base;    // substitution code
	} X;
	struct {
	    int pos;
	    int code;
	    int base;    // actual base & qual
	    int qual;
	} B;
	struct {
	    int pos;
	    int code;
	    int seq_idx; // index to s->seqs_blk
	    int len;
	} b;
	struct {
	    int pos;
	    int code;
	    int qual;
	} Q;
	struct {
	    int pos;
	    int code;
	    int len;
	    int seq_idx; // soft-clip multiple bases
	} S;
	struct {
	    int pos;
	    int code;
	    int len;
	    int seq_idx; // insertion multiple bases
	} I;
	struct {
	    int pos;
	    int code;
	    int base; // insertion single base
	} i;
	struct {
	    int pos;
	    int code;
	    int len;
	} D;
	struct {
	    int pos;
	    int code;
	    int len;
	} N;
	struct {
	    int pos;
	    int code;
	    int len;
	} P;
	struct {
	    int pos;
	    int code;
	    int len;
	} H;
    };
} cram_feature;
535 
536 /*
537  * A slice is really just a set of blocks, but it
538  * is the logical unit for decoding a number of
539  * sequences.
540  */
541 typedef struct cram_slice {
542     cram_block_slice_hdr *hdr;
543     cram_block *hdr_block;
544     cram_block **block;
545     cram_block **block_by_id;
546 
547     /* State used during encoding/decoding */
548     int last_apos, max_apos;
549 
550     /* Array of decoded cram records */
551     cram_record *crecs;
552 
553     /* An dynamically growing buffers for data pointed
554      * to by crecs[] array.
555      */
556     uint32_t  *cigar;
557     uint32_t   cigar_alloc;
558     uint32_t   ncigar;
559 
560     cram_feature *features;
561     int           nfeatures;
562     int           afeatures; // allocated size of features
563 
564 #ifndef TN_external
565     // TN field (Tag Name)
566     uint32_t      *TN;
567     int           nTN, aTN;  // used and allocated size for TN[]
568 #else
569     cram_block *tn_blk;
570     int tn_id;
571 #endif
572 
573     // For variable sized elements which are always external blocks.
574     cram_block *name_blk;
575     cram_block *seqs_blk;
576     cram_block *qual_blk;
577     cram_block *base_blk;
578     cram_block *soft_blk;
579     cram_block *aux_blk;       // BAM aux block, created while decoding CRAM
580 
581     string_alloc_t *pair_keys; // Pooled keys for pair hash.
582     khash_t(m_s2i) *pair[2];   // for identifying read-pairs in this slice.
583 
584     char *ref;                 // slice of current reference
585     int ref_start;             // start position of current reference;
586     int ref_end;               // end position of current reference;
587     int ref_id;
588 
589     // For going from BAM to CRAM; an array of auxiliary blocks per type
590     int naux_block;
591     cram_block **aux_block;
592 } cram_slice;
593 
/*-----------------------------------------------------------------------------
 * Consider moving reference handling to cram_refs.[ch]
 */
// from fa.fai / samtools faidx files
//
// One reference sequence: its name, the file it lives in, the faidx-style
// layout values (length, offset, bases per line, line length) and the
// sequence data itself.
typedef struct ref_entry {
    char *name;
    char *fn;
    int64_t length;
    int64_t offset;
    int bases_per_line;
    int line_length;
    int64_t count;	   // for shared references so we know to dealloc seq
    char *seq;
    mFILE *mf;
    int is_md5;            // Reference comes from a raw seq found by MD5
} ref_entry;

// Hash from string key to ref_entry*
KHASH_MAP_INIT_STR(refs, ref_entry*)
612 
// References structure.
//
// Holds every known ref_entry, reachable both by name (h_meta) and by
// numeric ID (ref_id[]), plus the currently opened reference file. count
// tracks how many cram_fd share this structure and lock guards updates.
typedef struct {
    string_alloc_t *pool;  // String pool for holding filenames and SN vals

    khash_t(refs) *h_meta; // ref_entry*, index by name
    ref_entry **ref_id;    // ref_entry*, index by ID
    int nref;              // number of ref_entry

    char *fn;              // current file opened
    BGZF *fp;              // and the BGZF handle to go with it.

    int count;             // how many cram_fd sharing this refs struct

    pthread_mutex_t lock;  // Mutex for multi-threaded updating
    ref_entry *last;       // Last queried sequence
    int last_id;           // Used in cram_ref_decr_locked to delay free
} refs_t;
630 
/*-----------------------------------------------------------------------------
 * CRAM index
 *
 * Detect format by number of entries per line.
 * 5 => 1.0 (refid, start, nseq, C offset, slice)
 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
 *
 * Indices are stored in a nested containment list, which is trivial to set
 * up as the indices are on sorted data so we're appending to the nclist
 * in sorted order. Basically if a slice entirely fits within a previous
 * slice then we append to that slice's list. This is done recursively.
 *
 * Lists are sorted on two dimensions: ref id + slice coords.
 */
typedef struct cram_index {
    int nslice;             /* total number of slices */
    int nalloc;             /* allocation counterpart of nslice */
    struct cram_index *e;   /* array of size nslice */

    /* Per-entry values; the versions listed are the index formats using them */
    int     refid;          /* 1.0 and 1.1 */
    int     start;          /* 1.0 and 1.1 */
    int     end;            /* 1.1 */
    int     nseq;           /* 1.0 - undocumented */
    int     slice;          /* 1.0 landmark index, 1.1 landmark value */
    int     len;            /* 1.1 - size of slice in bytes */
    int64_t offset;         /* 1.0 and 1.1 */
} cram_index;
657 
/* A range on one reference: reference ID plus start and end coordinates. */
typedef struct {
    int refid;
    int start;
    int end;
} cram_range;
663 
/*-----------------------------------------------------------------------------
 */
/* CRAM File handle */

/* Node of a singly linked list, each holding an array of bam_seq_t pointers. */
typedef struct spare_bams {
    bam_seq_t **bams;
    struct spare_bams *next;
} spare_bams;
672 
673 typedef struct cram_fd {
674     struct hFILE  *fp;
675     int            mode;     // 'r' or 'w'
676     int            version;
677     cram_file_def *file_def;
678     SAM_hdr       *header;
679 
680     char          *prefix;
681     int64_t        record_counter;
682     int            err;
683 
684     // Most recent compression header decoded
685     //cram_block_compression_hdr *comp_hdr;
686     //cram_block_slice_hdr       *slice_hdr;
687 
688     // Current container being processed.
689     cram_container *ctr;
690 
691     // positions for encoding or decoding
692     int first_base, last_base;
693 
694     // cached reference portion
695     refs_t *refs;              // ref meta-data structure
696     char *ref, *ref_free;      // current portion held in memory
697     int   ref_id;
698     int   ref_start;
699     int   ref_end;
700     char *ref_fn;   // reference fasta filename
701 
702     // compression level and metrics
703     int level;
704     cram_metrics *m[DS_END];
705     khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use.
706 
707     // options
708     int decode_md; // Whether to export MD and NM tags
709     int seqs_per_slice;
710     int bases_per_slice;
711     int slices_per_container;
712     int embed_ref;
713     int no_ref;
714     int ignore_md5;
715     int use_bz2;
716     int use_rans;
717     int use_lzma;
718     int shared_ref;
719     unsigned int required_fields;
720     cram_range range;
721 
722     // lookup tables, stored here so we can be trivially multi-threaded
723     unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
724     unsigned int cram_flag_swap[0x1000];// bam -> cram flags
725     unsigned char L1[256];              // ACGT{*} ->0123{4}
726     unsigned char L2[256];              // ACGTN{*}->01234{5}
727     char cram_sub_matrix[32][32];	// base substituion codes
728 
729     int         index_sz;
730     cram_index *index;                  // array, sizeof index_sz
731     off_t first_container;
732     int eof;
733     int last_slice;                     // number of recs encoded in last slice
734     int multi_seq;
735     int unsorted;
736     int empty_container; 		// Marker for EOF block
737 
738     // thread pool
739     int own_pool;
740     hts_tpool *pool;
741     hts_tpool_process *rqueue;
742     pthread_mutex_t metrics_lock;
743     pthread_mutex_t ref_lock;
744     spare_bams *bl;
745     pthread_mutex_t bam_list_lock;
746     void *job_pending;
747     int ooc;                            // out of containers.
748 
749     int lossy_read_names;               // boolean
750     int tlen_approx;                    // max TLEN calculation offset.
751     int tlen_zero;                      // If true, permit tlen 0 (=> tlen calculated)
752 } cram_fd;
753 
/*
 * Translation of required fields to cram data series.
 *
 * One bit per data series, so values can be OR-ed together into a mask.
 * CRAM_ALL has all 31 of these bits set.
 */
enum cram_fields {
    CRAM_BF     = 1 << 0,
    CRAM_AP     = 1 << 1,
    CRAM_FP     = 1 << 2,
    CRAM_RL     = 1 << 3,
    CRAM_DL     = 1 << 4,
    CRAM_NF     = 1 << 5,
    CRAM_BA     = 1 << 6,
    CRAM_QS     = 1 << 7,
    CRAM_FC     = 1 << 8,
    CRAM_FN     = 1 << 9,
    CRAM_BS     = 1 << 10,
    CRAM_IN     = 1 << 11,
    CRAM_RG     = 1 << 12,
    CRAM_MQ     = 1 << 13,
    CRAM_TL     = 1 << 14,
    CRAM_RN     = 1 << 15,
    CRAM_NS     = 1 << 16,
    CRAM_NP     = 1 << 17,
    CRAM_TS     = 1 << 18,
    CRAM_MF     = 1 << 19,
    CRAM_CF     = 1 << 20,
    CRAM_RI     = 1 << 21,
    CRAM_RS     = 1 << 22,
    CRAM_PD     = 1 << 23,
    CRAM_HC     = 1 << 24,
    CRAM_SC     = 1 << 25,
    CRAM_BB     = 1 << 26,
    CRAM_BB_len = 1 << 27,
    CRAM_QQ     = 1 << 28,
    CRAM_QQ_len = 1 << 29,
    CRAM_aux    = 1 << 30,
    CRAM_ALL    = 0x7fffffff,
};
789 
// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
// encode a base difference, but we don't need to know what it is for CIGAR.
// If we have a soft-clip or insertion, we do need SC/IN though to know how
// long that array is.
#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
		    CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)

// Sequence mask: everything in CRAM_CIGAR plus BA, BS, AP and BB.
#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
		  CRAM_RL    | CRAM_AP | CRAM_BB)

// Quality mask: everything in CRAM_CIGAR plus AP, QS and QQ.
#define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
801 
/*
 * BF bitfields, one bit each.
 * Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1
 */
#define CRAM_FPAIRED      0x100
#define CRAM_FPROPER_PAIR 0x080
#define CRAM_FUNMAP       0x040
#define CRAM_FREVERSE     0x020
#define CRAM_FREAD1       0x010
#define CRAM_FREAD2       0x008
#define CRAM_FSECONDARY   0x004
#define CRAM_FQCFAIL      0x002
#define CRAM_FDUP         0x001
813 
// One-character string forms of the DS_aux* IDs in enum cram_DS_ID: each
// octal escape is the numeric value of the matching enumerator (1 to 9).
#define DS_aux_S "\001"
#define DS_aux_OQ_S "\002"
#define DS_aux_BQ_S "\003"
#define DS_aux_BD_S "\004"
#define DS_aux_BI_S "\005"
#define DS_aux_FZ_S "\006"
#define DS_aux_oq_S "\007"
#define DS_aux_os_S "\010"
#define DS_aux_oz_S "\011"
823 
// NOTE(review): presumably the bits of the MF (mate flags) data series --
// confirm against the code that sets cram_record.mate_flags.
#define CRAM_M_REVERSE  1
#define CRAM_M_UNMAP    2
826 
827 
/*
 * CF bitfields
 *
 * CRAM_FLAG_MASK covers the four flag bits defined here (bits 0 to 3).
 */
#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
#define CRAM_FLAG_DETACHED             (1<<1)
#define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
#define CRAM_FLAG_NO_SEQ               (1<<3)
#define CRAM_FLAG_MASK                 ((1<<4)-1)

/*
 * Internal only: these lie outside CRAM_FLAG_MASK.
 *
 * Bit 31 is built from an unsigned constant because shifting a signed 1 into
 * the sign bit of an int is undefined behaviour.
 */
#define CRAM_FLAG_STATS_ADDED          (1<<30)
#define CRAM_FLAG_DISCARD_NAME         (1U<<31)
838 
839 #ifdef __cplusplus
840 }
841 #endif
842 
843 #endif /* _CRAM_STRUCTS_H_ */
844