1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef _CRAM_STRUCTS_H_
35 #define _CRAM_STRUCTS_H_
36 
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 
41 /*
42  * Defines in-memory structs for the basic file-format objects in the
43  * CRAM format.
44  *
45  * The basic file format is:
46  *     File-def SAM-hdr Container Container ...
47  *
48  * Container:
49  *     Service-block data-block data-block ...
50  *
51  * Multiple blocks in a container are grouped together as slices,
52  * also sometimes referred to as landmarks in the spec.
53  */
54 
55 
56 #include <stdint.h>
57 
58 #include "io_lib/hash_table.h"       // From io_lib aka staden-read
59 #include "io_lib/thread_pool.h"
60 #include "io_lib/mFILE.h"
61 #include "io_lib/bgzip.h"
62 
63 #ifdef SAMTOOLS
64 // From within samtools/HTSlib
65 #  include "io_lib/string_alloc.h"
66 #else
67 // From within io_lib
68 #  include "io_lib/bam.h"              // For BAM header parsing
69 #endif
70 
71 #define SEQS_PER_SLICE 10000
72 #define BASES_PER_SLICE (SEQS_PER_SLICE*500)
73 #define SLICE_PER_CNT  1
74 
75 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
76 
77 // TN only in Cram v1
78 //#define TN_external
79 
80 #define MAX_STAT_VAL 1024
81 //#define MAX_STAT_VAL 16
82 typedef struct {
83     int freqs[MAX_STAT_VAL];
84     HashTable *h;
85     int nsamp; // total number of values added
86     int nvals; // total number of unique values added
87 } cram_stats;
88 
89 /* NB: matches java impl, not the spec */
90 enum cram_encoding {
91     E_NULL               = 0,
92     E_EXTERNAL           = 1,
93     E_GOLOMB             = 2,
94     E_HUFFMAN            = 3,
95     E_BYTE_ARRAY_LEN     = 4,
96     E_BYTE_ARRAY_STOP    = 5,
97     E_BETA               = 6,
98     E_SUBEXP             = 7,
99     E_GOLOMB_RICE        = 8,
100     E_GAMMA              = 9,
101     E_NUM_CODECS         = 10, /* Number of codecs, not a real one. */
102 };
103 
104 enum cram_external_type {
105     E_INT                = 1,
106     E_LONG               = 2,
107     E_BYTE               = 3,
108     E_BYTE_ARRAY         = 4,
109     E_BYTE_ARRAY_BLOCK   = 5,
110 };
111 
112 /* External IDs used by this implementation (only assumed during writing) */
113 enum cram_DS_ID {
114     DS_CORE   = 0,
115     DS_aux    = 1, // aux_blk
116     DS_aux_OQ = 2,
117     DS_aux_BQ = 3,
118     DS_aux_BD = 4,
119     DS_aux_BI = 5,
120     DS_aux_FZ = 6, // also ZM:B
121     DS_aux_oq = 7, // other qualities
122     DS_aux_os = 8, // other sequences
123     DS_aux_oz = 9, // other strings
124     DS_ref,
125     DS_RN, // name_blk
126     DS_QS, // qual_blk
127     DS_IN, // base_blk
128     DS_SC, // soft_blk
129 
130     DS_BF, // start loop
131     DS_CF,
132     DS_AP,
133     DS_RG,
134     DS_MQ,
135     DS_NS,
136     DS_MF,
137     DS_TS,
138     DS_NP,
139     DS_NF,
140     DS_RL,
141     DS_FN,
142     DS_FC,
143     DS_FP,
144     DS_DL,
145     DS_BA,
146     DS_BS,
147     DS_TL,
148     DS_RI,
149     DS_RS,
150     DS_PD,
151     DS_HC,
152     DS_BB,
153     DS_QQ,
154 
155     DS_TN, // end loop
156 
157     DS_RN_len,
158     DS_SC_len,
159     DS_BB_len,
160     DS_QQ_len,
161 
162     DS_TC, // CRAM v1.0 tags
163     DS_TM, // test
164     DS_TV, // test
165 
166     DS_END,
167 };
168 
169 /* "File Definition Structure" */
170 typedef struct {
171     char    magic[4];
172     uint8_t major_version;
173     uint8_t minor_version;
174     char    file_id[20];      // Filename or SHA1 checksum
175 } cram_file_def;
176 
177 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
178 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
179 #define IS_CRAM_1_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)==1)
180 #define IS_CRAM_2_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)==2)
181 #define IS_CRAM_3_VERS(fd) (CRAM_MAJOR_VERS((fd)->version)>=3)
182 
183 struct cram_slice;
184 
185 enum cram_block_method {
186     BM_ERROR  = -1,
187     RAW    = 0,
188     GZIP   = 1,    // Z_FILTERED
189     BZIP2  = 2,
190     LZMA   = 3,
191     RANS0  = 4,
192     BSC    = 5,
193     FQZ    = 6,
194     RANS1  = 10,   // Not externalised; stored as RANS (generic)
195     GZIP_RLE = 11, // Z_RLE, NB: not externalised in CRAM
196     GZIP_1 = 12,   // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM
197 
198     RANS_PR0   = 13, // Parameterised r4x16pr rANS codecs.
199     RANS_PR1   = 14,
200     RANS_PR64  = 15,
201     RANS_PR65  = 16,
202     RANS_PR128 = 17,
203     RANS_PR129 = 18,
204     RANS_PR192 = 19,
205     RANS_PR193 = 20,
206 
207     NAME_TOK3  = 21,
208 
209     BLOCK_METHOD_END = 31
210 };
211 
212 enum cram_content_type {
213     CT_ERROR           = -1,
214     FILE_HEADER        = 0,
215     COMPRESSION_HEADER = 1,
216     MAPPED_SLICE       = 2,
217     UNMAPPED_SLICE     = 3, // CRAM_1_VERS only
218     EXTERNAL           = 4,
219     CORE               = 5,
220 };
221 
222 /* Maximum simultaneous codecs allowed, 1 per bit */
223 #define CRAM_MAX_METHOD 32
224 
225 /* Compression metrics */
226 typedef struct {
227     // number of trials and time to next trial
228     int trial;
229     int next_trial;
230     int consistency;
231 
232     // aggregate sizes during trials
233     int sz[CRAM_MAX_METHOD];
234 
235     // resultant method from trials
236     int method;
237     int strat;
238 
239     // Revisions of method, to allow culling of continually failing ones.
240     int cnt[CRAM_MAX_METHOD];
241     int revised_method;
242 
243     double extra[CRAM_MAX_METHOD];
244 
245     cram_stats *stats;
246 } cram_metrics;
247 
248 /* Block */
249 typedef struct {
250     enum cram_block_method  method, orig_method;
251     enum cram_content_type  content_type;
252     int32_t  content_id;
253     int32_t  comp_size;
254     int32_t  uncomp_size;
255     uint32_t crc32;
256     int32_t  idx; /* offset into data */
257     unsigned char    *data;
258 
259     // For bit I/O
260     size_t alloc;
261     size_t byte;
262     int bit;
263 
264     // To aid compression
265     cram_metrics *m; // used to track aux block compression only
266 
267     int crc32_checked;
268     uint32_t crc_part;
269 } cram_block;
270 
271 struct cram_codec; /* defined in cram_codecs.h */
272 struct cram_map;
273 
274 #define CRAM_MAP_HASH 32
275 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
276 
277 /* Compression header block */
278 typedef struct {
279     int32_t ref_seq_id;
280     int64_t ref_seq_start;
281     int64_t ref_seq_span;
282     int32_t num_records;
283     int32_t num_landmarks;
284     int32_t *landmark;
285 
286     /* Flags from preservation map */
287     int mapped_qs_included;
288     int unmapped_qs_included;
289     int unmapped_placed;
290     int qs_included;
291     int read_names_included;
292     int AP_delta;
293     // indexed by ref-base and subst. code
294     char substitution_matrix[5][4];
295     int no_ref;
296 
297     // TD Dictionary as a concatenated block
298     cram_block *TD_blk;  // Tag Dictionary
299     int nTL;		 // number of TL entries in TD
300     unsigned char **TL;  // array of size nTL, pointer into TD_blk.
301     HashTable *TD;       // for encoding, keyed on TD entries
302 
303     HashTable *preservation_map;
304     struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
305     struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
306 
307     struct cram_codec *codecs[DS_END];
308 
309     char *uncomp; // A single block of uncompressed data
310     size_t uncomp_size, uncomp_alloc;
311 } cram_block_compression_hdr;
312 
313 typedef struct cram_map {
314     int key;    /* 0xe0 + 3 bytes */
315     enum cram_encoding encoding;
316     int offset; /* Offset into a single block of memory */
317     int size;   /* Size */
318     struct cram_codec *codec;
319     struct cram_map *next; // for noddy internal hash
320 } cram_map;
321 
322 typedef struct {
323     struct cram_codec *codec;
324     cram_block *blk;
325     cram_metrics *m;
326 } cram_tag_map;
327 
328 /* Mapped or unmapped slice header block */
329 typedef struct {
330     enum cram_content_type content_type;
331     int32_t ref_seq_id;     /* if content_type == MAPPED_SLICE */
332     int64_t ref_seq_start;  /* if content_type == MAPPED_SLICE */
333     int64_t ref_seq_span;   /* if content_type == MAPPED_SLICE */
334     int32_t num_records;
335     int64_t record_counter;
336     int32_t num_blocks;
337     int32_t num_content_ids;
338     int32_t *block_content_ids;
339     int32_t ref_base_id;    /* if content_type == MAPPED_SLICE */
340     unsigned char md5[16];
341     HashTable *tags;        /* hash of optional tags */
342     uint32_t BD_crc;        /* base call digest */
343     uint32_t SD_crc;        /* quality score digest */
344 } cram_block_slice_hdr;
345 
346 struct ref_entry;
347 
348 /*
349  * Container.
350  *
351  * Conceptually a container is split into slices, and slices into blocks.
352  * However on disk it's just a list of blocks and we need to query the
353  * block types to identify the start/end points of the slices.
354  *
355  * OR... are landmarks the start/end points of slices?
356  */
357 typedef struct {
358     int32_t  length;
359     int32_t  ref_seq_id;
360     int64_t  ref_seq_start;
361     int64_t  ref_seq_span;
362     int64_t  record_counter;
363     int64_t  num_bases;
364     int32_t  num_records;
365     int32_t  num_blocks;
366     int32_t  num_landmarks;
367     int32_t *landmark;
368 
369     /* Size of container header above */
370     size_t   offset;
371 
372     /* Compression header is always the first block? */
373     cram_block_compression_hdr *comp_hdr;
374     cram_block *comp_hdr_block;
375 
376     /* For construction purposes */
377     int max_slice, curr_slice;   // maximum number of slices
378     int curr_slice_mt;           // Curr slice when reading ahead (via threads)
379     int max_rec, curr_rec;       // current and max recs per slice
380     int max_c_rec, curr_c_rec;   // current and max recs per container
381     int slice_rec;               // rec no. for start of this slice
382     int curr_ref;                // current ref ID. -2 for no previous
383     int64_t last_pos;                // last record position
384     struct cram_slice **slices, *slice;
385     int pos_sorted;              // boolean, 1=>position sorted data
386     int64_t max_apos;                // maximum position, used if pos_sorted==0
387     int last_slice;              // number of reads in last slice (0 for 1st)
388     int multi_seq;               // true if packing multi seqs per cont/slice
389     int unsorted;		 // true is AP_delta is 0.
390 
391     /* Copied from fd before encoding, to allow multi-threading */
392     int64_t ref_start, first_base, last_base, ref_id, ref_end;
393     char *ref;
394     //struct ref_entry *ref;
395 
396     /* For multi-threading */
397     bam_seq_t **bams;
398 
399     /* Statistics for encoding */
400     cram_stats *stats[DS_END];
401 
402     HashTable *tags_used; // cram_tag_map[], per tag types in use.
403 
404     int *refs_used;       // array of frequency of ref seq IDs
405 
406     // For experimental name delta
407     char *last_name;
408 
409     uint32_t crc32;       // Raw container bytes CRC
410 
411     uint64_t s_num_bases; // number of bases in this slice
412 } cram_container;
413 
414 /*
415  * A single cram record
416  */
417 typedef struct {
418     struct cram_slice *s; // Filled out by cram_decode only
419 
420     int32_t ref_id;       // fixed for all recs in slice?
421     int32_t flags;        // BF
422     int32_t cram_flags;   // CF
423     int32_t len;          // RL
424     int64_t apos;         // AP
425     int32_t rg;           // RG
426     int32_t name;         // RN; idx to s->names_blk
427     int32_t name_len;
428     int32_t mate_line;    // index to another cram_record
429     int32_t mate_ref_id;
430     int64_t mate_pos;     // NP
431     int64_t tlen;         // TS
432 
433     // Auxiliary data
434     int32_t ntags;        // TC
435     int32_t aux;          // idx to s->aux_blk
436     int32_t aux_size;     // total size of packed ntags in aux_blk
437 #ifndef TN_external
438     int32_t TN_idx;       // TN; idx to s->TN;
439 #else
440     int32_t tn;           // idx to s->tn_blk
441 #endif
442     int     TL;
443 
444     int32_t seq;          // idx to s->seqs_blk
445     int32_t qual;         // idx to s->qual_blk
446     int32_t cigar;        // idx to s->cigar
447     int32_t ncigar;
448     int64_t aend;         // alignment end
449     int32_t mqual;        // MQ
450 
451     int32_t feature;      // idx to s->feature
452     int32_t nfeature;     // number of features
453     int32_t mate_flags;   // MF
454 } cram_record;
455 
456 // Accessor macros as an analogue of the bam ones
457 #define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
458 #define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
459 #define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
460 #define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
461 #define cram_seqi(c,i)   (cram_seq((c))[(i)])
462 #define cram_name_len(c) ((c)->name_len)
463 #define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
464 #define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
465 #define cram_cigar(c)    (&((cr)->s->cigar)[(c)->cigar])
466 
467 /*
468  * A feature is a base difference, used for the sequence reference encoding.
469  * (We generate these internally when writing CRAM.)
470  */
471 typedef struct {
472     union {
473 	struct {
474 	    int pos;
475 	    int code;
476 	    int base;    // substitution code
477 	} X;
478 	struct {
479 	    int pos;
480 	    int code;
481 	    int base;    // actual base & qual
482 	    int qual;
483 	} B;
484 	struct {
485 	    int pos;
486 	    int code;
487 	    int seq_idx; // index to s->seqs_blk
488 	    int len;
489 	} b;
490 	struct {
491 	    int pos;
492 	    int code;
493 	    int qual;
494 	} Q;
495 	struct {
496 	    int pos;
497 	    int code;
498 	    int len;
499 	    int seq_idx; // soft-clip multiple bases
500 	} S;
501 	struct {
502 	    int pos;
503 	    int code;
504 	    int len;
505 	    int seq_idx; // insertion multiple bases
506 	} I;
507 	struct {
508 	    int pos;
509 	    int code;
510 	    int base; // insertion single base
511 	} i;
512 	struct {
513 	    int pos;
514 	    int code;
515 	    int len;
516 	} D;
517 	struct {
518 	    int pos;
519 	    int code;
520 	    int len;
521 	} N;
522 	struct {
523 	    int pos;
524 	    int code;
525 	    int len;
526 	} P;
527 	struct {
528 	    int pos;
529 	    int code;
530 	    int len;
531 	} H;
532     };
533 } cram_feature;
534 
535 //// Turns [A-Z][A-Z] into an integer from 0 to 32*32
536 //#define ID(a) ((((a)[0]-'A')<<5)+(a)[1]-'A')
537 
538 /*
539  * A slice is really just a set of blocks, but it
540  * is the logical unit for decoding a number of
541  * sequences.
542  */
543 typedef struct cram_slice {
544     cram_block_slice_hdr *hdr;
545     cram_block *hdr_block;
546     cram_block **block;
547     cram_block **block_by_id;
548 
549     /* State used during encoding/decoding */
550     int64_t last_apos, max_apos;
551 
552     /* Array of decoded cram records */
553     cram_record *crecs;
554 
555     /* An dynamically growing buffers for data pointed
556      * to by crecs[] array.
557      */
558     uint32_t  *cigar;
559     uint32_t   cigar_alloc;
560     uint32_t   ncigar;
561 
562     cram_feature *features;
563     int           nfeatures;
564     int           afeatures; // allocated size of features
565 
566 #ifndef TN_external
567     // TN field (Tag Name)
568     uint32_t      *TN;
569     int           nTN, aTN;  // used and allocated size for TN[]
570 #else
571     cram_block *tn_blk;
572     int tn_id;
573 #endif
574 
575     // For variable sized elements which are always external blocks.
576     cram_block *name_blk;
577     cram_block *seqs_blk;
578     cram_block *qual_blk;
579     cram_block *base_blk;
580     cram_block *soft_blk;
581     cram_block *aux_blk;  // BAM aux block, used when going from CRAM to BAM
582 
583     HashTable *pair[2];      // for identifying read-pairs in this slice.
584 
585     char *ref;               // slice of current reference
586     int ref_start;           // start position of current reference;
587     int ref_end;             // end position of current reference;
588     int ref_id;
589 
590     uint32_t BD_crc;         // base call digest
591     uint32_t SD_crc;         // quality score digest
592 
593     // For going from BAM to CRAM; an array of auxiliary blocks per type
594     int naux_block;
595     cram_block **aux_block;
596 
597     unsigned int data_series;    // See cram_fields enum below
598     int decode_md;
599 
600     // Caching of block ID to block ptr for some blocks.
601     cram_block *id2blk[256];
602 
603     int max_rec, curr_rec;       // current and max recs per slice
604     int slice_num;               // To be copied into c->curr_slice in decode
605 
606     // Cache of converted BAM structs
607     bam_seq_t **bl;
608 } cram_slice;
609 
610 /*-----------------------------------------------------------------------------
611  * Consider moving reference handling to cram_refs.[ch]
612  */
613 // from fa.fai / samtools faidx files
614 typedef struct ref_entry {
615     char *name;
616     char *fn;
617     int64_t length;
618     int64_t offset;
619     int bases_per_line;
620     int line_length;
621     int64_t count;	   // for shared references so we know to dealloc seq
622     char *seq;
623     mFILE *mf;
624 } ref_entry;
625 
626 // References structure.
627 typedef struct {
628     string_alloc_t *pool;  // String pool for holding filenames and SN vals
629 
630     HashTable *h_meta;     // ref_entry*, index by name
631     ref_entry **ref_id;    // ref_entry*, index by ID
632     int nref;              // number of ref_entry
633 
634     char *fn;              // current file opened
635     bzi_FILE *fp;          // and the bzi_FILE* to go with it.
636 
637     int count;             // how many cram_fd sharing this refs struct
638 
639     pthread_mutex_t lock;  // Mutex for multi-threaded updating
640     ref_entry *last;       // Last queried sequence
641     int last_id;           // Used in cram_ref_decr_locked to delay free
642 } refs_t;
643 
644 /*-----------------------------------------------------------------------------
645  * CRAM index
646  *
647  * Detect format by number of entries per line.
648  * 5 => 1.0 (refid, start, nseq, C offset, slice)
649  * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
650  *
651  * Indices are stored in a nested containment list, which is trivial to set
652  * up as the indices are on sorted data so we're appending to the nclist
653  * in sorted order. Basically if a slice entirely fits within a previous
654  * slice then we append to that slices list. This is done recursively.
655  *
656  * Lists are sorted on two dimensions: ref id + slice coords.
657  */
658 typedef struct cram_index {
659     int nslice, nalloc;   // total number of slices
660     struct cram_index *e; // array of size nslice
661 
662     int     refid;  // 1.0                 1.1
663     int     start;  // 1.0                 1.1
664     int     end;    //                     1.1
665     int     nseq;   // 1.0 - undocumented
666     int     slice;  // 1.0 landmark index, 1.1 landmark value
667     int     len;    //                     1.1 - size of slice in bytes
668     int64_t offset; // 1.0                 1.1
669 } cram_index;
670 
671 typedef struct {
672     int refid;
673     int64_t start;
674     int64_t end;
675 } cram_range;
676 
677 /*-----------------------------------------------------------------------------
678  */
679 /* CRAM File handle */
680 
681 typedef struct spare_bams {
682     bam_seq_t **bams;
683     struct spare_bams *next;
684 } spare_bams;
685 
686 #if defined(CRAM_IO_CUSTOM_BUFFERING)
687 typedef size_t (*cram_io_C_FILE_fread_t)(void *ptr, size_t size, size_t nmemb, void *stream);
688 typedef size_t (*cram_io_C_FILE_fwrite_t)(void *ptr, size_t size, size_t nmemb, void *stream);
689 typedef int    (*cram_io_C_FILE_fseek_t)(void * fd, off_t offset, int whence);
690 typedef off_t  (*cram_io_C_FILE_ftell_t)(void * fd);
691 
692 typedef struct {
693     void                   *user_data;
694     cram_io_C_FILE_fread_t  fread_callback;
695     cram_io_C_FILE_fseek_t  fseek_callback;
696     cram_io_C_FILE_ftell_t  ftell_callback;
697 } cram_io_input_t;
698 
699 typedef struct {
700     void                   *user_data;
701     cram_io_C_FILE_fwrite_t fwrite_callback;
702     cram_io_C_FILE_ftell_t  ftell_callback;
703 } cram_io_output_t;
704 
705 typedef cram_io_input_t * (*cram_io_allocate_read_input_t)(char const * filename, int const decompress);
706 typedef cram_io_input_t * (*cram_io_deallocate_read_input_t)(cram_io_input_t * obj);
707 
708 typedef cram_io_output_t * (*cram_io_allocate_write_output_t)(char const * filename);
709 typedef cram_io_output_t * (*cram_io_deallocate_write_output_t)(cram_io_output_t * obj);
710 
711 
712 
713 // FIXME: make cram_fd_input_buffer and cram_fd_input_buffer the same thing.
714 // Ie cram_fd_io_buffer and internals fp_io_*.
715 
716 typedef struct {
717     /* input buffer size */
718     size_t         fp_in_buf_size;
719     /* input buffer base pointer */
720     char          *fp_in_buffer;
721     /* position of buffer start in file */
722     uint64_t       fp_in_buf_start;
723     /* start of window pointer; same as fp_in_buffer */
724     char          *fp_in_buf_pa;
725     /* window current pointer */
726     char          *fp_in_buf_pc;
727     /* window end pointer;  same as fp_in_buffer + fp_in_buf_size (no seeks) */
728     char          *fp_in_buf_pe;
729 } cram_fd_input_buffer;
730 
731 typedef struct {
732     /* output buffer size */
733     size_t         fp_out_buf_size;
734     /* output buffer base pointer */
735     char          *fp_out_buffer;
736     /* position of buffer start in file */
737     uint64_t       fp_out_buf_start;
738     /* start of window pointer; same as fp_out_buffer */
739     char          *fp_out_buf_pa;
740     /* window current pointer */
741     char          *fp_out_buf_pc;
742     /* window end pointer */
743     char          *fp_out_buf_pe;
744 } cram_fd_output_buffer;
745 #endif
746 
747 typedef struct {
748     FILE                 *fp_in;
749 #if defined(CRAM_IO_CUSTOM_BUFFERING)
750     cram_fd_input_buffer            *fp_in_buffer;
751     cram_io_input_t                 *fp_in_callbacks;
752     cram_io_allocate_read_input_t    fp_in_callback_allocate_function;
753     cram_io_deallocate_read_input_t  fp_in_callback_deallocate_function;
754 
755     cram_fd_output_buffer            *fp_out_buffer;
756     cram_io_output_t                 *fp_out_callbacks;
757     cram_io_allocate_write_output_t   fp_out_callback_allocate_function;
758     cram_io_deallocate_write_output_t fp_out_callback_deallocate_function;
759 #endif
760 
761     FILE          *fp_out;
762     int            mode;     // 'r' or 'w'
763     int            version;
764     cram_file_def *file_def;
765     SAM_hdr       *header;
766 
767     char          *prefix;
768     int64_t        record_counter;
769     int            err;
770 
771     // Most recent compression header decoded
772     //cram_block_compression_hdr *comp_hdr;
773     //cram_block_slice_hdr       *slice_hdr;
774 
775     // Current container being processed.
776     cram_container *ctr;
777 
778     // Current container used for decoder threads
779     cram_container *ctr_mt;
780 
781     // positions for encoding or decoding
782     int64_t first_base, last_base;
783 
784     // cached reference portion
785     refs_t *refs;              // ref meta-data structure
786     char *ref, *ref_free;      // current portion held in memory
787     int   ref_id;
788     int   ref_start;
789     int   ref_end;
790     char *ref_fn;   // reference fasta filename
791 
792     // compression level and metrics
793     int level;
794     cram_metrics *m[DS_END];
795     HashTable *tags_used; // cram_metrics[], per tag types in use.
796 
797     // options
798     int decode_md; // Whether to export MD and NM tags
799     int verbose;
800     int seqs_per_slice;
801     int bases_per_slice;
802     int slices_per_container;
803     int embed_ref;
804     int no_ref;
805     int ignore_md5;
806     int use_bz2;
807     int use_rans;
808     int use_lzma;
809     int use_bsc;
810     int use_fqz;
811     int shared_ref;
812     enum quality_binning binning;
813     unsigned int required_fields;
814     cram_range range;
815 
816     // lookup tables, stored here so we can be trivially multi-threaded
817     unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
818     unsigned int cram_flag_swap[0x1000];// bam -> cram flags
819     unsigned char L1[256];              // ACGT{*} ->0123{4}
820     unsigned char L2[256];              // ACGTN{*}->01234{5}
821     char cram_sub_matrix[32][32];	// base substituion codes
822 
823     int         index_sz;
824     cram_index *index;                  // array, sizeof index_sz
825     off_t first_container;
826     int eof;
827     int last_slice;                     // number of recs encoded in last slice
828     int multi_seq;
829     int unsorted;
830     int empty_container; 		// Marker for EOF block
831 
832     // thread pool
833     int own_pool;
834     t_pool *pool;
835     t_results_queue *rqueue;
836     pthread_mutex_t *metrics_lock;
837     pthread_mutex_t *ref_lock;
838     spare_bams *bl;
839     pthread_mutex_t *bam_list_lock;
840     void *job_pending;
841 
842     int ooc;                            // out of containers.
843     int ignore_chksum;
844     int lossy_read_names;
845     int preserve_aux_order;             // if set implies emitting RG, MD and NM
846     int preserve_aux_size;              // does not replace 'i' with 'c' etc in aux.
847 } cram_fd;
848 
849 #if defined(CRAM_IO_CUSTOM_BUFFERING)
850 extern size_t cram_io_input_buffer_read(void *ptr, size_t size, size_t nmemb, cram_fd * fd);
851 extern int cram_io_input_buffer_seek(cram_fd * fd, off_t offset, int whence);
852 extern int cram_io_input_buffer_underflow(cram_fd * fd);
853 extern char * cram_io_input_buffer_fgets(char * s, int size, cram_fd * fd);
854 extern int cram_io_flush_output_buffer(cram_fd *fd);
855 #endif
856 
857 #if defined(CRAM_IO_CUSTOM_BUFFERING)
858 #define CRAM_IO_GETC(fd) ((fd->fp_in_buffer->fp_in_buf_pc!=fd->fp_in_buffer->fp_in_buf_pe) ? ((int)((unsigned char)(*(fd->fp_in_buffer->fp_in_buf_pc++)))) : cram_io_input_buffer_underflow(fd))
859 #define CRAM_IO_READ(ptr, size, nmemb, fd) cram_io_input_buffer_read(ptr,size,nmemb,fd)
860 #define CRAM_IO_SEEK(fd, offset, whence) cram_io_input_buffer_seek(fd, offset, whence)
861 #define CRAM_IO_TELLO(fd) (fd->fp_in_buffer->fp_in_buf_start +(fd->fp_in_buffer->fp_in_buf_pc-fd->fp_in_buffer->fp_in_buf_pa))
862 #define CRAM_IO_FGETS(s,size,fd) cram_io_input_buffer_fgets(s,size,fd)
863 #define CRAM_IO_PUTC(c,fd) cram_io_output_buffer_putc(c,fd)
864 #define CRAM_IO_WRITE(ptr, size, nmemb, fd) cram_io_output_buffer_write(ptr,size,nmemb,fd)
865 #define CRAM_IO_FLUSH(fd) cram_io_flush_output_buffer((fd))
866 
867 #else // ! CRAM_IO_CUSTOM_BUFFERING
868 #define CRAM_IO_GETC(fd) getc(fd->fp_in)
869 #define CRAM_IO_READ(ptr, size, nmemb, fd) fread(ptr,size,nmemb,fd->fp_in)
870 #define CRAM_IO_TELLO(fd) ftello(fd->fp_in)
871 #define CRAM_IO_SEEK(fd, offset, whence) fseeko(fd->fp_in,offset,whence)
872 #define CRAM_IO_FGETS(s,size,fd) fgets(s,size,fd->fp_in)
873 #define CRAM_IO_PUTC(c,fd) putc(c,fd->fp_out)
874 #define CRAM_IO_WRITE(ptr, size, nmemb, fd) fwrite(ptr,size,nmemb,fd->fp_out)
875 #define CRAM_IO_FLUSH(fd) (fd->fp_out ? fflush(fd->fp_out) : 0)
876 
877 #endif // end CRAM_IO_CUSTOM_BUFFERING
878 
879 // REQUIRED_FIELDS
880 enum sam_fields {
881     SAM_QNAME = 0x00000001,
882     SAM_FLAG  = 0x00000002,
883     SAM_RNAME = 0x00000004,
884     SAM_POS   = 0x00000008,
885     SAM_MAPQ  = 0x00000010,
886     SAM_CIGAR = 0x00000020,
887     SAM_RNEXT = 0x00000040,
888     SAM_PNEXT = 0x00000080,
889     SAM_TLEN  = 0x00000100,
890     SAM_SEQ   = 0x00000200,
891     SAM_QUAL  = 0x00000400,
892     SAM_AUX   = 0x00000800,
893     SAM_RGAUX = 0x00001000,
894 };
895 
896 // Translation of required fields to cram data series
897 enum cram_fields {
898     CRAM_BF = 0x00000001,
899     CRAM_AP = 0x00000002,
900     CRAM_FP = 0x00000004,
901     CRAM_RL = 0x00000008,
902     CRAM_DL = 0x00000010,
903     CRAM_NF = 0x00000020,
904     CRAM_BA = 0x00000040,
905     CRAM_QS = 0x00000080,
906     CRAM_FC = 0x00000100,
907     CRAM_FN = 0x00000200,
908     CRAM_BS = 0x00000400,
909     CRAM_IN = 0x00000800,
910     CRAM_RG = 0x00001000,
911     CRAM_MQ = 0x00002000,
912     CRAM_TL = 0x00004000,
913     CRAM_RN = 0x00008000,
914     CRAM_NS = 0x00010000,
915     CRAM_NP = 0x00020000,
916     CRAM_TS = 0x00040000,
917     CRAM_MF = 0x00080000,
918     CRAM_CF = 0x00100000,
919     CRAM_RI = 0x00200000,
920     CRAM_RS = 0x00400000,
921     CRAM_PD = 0x00800000,
922     CRAM_HC = 0x01000000,
923     CRAM_SC = 0x02000000,
924     CRAM_BB = 0x04000000,
925     CRAM_BB_len = 0x08000000,
926     CRAM_QQ = 0x10000000,
927     CRAM_QQ_len = 0x20000000,
928     CRAM_aux= 0x40000000,
929     CRAM_ALL= 0x7fffffff,
930 };
931 
932 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
933 // encode a base difference, but we don't need to know what it is for CIGAR.
934 // If we have a soft-clip or insertion, we do need SC/IN though to know how
935 // long that array is.
936 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
937 		    CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
938 
939 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
940 		  CRAM_RL    | CRAM_AP | CRAM_BB)
941 
942 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
943 
944 enum cram_option {
945     CRAM_OPT_DECODE_MD,
946     CRAM_OPT_PREFIX,
947     CRAM_OPT_VERBOSITY,
948     CRAM_OPT_SEQS_PER_SLICE,
949     CRAM_OPT_SLICES_PER_CONTAINER,
950     CRAM_OPT_RANGE,
951     CRAM_OPT_VERSION,
952     CRAM_OPT_EMBED_REF,
953     CRAM_OPT_IGNORE_MD5,
954     CRAM_OPT_REFERENCE,
955     CRAM_OPT_MULTI_SEQ_PER_SLICE,
956     CRAM_OPT_NO_REF,
957     CRAM_OPT_USE_BZIP2,
958     CRAM_OPT_SHARED_REF,
959     CRAM_OPT_NTHREADS,
960     CRAM_OPT_THREAD_POOL,
961     CRAM_OPT_BINNING,
962     CRAM_OPT_USE_ARITH,
963     CRAM_OPT_USE_LZMA,
964     CRAM_OPT_REQUIRED_FIELDS,
965     CRAM_OPT_USE_RANS,
966     CRAM_OPT_IGNORE_CHKSUM,
967     CRAM_OPT_BASES_PER_SLICE,
968     CRAM_OPT_LOSSY_READ_NAMES,
969     CRAM_OPT_PRESERVE_AUX_ORDER,
970     CRAM_OPT_PRESERVE_AUX_SIZE,
971     CRAM_OPT_WITH_BGZIP_INDEX,
972     CRAM_OPT_OUTPUT_BGZIP_IDX,
973     CRAM_OPT_USE_BSC,
974     CRAM_OPT_USE_FQZ,
975 };
976 
977 /* BF bitfields */
978 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
979 #define CRAM_FPAIRED      256
980 #define CRAM_FPROPER_PAIR 128
981 #define CRAM_FUNMAP        64
982 #define CRAM_FREVERSE      32
983 #define CRAM_FREAD1        16
984 #define CRAM_FREAD2         8
985 #define CRAM_FSECONDARY     4
986 #define CRAM_FQCFAIL        2
987 #define CRAM_FDUP           1
988 
989 #define DS_aux_S "\001"
990 #define DS_aux_OQ_S "\002"
991 #define DS_aux_BQ_S "\003"
992 #define DS_aux_BD_S "\004"
993 #define DS_aux_BI_S "\005"
994 #define DS_aux_FZ_S "\006"
995 #define DS_aux_oq_S "\007"
996 #define DS_aux_os_S "\010"
997 #define DS_aux_oz_S "\011"
998 
999 #define CRAM_M_REVERSE  1
1000 #define CRAM_M_UNMAP    2
1001 
1002 
1003 /* CF bitfields */
1004 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
1005 #define CRAM_FLAG_DETACHED             (1<<1)
1006 #define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
1007 #define CRAM_FLAG_NO_SEQ               (1<<3)
1008 #define CRAM_FLAG_MASK                 ((1<<4)-1)
1009 
1010 /* Internal only */
1011 #define CRAM_FLAG_STATS_ADDED  (1<<30)
1012 #define CRAM_FLAG_DISCARD_NAME (1<<31)
1013 
1014 #ifdef __cplusplus
1015 }
1016 #endif
1017 
1018 #endif /* _CRAM_STRUCTS_H_ */
1019