1 /*
2  * Copyright (c) 2013 Genome Research Ltd.
3  * Author(s): James Bonfield, Rob Davies
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author: James Bonfield, Wellcome Trust Sanger Institute. 2010-3
36  */
37 
38 /*! \file
39  * The primary SAM/BAM API.
40  *
41  * Consider using scram.h if you wish to also have support for CRAM.
42  */
43 
44 #ifndef _BAM_H_
45 #define _BAM_H_
46 
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50 
51 #include <inttypes.h>
52 #include <zlib.h>
53 
54 #include "io_lib/os.h"
55 #include "io_lib/hash_table.h"
56 #include "io_lib/sam_header.h"
57 #include "io_lib/thread_pool.h"
58 #include "io_lib/binning.h"
59 #include "io_lib/bgzip.h"
60 
61 /* BAM header structs */
62 typedef struct tag_list {
63     char *value; /* NULL => end of tags */
64     int key;
65     int length;
66 } tag_list_t;
67 
68 /* The main bam sequence struct */
69 typedef struct bam_seq_s {
70     uint32_t alloc; /* total size of this struct + 'data' onwards */
71     uint32_t blk_size;
72 
73     // 64-bit variant of our position, for general use
74     int64_t pos;
75     int64_t mate_pos;
76     int64_t ins_size;
77 
78     /* The raw bam block follows, in same order as on the disk */
79     /* This is the analogue of a bam1_core_t in samtools */
80     int32_t  ref;
81     int32_t  pos_32;
82 
83     union {
84 	struct {
85 	    uint32_t name_len:8, map_qual:8, bin:16;
86 	};
87 	uint32_t bin_packed;
88     };
89     union {
90 	struct {
91 	    uint32_t cigar_len:16, flag:16;
92 	};
93 	uint32_t flag_packed;
94     };
95 
96     int32_t  len;
97     int32_t  mate_ref;
98     int32_t  mate_pos_32;
99     int32_t  ins_size_32;
100 
101     /* Followed by arbitrary bytes of packed data */
102     unsigned char data; /* unknown size */
103 } bam_seq_t;
104 
105 
106 /* Auxillary field handling */
107 typedef union {
108     char    *s;
109     int      i;
110     uint64_t i64;
111     float    f;
112     double   d;
113     struct {
114 	int n, t;
115 	unsigned char *s;
116     } B;
117 } bam_aux_t;
118 
119 /* Struct for making arrays of aux tags */
120 
121 typedef struct {
122     char     tag[2];
123     char     type;
124     uint32_t array_len;
125     union {
126 	char    *z;
127 	uint8_t *h;
128 	char     a;
129 	int32_t  i;
130 	uint32_t ui;
131 	float    f;
132 	double   d;
133 	void    *array;
134     } value;
135 } bam_aux_tag_t;
136 
137 /*
138  * Our bam stream consists of a zlib gzFile stream and a buffer for it to
139  * output to. This allows us to call small bam_read requests while
140  * translating these to fewer, larger, gzread calls. The overhead is
141  * therefore minimal.
142  */
143 #define Z_BUFF_SIZE 65536    /* Max size of a zlib block */
144 #define BGZF_BUFF_SIZE 65273 // 65535 - MIN_LOOKAHEAD to avoid fill_window()
145 typedef struct {
146     FILE *fp;
147 
148     int mode, binary, level;
149     z_stream s;
150 
151     unsigned char comp[Z_BUFF_SIZE];
152     unsigned char *comp_p;
153     size_t comp_sz;
154 
155     unsigned char uncomp[Z_BUFF_SIZE];
156     unsigned char *uncomp_p;
157     size_t uncomp_sz;
158 
159     /* BAM specifics */
160     int32_t next_len;
161 
162     SAM_hdr *header;      /* Parsed SAM header */
163 
164     /* Cached bam_seq_t, to avoid excessive mallocs */
165     bam_seq_t *bs;
166     int bs_size;
167 
168     /* Boolean to indicate if we've finished the most recent z stream */
169     int z_finish;
170 
171     /* Indicates whether gzipped, and if so with bgzf extra fields */
172     int gzip;
173     int bgzf;
174 
175     /* Whether BAM or SAM format */
176     int bam;
177 
178     /* If true, skip auxillary field parsing while reading SAM */
179     int no_aux;
180 
181     /* line number (when in SAM mode) */
182     int line;
183 
184     /* EOF block present in BAM */
185     int eof_block;
186 
187     /* Static avoidance: used in sam_next_seq() */
188     unsigned char *sam_str;
189     size_t alloc_l;
190 
191     /* Thread pool for encoding */
192     t_pool *pool;
193     t_results_queue *equeue;
194 
195     /* Decoding queue */
196     t_results_queue *dqueue;
197     void *job_pending;
198     int eof;
199     int nd_jobs, ne_jobs;
200 
201     /* Quality binning */
202     enum quality_binning binning;
203 
204     /* Disabling CRC checks */
205     int ignore_chksum;
206 
207     /* Used when gzi files are supplied. */
208     gzi *idx;
209     char *idx_fn;
210     uint64_t current_block;
211     unsigned char bgbuf[Z_BUFF_SIZE];
212     unsigned char *bgbuf_p;
213     size_t bgbuf_sz;
214 } bam_file_t;
215 
216 /* BAM flags */
217 #define BAM_FPAIRED           1
218 #define BAM_FPROPER_PAIR      2
219 #define BAM_FUNMAP            4
220 #define BAM_FMUNMAP           8
221 #define BAM_FREVERSE         16
222 #define BAM_FMREVERSE        32
223 #define BAM_FREAD1           64
224 #define BAM_FREAD2          128
225 #define BAM_FSECONDARY      256
226 #define BAM_FQCFAIL         512
227 #define BAM_FDUP           1024
228 #define BAM_FSUPPLEMENTARY 2048
229 
230 #define BAM_CIGAR32    32768
231 
232 /* Decoding the bam_seq_t struct */
233 #define bam_blk_size(b)       ((b)->blk_size)
234 #define bam_set_blk_size(b,v) ((b)->blk_size = (v))
235 
236 #define bam_ref(b)       ((b)->ref)
237 #define bam_pos(b)       ((b)->pos)
238 #define bam_mate_ref(b)  ((b)->mate_ref)
239 #define bam_mate_pos(b)  ((b)->mate_pos)
240 #define bam_ins_size(b)  ((b)->ins_size)
241 #define bam_cigar_len(b) (((b)->flag & BAM_CIGAR32 ? ((b)->bin<<16) : 0) + (b)->cigar_len)
242 #define bam_name_len(b)  ((b)->name_len)
243 #define bam_map_qual(b)  ((b)->map_qual)
244 #define bam_bin(b)       ((b)->flag & BAM_CIGAR32 ? 0 : (b)->bin)
245 #define bam_flag(b)      ((b)->flag)
246 #define bam_seq_len(b)   ((b)->len)
247 #define bam_strand(b)    ((bam_flag((b)) & BAM_FREVERSE) != 0)
248 #define bam_mstrand(b)   ((bam_flag((b)) & BAM_FMREVERSE) != 0)
249 
250 #define bam_set_ref(b,v)        ((b)->ref = (v))
251 #define bam_set_pos(b,v)        ((b)->pos = (v))
252 #define bam_set_mate_ref(b,v)   ((b)->mate_ref = (v))
253 #define bam_set_mate_pos(b,v)   ((b)->mate_pos = (v))
254 #define bam_set_ins_size(b,v)   ((b)->ins_size = (v))
255 #define bam_set_cigar_len(b, v) (((v)>>16) ? (((b)->flag |= BAM_CIGAR32), (b)->bin = ((v)>>16), (b)->cigar_len = (v)&0xffff) : ((b)->cigar_len = (v)))
256 #define bam_set_name_len(b,v)   ((b)->name_len = (v))
257 #define bam_set_map_qual(b,v)   ((b)->map_qual = (v))
bam_set_bin(bam_seq_t * b,uint32_t v)258 static inline void bam_set_bin(bam_seq_t *b, uint32_t v) {
259     if (!(b->flag & BAM_CIGAR32)) b->bin = v;
260 }
261 #define bam_set_flag(b,v)       ((b)->flag = (v))
262 #define bam_set_seq_len(b,v)    ((b)->len = (v))
263 
264 #define bam_name(b)      ((char *)(&(b)->data))
265 #ifdef ALLOW_UAC
266 #define bam_cigar(b)     ((uint32_t *)(bam_name((b)) + bam_name_len((b))))
267 #else
268 #define bam_cigar(b)     ((uint32_t *)(bam_name((b)) + round4(bam_name_len((b)))))
269 #endif
270 #define bam_seq(b)       (((char *)bam_cigar((b))) + 4*bam_cigar_len(b))
271 #define bam_qual(b)      (bam_seq(b) + (int)(((b)->len+1)/2))
272 #define bam_aux(b)       (bam_qual(b) + (b)->len)
273 
274 /* Rounds up to the next multiple of 4 */
275 #define round4(v) (((v-1)&~3)+4)
276 
277 /* CIGAR operations, taken from samtools bam.h */
278 #define BAM_CIGAR_SHIFT 4
279 #define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
280 
281 enum cigar_op {
282     BAM_UNKNOWN=-1,
283     BAM_CMATCH=0,
284     BAM_CINS=1,
285     BAM_CDEL=2,
286     BAM_CREF_SKIP=3,
287     BAM_CSOFT_CLIP=4,
288     BAM_CHARD_CLIP=5,
289     BAM_CPAD=6,
290     BAM_CBASE_MATCH=7,
291     BAM_CBASE_MISMATCH=8
292 };
293 
294 /*
295  * Whether this cigar op marches along ref, seq or both
296  *
297  * Op   Ref   Seq
298  * M    1     1
299  * I    0     1
300  * D    1     0
301  * N    1     0
302  * S    0     1
303  * H    0     0
304  * P    0     0
305  * =    1     1
306  * X    1     1
307  */
308 #define BAM_CONSUME_REF(op) ((0x18d>>(op))&1)
309 #define BAM_CONSUME_SEQ(op) ((0x193>>(op))&1)
310 
311 
312 /* ----------------------------------------------------------------------
313  * Function prototypes
314  * We only support reading, so basically we have open, read, close along
315  * with some utility functions for querying aux records.
316  */
317 
318 /*! Opens a SAM or BAM file.
319  *
320  * The mode parameter indicates the file
321  * type (if not auto-detecting) and whether it is for reading or
322  * writing. Use "rb" or "wb" for reading or writing BAM and "r" or
323  * "w" or reading or writing SAM. When writing BAM, the mode may end
324  * with a digit from 0 to 9 to indicate the compression to use with 0
325  * indicating uncompressed data.
326  *
327  * @param fn The filename to open or create.
328  * @param mode The input/output mode, similar to fopen().
329  *
330  * @return
331  * Returns a bam_file_t pointer on success;
332  *         NULL on failure.
333  */
334 bam_file_t *bam_open(const char *fn, const char *mode);
335 
336 bam_file_t *bam_open_block(const char *blk, size_t blk_size, SAM_hdr *sh);
337 
338 /*! Closes a SAM or BAM file.
339  *
340  * @param b The file to close.
341  *
342  * @return
343  * Retrurns 0 on success;
344  *         -1 on failure.
345  */
346 int bam_close(bam_file_t *b);
347 
348 /*! Deprecated: please use bam_get_seq() instead.
349  */
350 int bam_next_seq(bam_file_t *b, bam_seq_t **bsp);
351 
352 /*! Reads the next sequence.
353  *
354  * Fills out the next bam_seq_t struct.
355  * This function will alloc and/or grow the memory accordingly, allowing for
356  * efficient reuse.
357  *
358  * @param bsp Must be non-null, but *bsp may be NULL or an existing
359  * bam_seq_t pointer.
360  *
361  * @return
362  * Returns 1 on success;
363  *         0 on eof;
364  *        -1 on error.
365  */
366 int bam_get_seq(bam_file_t *b, bam_seq_t **bsp);
367 
368 /*!Looks for aux field 'key' and returns the value.
369  * The type is the first char and the value is the 2nd character onwards.
370  *
371  * @return
372  * Returns the value for key; NULL if not found.
373  */
374 char *bam_aux_find(bam_seq_t *b, const char *key);
375 
376 //!Converts an encoded integer value return by bam_aux_find to an integer.
377 //
378 //Analogous to the bam_aux2i functions in samtools.
379 int32_t bam_aux_i(const uint8_t *dat);
380 float bam_aux_f(const uint8_t *s);
381 double bam_aux_d(const uint8_t *s);
382 char bam_aux_A(const uint8_t *s);
383 char *bam_aux_Z(const uint8_t *s);
384 
385 //int bam_aux_del(bam_seq_t *b, uint8_t *s); // not implemented yet
386 
387 /*! Add auxiliary tags to a bam_seq_t structure.
388  *
389  * Appends a tag onto the end of a bam_seq_t structure.  The tag name
390  * is supplied in the 'tag' parameter, and the data type code in 'type'.
391  * Valid type codes are [AcCsSiIfdHZ], as described in the SAM specification.
392  *
393  * The array_len parameter is used for B type (i.e. array) tags.  If
394  * array_len is 0, an ordinary non-array tag is added.  If it is greater
395  * than zero, a B type tag of the apropriate data type is made.  Arrays
396  * of types H and Z are not allowed, so array_len must be zero if these types
397  * are specified.
398  *
399  * data should point to the data to be added.  This should be of appropriate
400  * size for the tag type, i.e. (u)int8_t for A, C and c; (u)int16_t for S and s;
401  * (u)int32_t for I and i; a float for f; a double for d; a NUL-terminated
402  * string for H and Z.  If array_len is greater than zero, then data should
403  * point to an array of the given type.  All data should be in the native
404  * format for the machine - it will be converted to little-endian if necessary
405  * by the function.
406  *
407  * @param b         Points to the location of a bam_seq_t *.  If (*b)->alloc
408  *                  is too small, the bam_seq_t struct will be reallocated.
409  *                  Neither b nor *b should be NULL.
410  * @param tag       The tag name (RG, NM, etc.)
411  * @param type      The tag data type.
412  * @param array_len Array length for array tags, zero for ordinary ones.
413  * @param data      Pointer to the tag value.
414  *
415  * @return
416  * Returns  0 on success;
417  *         -1 on error
418  */
419 
420 int bam_aux_add(bam_seq_t **b, const char tag[2], char type,
421 		uint32_t array_len, const void *data);
422 
423 /*! Add multiple auxiliary tags to a bam_seq_t structure
424  *
425  * bam_aux_add_vec adds one or more tags listed in the bam_aux_tag_t array
426  * to a bam_seq_t structure.  The bam_aux_tag_t struct has four elements,
427  * the tag name (char[2]), the type code (char), the array length for B tags and
428  * the value which is a union.
429  *
430  * The type code determines both the data type of the tag, and the member
431  * of value used to access the data.  If array_len is zero, data types 'H' and
432  * 'Z' are accessed via member value.h or value.z; 'f' via value.f; 'd' via
433  * value.d; 'A' via value.a.  Signed integers should have type 'i', and are
434  * accessed through value.i.  Unsigned integers should use type 'I' and
435  * value.ui.  The actual type used to store the integer will be the smallest
436  * that it will fit in, so signed and unsigned integers that fit in one byte
437  * will be stored as type 'c' or 'C'.  Similarly, integers that fit in two
438  * bytes will be stored as type 's' or 'S' for unsigned.
439  *
440  * If array_len is non-zero, a B-type (array) tag is stored.  In this case
441  * value.array should point to the data to be stored.  The type of array
442  * is interpreted according to the requested tag type, i.e. char for 'A';
443  * int8_t for 'c'; uint8_t for 'C'; int16_t for 's'; uint16_t for 'S';
444  * int32_t for 'i'; uint32_t for 'I'; float for 'f'; double for 'd'.  No
445  * attempt is made to adjust the size of integers when storing arrays. All
446  * data should be in the native format for the machine - it will be converted
447  * to little-endian if necessary by the function.
448  *
449  * @param b         Points to the location of a bam_seq_t *.  If (*b)->alloc
450  *                  is too small, the bam_seq_t struct will be reallocated.
451  *                  Neither b nor *b should be NULL.
452  * @param count     The number of elements in the tags array
453  * @param tags      Array of bam_aux_tag_t structs, listing the tags to add.
454  *
455  * @return
456  * Returns  0 on success;
457  *         -1 on error
458  */
459 
460 int bam_aux_add_vec(bam_seq_t **b, uint32_t count, bam_aux_tag_t *tags);
461 
462 /*! Calculate the amount of space needed to store auxiliary tags
463  *
464  * The tags array should be filled out as described in bam_aux_add_vec.
465  * This function iterates through the list of items in the tags array
466  * to work out how much space will be needed to store them.  This value
467  * can be passed to bam_construct_seq in the extra_len parameter to
468  * ensure that enough space is allocated to store the tags.
469  *
470  * @param count     The number of elements in the tags array
471  * @param tags      Array of bam_aux_tag_t structs, listing the tags to add.
472  *
473  * @return
474  * Returns the total space required for the tags on success;
475  *         -1 on failure.
476  */
477 
478 ssize_t bam_aux_size_vec(uint32_t count, bam_aux_tag_t *tags);
479 
480 /*! Helper for filling in the bam_aux_tag_t struct
481  * @param tag   Pointer to bam_aux_tag_t struct
482  * @param name  Tag name
483  * @param val   Tag value
484  */
bam_aux_tag_char(bam_aux_tag_t * tag,char * name,char val)485 static inline void bam_aux_tag_char(bam_aux_tag_t *tag, char *name, char val) {
486     tag->tag[0] = name[0];
487     tag->tag[1] = name[1];
488     tag->type = 'A';
489     tag->array_len = 0;
490     tag->value.a = val;
491 }
492 /*! Helper for filling in the bam_aux_tag_t struct
493  * @param tag   Pointer to bam_aux_tag_t struct
494  * @param name  Tag name
495  * @param val   Tag value
496  */
bam_aux_tag_int(bam_aux_tag_t * tag,char * name,int32_t val)497 static inline void bam_aux_tag_int(bam_aux_tag_t *tag,
498 				   char *name, int32_t val) {
499     tag->tag[0] = name[0];
500     tag->tag[1] = name[1];
501     tag->type = 'i';
502     tag->array_len = 0;
503     tag->value.i = val;
504 }
505 /*! Helper for filling in the bam_aux_tag_t struct
506  * @param tag   Pointer to bam_aux_tag_t struct
507  * @param name  Tag name
508  * @param val   Tag value
509  */
bam_aux_tag_uint(bam_aux_tag_t * tag,char * name,uint32_t val)510 static inline void bam_aux_tag_uint(bam_aux_tag_t *tag,
511 				    char *name, uint32_t val) {
512     tag->tag[0] = name[0];
513     tag->tag[1] = name[1];
514     tag->type = 'I';
515     tag->array_len = 0;
516     tag->value.ui = val;
517 }
518 /*! Helper for filling in the bam_aux_tag_t struct
519  * @param tag   Pointer to bam_aux_tag_t struct
520  * @param name  Tag name
521  * @param val   Tag value
522  */
bam_aux_tag_float(bam_aux_tag_t * tag,char * name,float val)523 static inline void bam_aux_tag_float(bam_aux_tag_t *tag,
524 				     char *name, float val) {
525     tag->tag[0] = name[0];
526     tag->tag[1] = name[1];
527     tag->type = 'f';
528     tag->array_len = 0;
529     tag->value.f = val;
530 }
531 /*! Helper for filling in the bam_aux_tag_t struct
532  * @param tag   Pointer to bam_aux_tag_t struct
533  * @param name  Tag name
534  * @param val   Tag value
535  */
bam_aux_tag_double(bam_aux_tag_t * tag,char * name,double val)536 static inline void bam_aux_tag_double(bam_aux_tag_t *tag,
537 				      char *name, double val) {
538     tag->tag[0] = name[0];
539     tag->tag[1] = name[1];
540     tag->type = 'd';
541     tag->array_len = 0;
542     tag->value.d = val;
543 }
544 /*! Helper for filling in the bam_aux_tag_t struct
545  * @param tag   Pointer to bam_aux_tag_t struct
546  * @param name  Tag name
547  * @param val   Tag value
548  */
bam_aux_tag_string(bam_aux_tag_t * tag,char * name,char * str)549 static inline void bam_aux_tag_string(bam_aux_tag_t *tag,
550 				      char *name, char *str) {
551     tag->tag[0] = name[0];
552     tag->tag[1] = name[1];
553     tag->type = 'Z';
554     tag->array_len = 0;
555     tag->value.z = str;
556 }
557 /*! Helper for filling in the bam_aux_tag_t struct
558  * @param tag   Pointer to bam_aux_tag_t struct
559  * @param name  Tag name
560  * @param val   Tag value
561  */
bam_aux_tag_hexstring(bam_aux_tag_t * tag,char * name,uint8_t * str)562 static inline void bam_aux_tag_hexstring(bam_aux_tag_t *tag,
563 					 char *name, uint8_t *str) {
564     tag->tag[0] = name[0];
565     tag->tag[1] = name[1];
566     tag->type = 'H';
567     tag->array_len = 0;
568     tag->value.h = str;
569 }
570 /*! Helper for filling in the bam_aux_tag_t struct
571  * @param tag   Pointer to bam_aux_tag_t struct
572  * @param name  Tag name
573  * @param val   Tag value
574  */
bam_aux_tag_array(bam_aux_tag_t * tag,char * name,char type,uint32_t array_len,void * data)575 static inline void bam_aux_tag_array(bam_aux_tag_t *tag, char *name, char type,
576 				     uint32_t array_len, void *data) {
577     tag->tag[0] = name[0];
578     tag->tag[1] = name[1];
579     tag->type = type;
580     tag->array_len = array_len;
581     tag->value.array = data;
582 }
583 
584 /*! Add SAM formatted aux tags to a bam_seq_t.
585  *
586  * Appends one or more SAM-format tags onto the end of a bam_seq_t structure.
587  * If multiple tags are present, they should be separated by tabs, as in
588  * a SAM file.
589  *
590  * @param bsp       Points to the location of a bam_seq_t *.  If (*bsp)->alloc
591  *                  is too small, the bam_seq_t struct will be reallocated.
592  *                  Neither bsp nor *bsp should be NULL.
593  * @param sam       SAM-foratted tag string.
594  *
595  * @return
596  * Returns  0 on success;
597  *         -1 on error
598  */
599 
600 int bam_aux_add_from_sam(bam_seq_t **bsp, char *sam);
601 
602 /*! Add preformated raw aux data to the bam_seq.
603  *
604  * This interface is similar to the samtools bam_aux_append function.
605  * It creates a tag with the given name and type, and then appends the
606  * supplied data to it as the value.  It is up to the caller to ensure that
607  * the data has been formatted correctly as given in the SAM specification.
608  * This function makes no checks on the data, but simply copies it.
609  *
610  * Consider using bam_aux_add instead if you have information in a more
611  * integer or string form.
612  *
613  * @param b         Points to the location of a bam_seq_t *.  If (*b)->alloc
614  *                  is too small, the bam_seq_t struct will be reallocated.
615  *                  Neither b nor *b should be NULL.
616  * @param tag       The tag name (RG, NM, etc.)
617  * @param type      The tag data type.
618  * @param len       The number of bytes of data present.
619  * @param data      Pre-formatted data.
620  *
621  * @return
622  * Returns 0 on success;
623  *        -1 on failure
624  */
625 int bam_aux_add_data(bam_seq_t **b, const char tag[2],
626 		     char type, size_t len, const uint8_t *data);
627 
628 /*! Add raw data to a bam structure.
629  *
630  * This could be useful if you wish to manually construct your own bam
631  * entries or if you need to append an entire block of preformatting
632  * aux data.
633  *
634  * @param b         Points to the location of a bam_seq_t *.  If (*b)->alloc
635  *                  is too small, the bam_seq_t struct will be reallocated.
636  *                  Neither b nor *b should be NULL.
637  * @param len       The number of bytes of data present.
638  * @param data      Pre-formatted data.
639  *
640  * @retrun
641  * Returns 0 on success;
642  *        -1 on failure
643  */
644 int bam_add_raw(bam_seq_t **b, size_t len, const uint8_t *data);
645 
646 /*! An iterator on bam_aux_t fields.
647  *
648  * NB: This code is not reentrant or multi-thread capable. The values
649  * returned are valid until the next call to this function.
650  *
651  * @param key  points to an array of 2 characters (eg "RG", "NM")
652  * @param type points to an address of 1 character (eg 'Z', 'i')
653  * @paran val  points to an address of a bam_aux_t union.
654  * @param iter_handle NULL to initialise the search, and then the
655  * returned (modified) iter_handle on each subsequent call to continue
656  * the iteration.
657  *
658  * @return
659  * Returns 0 if the next value is valid, setting key, type and val;
660  *        -1 when no more found.
661  */
662 int bam_aux_iter(bam_seq_t *b, char **iter_handle,
663 		 char *key, char *type, bam_aux_t *val);
664 
665 /* Taken from samtools/bam.h */
666 #define bam_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
667 #define bam_nt16_rev_table "=ACMGRSVTWYHKDBN"
668 
669 /* Output code */
670 
671 /*! Writes a single bam sequence object.
672  *
673  * @param fp The SAM/BAM file handle.
674  * @param b  The bam_seq_t pointer
675  *
676  * @return
677  * Returns 0 on success;
678  *        -1 on failure
679  */
680 int bam_put_seq(bam_file_t *fp, bam_seq_t *b);
681 
682 /*! Constructs a bam_seq_t from separate components.
683  *
684  * Note: ignores auxiliary tags for now. These need to be appended
685  * manually by the calling function.
686  *
687  * @param b         Points to the location of a bam_seq_t *.  If *b is NULL
688  *                  or (*b)->alloc is too small, the bam_seq_t struct will
689  *                  be reallocated.
690  * @param extra_len Extra space to allocate for auxiliary tags
691  * @param qname     Query name
692  * @param qname_len Query name length
693  * @param flag      BAM flags
694  * @param rname     Reference ID
695  * @param pos       Mapped position (N.B. 1-based)
696  * @param end       Last aligned base
697  * @param mapq      Mapping quality
698  * @param ncigar    Number of CIGAR elements
699  * @param cigar     CIGAR alignment information
700  * @param mrnm      Mate reference ID
701  * @param mpos      Mate position (N.B. 1-based)
702  * @param isize     Insert size
703  * @param len       Sequence length
704  * @param seq       Sequence (ASCII format)
705  * @param qual      Quality values (phred scale, 8-bit binary, no offset)
706  *                  Passing in NULL to qual will cause all quality values
707  *                  to be treated as absent (i.e. set to 0xff).
708  *
709  * @return
710  * Returns number of bytes written to bam_seq_t on success (ie tag offset);
711  *         -1 on error.
712  */
713 int bam_construct_seq(bam_seq_t **b, size_t extra_len,
714 		      const char *qname, size_t qname_len,
715 		      int flag,
716 		      int rname,      // Ref ID
717 		      int64_t pos,
718 		      int64_t end, // aligned start/end coords
719 		      int mapq,
720 		      uint32_t ncigar, const uint32_t *cigar,
721 		      int mrnm,       // Mate Ref ID
722 		      int64_t mpos,
723 		      int64_t isize,
724 		      int len,
725 		      const char *seq,
726 		      const char *qual);
727 
728 /*! Duplicates a bam_seq_t structure.
729  *
730  * @return
731  * Returns the new bam_seq_t pointer on success;
732  *         NULL on failure.
733  */
734 bam_seq_t *bam_dup(bam_seq_t *b);
735 
736 /*! Writes a SAM header block.
737  *
738  * @return
739  * Returns 0 for success;
740  *        -1 for failure
741  */
742 int bam_write_header(bam_file_t *out);
743 
744 enum bam_option {
745     BAM_OPT_THREAD_POOL,
746     BAM_OPT_BINNING,
747     BAM_OPT_IGNORE_CHKSUM,
748     BAM_OPT_WITH_BGZIP_IDX,
749     BAM_OPT_OUTPUT_BGZIP_IDX
750 };
751 
752 /*! Sets options on the bam_file_t.
753  *
754  * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
755  * Use this immediately after opening.
756  *
757  * @return
758  * Returns 0 on success;
759  *        -1 on failure
760  */
761 int bam_set_option(bam_file_t *fd, enum bam_option opt, ...);
762 
763 /*! Sets options on the bam_file_t.
764  *
765  * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
766  * Use this immediately after opening.
767  *
768  * @return
769  * Returns 0 on success;
770  *        -1 on failure
771  */
772 int bam_set_voption(bam_file_t *fd, enum bam_option opt, va_list args);
773 
774 
775 
776 /*
777  * Signed and unsigned fast functions to act as equiv to sprintf(cp, "%d", i)
778  */
779 unsigned char *append_int(unsigned char *cp, int32_t i);
780 unsigned char *append_uint(unsigned char *cp, uint32_t i);
781 
782 #ifdef __cplusplus
783 }
784 #endif
785 
786 #endif /* _BAM_H_ */
787