1 /*
2 * Copyright (c) 2013 Genome Research Ltd.
3 * Author(s): James Bonfield, Rob Davies
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following
13 * disclaimer in the documentation and/or other materials provided
14 * with the distribution.
15 *
16 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17 * Institute nor the names of its contributors may be used to endorse
18 * or promote products derived from this software without specific
19 * prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Author: James Bonfield, Wellcome Trust Sanger Institute. 2010-3
36 */
37
38 /*! \file
39 * The primary SAM/BAM API.
40 *
41 * Consider using scram.h if you wish to also have support for CRAM.
42 */
43
44 #ifndef _BAM_H_
45 #define _BAM_H_
46
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50
51 #include <inttypes.h>
52 #include <zlib.h>
53
54 #include "io_lib/os.h"
55 #include "io_lib/hash_table.h"
56 #include "io_lib/sam_header.h"
57 #include "io_lib/thread_pool.h"
58 #include "io_lib/binning.h"
59 #include "io_lib/bgzip.h"
60
61 /* BAM header structs */
62 typedef struct tag_list {
63 char *value; /* NULL => end of tags */
64 int key;
65 int length;
66 } tag_list_t;
67
68 /* The main bam sequence struct */
69 typedef struct bam_seq_s {
70 uint32_t alloc; /* total size of this struct + 'data' onwards */
71 uint32_t blk_size;
72
73 // 64-bit variant of our position, for general use
74 int64_t pos;
75 int64_t mate_pos;
76 int64_t ins_size;
77
78 /* The raw bam block follows, in same order as on the disk */
79 /* This is the analogue of a bam1_core_t in samtools */
80 int32_t ref;
81 int32_t pos_32;
82
83 union {
84 struct {
85 uint32_t name_len:8, map_qual:8, bin:16;
86 };
87 uint32_t bin_packed;
88 };
89 union {
90 struct {
91 uint32_t cigar_len:16, flag:16;
92 };
93 uint32_t flag_packed;
94 };
95
96 int32_t len;
97 int32_t mate_ref;
98 int32_t mate_pos_32;
99 int32_t ins_size_32;
100
101 /* Followed by arbitrary bytes of packed data */
102 unsigned char data; /* unknown size */
103 } bam_seq_t;
104
105
106 /* Auxillary field handling */
107 typedef union {
108 char *s;
109 int i;
110 uint64_t i64;
111 float f;
112 double d;
113 struct {
114 int n, t;
115 unsigned char *s;
116 } B;
117 } bam_aux_t;
118
119 /* Struct for making arrays of aux tags */
120
121 typedef struct {
122 char tag[2];
123 char type;
124 uint32_t array_len;
125 union {
126 char *z;
127 uint8_t *h;
128 char a;
129 int32_t i;
130 uint32_t ui;
131 float f;
132 double d;
133 void *array;
134 } value;
135 } bam_aux_tag_t;
136
137 /*
138 * Our bam stream consists of a zlib gzFile stream and a buffer for it to
139 * output to. This allows us to call small bam_read requests while
140 * translating these to fewer, larger, gzread calls. The overhead is
141 * therefore minimal.
142 */
143 #define Z_BUFF_SIZE 65536 /* Max size of a zlib block */
144 #define BGZF_BUFF_SIZE 65273 // 65535 - MIN_LOOKAHEAD to avoid fill_window()
145 typedef struct {
146 FILE *fp;
147
148 int mode, binary, level;
149 z_stream s;
150
151 unsigned char comp[Z_BUFF_SIZE];
152 unsigned char *comp_p;
153 size_t comp_sz;
154
155 unsigned char uncomp[Z_BUFF_SIZE];
156 unsigned char *uncomp_p;
157 size_t uncomp_sz;
158
159 /* BAM specifics */
160 int32_t next_len;
161
162 SAM_hdr *header; /* Parsed SAM header */
163
164 /* Cached bam_seq_t, to avoid excessive mallocs */
165 bam_seq_t *bs;
166 int bs_size;
167
168 /* Boolean to indicate if we've finished the most recent z stream */
169 int z_finish;
170
171 /* Indicates whether gzipped, and if so with bgzf extra fields */
172 int gzip;
173 int bgzf;
174
175 /* Whether BAM or SAM format */
176 int bam;
177
178 /* If true, skip auxillary field parsing while reading SAM */
179 int no_aux;
180
181 /* line number (when in SAM mode) */
182 int line;
183
184 /* EOF block present in BAM */
185 int eof_block;
186
187 /* Static avoidance: used in sam_next_seq() */
188 unsigned char *sam_str;
189 size_t alloc_l;
190
191 /* Thread pool for encoding */
192 t_pool *pool;
193 t_results_queue *equeue;
194
195 /* Decoding queue */
196 t_results_queue *dqueue;
197 void *job_pending;
198 int eof;
199 int nd_jobs, ne_jobs;
200
201 /* Quality binning */
202 enum quality_binning binning;
203
204 /* Disabling CRC checks */
205 int ignore_chksum;
206
207 /* Used when gzi files are supplied. */
208 gzi *idx;
209 char *idx_fn;
210 uint64_t current_block;
211 unsigned char bgbuf[Z_BUFF_SIZE];
212 unsigned char *bgbuf_p;
213 size_t bgbuf_sz;
214 } bam_file_t;
215
216 /* BAM flags */
217 #define BAM_FPAIRED 1
218 #define BAM_FPROPER_PAIR 2
219 #define BAM_FUNMAP 4
220 #define BAM_FMUNMAP 8
221 #define BAM_FREVERSE 16
222 #define BAM_FMREVERSE 32
223 #define BAM_FREAD1 64
224 #define BAM_FREAD2 128
225 #define BAM_FSECONDARY 256
226 #define BAM_FQCFAIL 512
227 #define BAM_FDUP 1024
228 #define BAM_FSUPPLEMENTARY 2048
229
230 #define BAM_CIGAR32 32768
231
232 /* Decoding the bam_seq_t struct */
233 #define bam_blk_size(b) ((b)->blk_size)
234 #define bam_set_blk_size(b,v) ((b)->blk_size = (v))
235
236 #define bam_ref(b) ((b)->ref)
237 #define bam_pos(b) ((b)->pos)
238 #define bam_mate_ref(b) ((b)->mate_ref)
239 #define bam_mate_pos(b) ((b)->mate_pos)
240 #define bam_ins_size(b) ((b)->ins_size)
241 #define bam_cigar_len(b) (((b)->flag & BAM_CIGAR32 ? ((b)->bin<<16) : 0) + (b)->cigar_len)
242 #define bam_name_len(b) ((b)->name_len)
243 #define bam_map_qual(b) ((b)->map_qual)
244 #define bam_bin(b) ((b)->flag & BAM_CIGAR32 ? 0 : (b)->bin)
245 #define bam_flag(b) ((b)->flag)
246 #define bam_seq_len(b) ((b)->len)
247 #define bam_strand(b) ((bam_flag((b)) & BAM_FREVERSE) != 0)
248 #define bam_mstrand(b) ((bam_flag((b)) & BAM_FMREVERSE) != 0)
249
250 #define bam_set_ref(b,v) ((b)->ref = (v))
251 #define bam_set_pos(b,v) ((b)->pos = (v))
252 #define bam_set_mate_ref(b,v) ((b)->mate_ref = (v))
253 #define bam_set_mate_pos(b,v) ((b)->mate_pos = (v))
254 #define bam_set_ins_size(b,v) ((b)->ins_size = (v))
255 #define bam_set_cigar_len(b, v) (((v)>>16) ? (((b)->flag |= BAM_CIGAR32), (b)->bin = ((v)>>16), (b)->cigar_len = (v)&0xffff) : ((b)->cigar_len = (v)))
256 #define bam_set_name_len(b,v) ((b)->name_len = (v))
257 #define bam_set_map_qual(b,v) ((b)->map_qual = (v))
bam_set_bin(bam_seq_t * b,uint32_t v)258 static inline void bam_set_bin(bam_seq_t *b, uint32_t v) {
259 if (!(b->flag & BAM_CIGAR32)) b->bin = v;
260 }
261 #define bam_set_flag(b,v) ((b)->flag = (v))
262 #define bam_set_seq_len(b,v) ((b)->len = (v))
263
264 #define bam_name(b) ((char *)(&(b)->data))
265 #ifdef ALLOW_UAC
266 #define bam_cigar(b) ((uint32_t *)(bam_name((b)) + bam_name_len((b))))
267 #else
268 #define bam_cigar(b) ((uint32_t *)(bam_name((b)) + round4(bam_name_len((b)))))
269 #endif
270 #define bam_seq(b) (((char *)bam_cigar((b))) + 4*bam_cigar_len(b))
271 #define bam_qual(b) (bam_seq(b) + (int)(((b)->len+1)/2))
272 #define bam_aux(b) (bam_qual(b) + (b)->len)
273
274 /* Rounds up to the next multiple of 4 */
275 #define round4(v) (((v-1)&~3)+4)
276
277 /* CIGAR operations, taken from samtools bam.h */
278 #define BAM_CIGAR_SHIFT 4
279 #define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
280
281 enum cigar_op {
282 BAM_UNKNOWN=-1,
283 BAM_CMATCH=0,
284 BAM_CINS=1,
285 BAM_CDEL=2,
286 BAM_CREF_SKIP=3,
287 BAM_CSOFT_CLIP=4,
288 BAM_CHARD_CLIP=5,
289 BAM_CPAD=6,
290 BAM_CBASE_MATCH=7,
291 BAM_CBASE_MISMATCH=8
292 };
293
294 /*
295 * Whether this cigar op marches along ref, seq or both
296 *
297 * Op Ref Seq
298 * M 1 1
299 * I 0 1
300 * D 1 0
301 * N 1 0
302 * S 0 1
303 * H 0 0
304 * P 0 0
305 * = 1 1
306 * X 1 1
307 */
308 #define BAM_CONSUME_REF(op) ((0x18d>>(op))&1)
309 #define BAM_CONSUME_SEQ(op) ((0x193>>(op))&1)
310
311
312 /* ----------------------------------------------------------------------
313 * Function prototypes
314 * We only support reading, so basically we have open, read, close along
315 * with some utility functions for querying aux records.
316 */
317
318 /*! Opens a SAM or BAM file.
319 *
320 * The mode parameter indicates the file
321 * type (if not auto-detecting) and whether it is for reading or
322 * writing. Use "rb" or "wb" for reading or writing BAM and "r" or
323 * "w" or reading or writing SAM. When writing BAM, the mode may end
324 * with a digit from 0 to 9 to indicate the compression to use with 0
325 * indicating uncompressed data.
326 *
327 * @param fn The filename to open or create.
328 * @param mode The input/output mode, similar to fopen().
329 *
330 * @return
331 * Returns a bam_file_t pointer on success;
332 * NULL on failure.
333 */
334 bam_file_t *bam_open(const char *fn, const char *mode);
335
336 bam_file_t *bam_open_block(const char *blk, size_t blk_size, SAM_hdr *sh);
337
338 /*! Closes a SAM or BAM file.
339 *
340 * @param b The file to close.
341 *
342 * @return
343 * Retrurns 0 on success;
344 * -1 on failure.
345 */
346 int bam_close(bam_file_t *b);
347
348 /*! Deprecated: please use bam_get_seq() instead.
349 */
350 int bam_next_seq(bam_file_t *b, bam_seq_t **bsp);
351
352 /*! Reads the next sequence.
353 *
354 * Fills out the next bam_seq_t struct.
355 * This function will alloc and/or grow the memory accordingly, allowing for
356 * efficient reuse.
357 *
358 * @param bsp Must be non-null, but *bsp may be NULL or an existing
359 * bam_seq_t pointer.
360 *
361 * @return
362 * Returns 1 on success;
363 * 0 on eof;
364 * -1 on error.
365 */
366 int bam_get_seq(bam_file_t *b, bam_seq_t **bsp);
367
368 /*!Looks for aux field 'key' and returns the value.
369 * The type is the first char and the value is the 2nd character onwards.
370 *
371 * @return
372 * Returns the value for key; NULL if not found.
373 */
374 char *bam_aux_find(bam_seq_t *b, const char *key);
375
376 //!Converts an encoded integer value return by bam_aux_find to an integer.
377 //
378 //Analogous to the bam_aux2i functions in samtools.
379 int32_t bam_aux_i(const uint8_t *dat);
380 float bam_aux_f(const uint8_t *s);
381 double bam_aux_d(const uint8_t *s);
382 char bam_aux_A(const uint8_t *s);
383 char *bam_aux_Z(const uint8_t *s);
384
385 //int bam_aux_del(bam_seq_t *b, uint8_t *s); // not implemented yet
386
387 /*! Add auxiliary tags to a bam_seq_t structure.
388 *
389 * Appends a tag onto the end of a bam_seq_t structure. The tag name
390 * is supplied in the 'tag' parameter, and the data type code in 'type'.
391 * Valid type codes are [AcCsSiIfdHZ], as described in the SAM specification.
392 *
393 * The array_len parameter is used for B type (i.e. array) tags. If
394 * array_len is 0, an ordinary non-array tag is added. If it is greater
395 * than zero, a B type tag of the apropriate data type is made. Arrays
396 * of types H and Z are not allowed, so array_len must be zero if these types
397 * are specified.
398 *
399 * data should point to the data to be added. This should be of appropriate
400 * size for the tag type, i.e. (u)int8_t for A, C and c; (u)int16_t for S and s;
401 * (u)int32_t for I and i; a float for f; a double for d; a NUL-terminated
402 * string for H and Z. If array_len is greater than zero, then data should
403 * point to an array of the given type. All data should be in the native
404 * format for the machine - it will be converted to little-endian if necessary
405 * by the function.
406 *
407 * @param b Points to the location of a bam_seq_t *. If (*b)->alloc
408 * is too small, the bam_seq_t struct will be reallocated.
409 * Neither b nor *b should be NULL.
410 * @param tag The tag name (RG, NM, etc.)
411 * @param type The tag data type.
412 * @param array_len Array length for array tags, zero for ordinary ones.
413 * @param data Pointer to the tag value.
414 *
415 * @return
416 * Returns 0 on success;
417 * -1 on error
418 */
419
420 int bam_aux_add(bam_seq_t **b, const char tag[2], char type,
421 uint32_t array_len, const void *data);
422
423 /*! Add multiple auxiliary tags to a bam_seq_t structure
424 *
425 * bam_aux_add_vec adds one or more tags listed in the bam_aux_tag_t array
426 * to a bam_seq_t structure. The bam_aux_tag_t struct has four elements,
427 * the tag name (char[2]), the type code (char), the array length for B tags and
428 * the value which is a union.
429 *
430 * The type code determines both the data type of the tag, and the member
431 * of value used to access the data. If array_len is zero, data types 'H' and
432 * 'Z' are accessed via member value.h or value.z; 'f' via value.f; 'd' via
433 * value.d; 'A' via value.a. Signed integers should have type 'i', and are
434 * accessed through value.i. Unsigned integers should use type 'I' and
435 * value.ui. The actual type used to store the integer will be the smallest
436 * that it will fit in, so signed and unsigned integers that fit in one byte
437 * will be stored as type 'c' or 'C'. Similarly, integers that fit in two
438 * bytes will be stored as type 's' or 'S' for unsigned.
439 *
440 * If array_len is non-zero, a B-type (array) tag is stored. In this case
441 * value.array should point to the data to be stored. The type of array
442 * is interpreted according to the requested tag type, i.e. char for 'A';
443 * int8_t for 'c'; uint8_t for 'C'; int16_t for 's'; uint16_t for 'S';
444 * int32_t for 'i'; uint32_t for 'I'; float for 'f'; double for 'd'. No
445 * attempt is made to adjust the size of integers when storing arrays. All
446 * data should be in the native format for the machine - it will be converted
447 * to little-endian if necessary by the function.
448 *
449 * @param b Points to the location of a bam_seq_t *. If (*b)->alloc
450 * is too small, the bam_seq_t struct will be reallocated.
451 * Neither b nor *b should be NULL.
452 * @param count The number of elements in the tags array
453 * @param tags Array of bam_aux_tag_t structs, listing the tags to add.
454 *
455 * @return
456 * Returns 0 on success;
457 * -1 on error
458 */
459
460 int bam_aux_add_vec(bam_seq_t **b, uint32_t count, bam_aux_tag_t *tags);
461
462 /*! Calculate the amount of space needed to store auxiliary tags
463 *
464 * The tags array should be filled out as described in bam_aux_add_vec.
465 * This function iterates through the list of items in the tags array
466 * to work out how much space will be needed to store them. This value
467 * can be passed to bam_construct_seq in the extra_len parameter to
468 * ensure that enough space is allocated to store the tags.
469 *
470 * @param count The number of elements in the tags array
471 * @param tags Array of bam_aux_tag_t structs, listing the tags to add.
472 *
473 * @return
474 * Returns the total space required for the tags on success;
475 * -1 on failure.
476 */
477
478 ssize_t bam_aux_size_vec(uint32_t count, bam_aux_tag_t *tags);
479
480 /*! Helper for filling in the bam_aux_tag_t struct
481 * @param tag Pointer to bam_aux_tag_t struct
482 * @param name Tag name
483 * @param val Tag value
484 */
bam_aux_tag_char(bam_aux_tag_t * tag,char * name,char val)485 static inline void bam_aux_tag_char(bam_aux_tag_t *tag, char *name, char val) {
486 tag->tag[0] = name[0];
487 tag->tag[1] = name[1];
488 tag->type = 'A';
489 tag->array_len = 0;
490 tag->value.a = val;
491 }
492 /*! Helper for filling in the bam_aux_tag_t struct
493 * @param tag Pointer to bam_aux_tag_t struct
494 * @param name Tag name
495 * @param val Tag value
496 */
bam_aux_tag_int(bam_aux_tag_t * tag,char * name,int32_t val)497 static inline void bam_aux_tag_int(bam_aux_tag_t *tag,
498 char *name, int32_t val) {
499 tag->tag[0] = name[0];
500 tag->tag[1] = name[1];
501 tag->type = 'i';
502 tag->array_len = 0;
503 tag->value.i = val;
504 }
505 /*! Helper for filling in the bam_aux_tag_t struct
506 * @param tag Pointer to bam_aux_tag_t struct
507 * @param name Tag name
508 * @param val Tag value
509 */
bam_aux_tag_uint(bam_aux_tag_t * tag,char * name,uint32_t val)510 static inline void bam_aux_tag_uint(bam_aux_tag_t *tag,
511 char *name, uint32_t val) {
512 tag->tag[0] = name[0];
513 tag->tag[1] = name[1];
514 tag->type = 'I';
515 tag->array_len = 0;
516 tag->value.ui = val;
517 }
518 /*! Helper for filling in the bam_aux_tag_t struct
519 * @param tag Pointer to bam_aux_tag_t struct
520 * @param name Tag name
521 * @param val Tag value
522 */
bam_aux_tag_float(bam_aux_tag_t * tag,char * name,float val)523 static inline void bam_aux_tag_float(bam_aux_tag_t *tag,
524 char *name, float val) {
525 tag->tag[0] = name[0];
526 tag->tag[1] = name[1];
527 tag->type = 'f';
528 tag->array_len = 0;
529 tag->value.f = val;
530 }
531 /*! Helper for filling in the bam_aux_tag_t struct
532 * @param tag Pointer to bam_aux_tag_t struct
533 * @param name Tag name
534 * @param val Tag value
535 */
bam_aux_tag_double(bam_aux_tag_t * tag,char * name,double val)536 static inline void bam_aux_tag_double(bam_aux_tag_t *tag,
537 char *name, double val) {
538 tag->tag[0] = name[0];
539 tag->tag[1] = name[1];
540 tag->type = 'd';
541 tag->array_len = 0;
542 tag->value.d = val;
543 }
544 /*! Helper for filling in the bam_aux_tag_t struct
545 * @param tag Pointer to bam_aux_tag_t struct
546 * @param name Tag name
547 * @param val Tag value
548 */
bam_aux_tag_string(bam_aux_tag_t * tag,char * name,char * str)549 static inline void bam_aux_tag_string(bam_aux_tag_t *tag,
550 char *name, char *str) {
551 tag->tag[0] = name[0];
552 tag->tag[1] = name[1];
553 tag->type = 'Z';
554 tag->array_len = 0;
555 tag->value.z = str;
556 }
557 /*! Helper for filling in the bam_aux_tag_t struct
558 * @param tag Pointer to bam_aux_tag_t struct
559 * @param name Tag name
560 * @param val Tag value
561 */
bam_aux_tag_hexstring(bam_aux_tag_t * tag,char * name,uint8_t * str)562 static inline void bam_aux_tag_hexstring(bam_aux_tag_t *tag,
563 char *name, uint8_t *str) {
564 tag->tag[0] = name[0];
565 tag->tag[1] = name[1];
566 tag->type = 'H';
567 tag->array_len = 0;
568 tag->value.h = str;
569 }
570 /*! Helper for filling in the bam_aux_tag_t struct
571 * @param tag Pointer to bam_aux_tag_t struct
572 * @param name Tag name
573 * @param val Tag value
574 */
bam_aux_tag_array(bam_aux_tag_t * tag,char * name,char type,uint32_t array_len,void * data)575 static inline void bam_aux_tag_array(bam_aux_tag_t *tag, char *name, char type,
576 uint32_t array_len, void *data) {
577 tag->tag[0] = name[0];
578 tag->tag[1] = name[1];
579 tag->type = type;
580 tag->array_len = array_len;
581 tag->value.array = data;
582 }
583
584 /*! Add SAM formatted aux tags to a bam_seq_t.
585 *
586 * Appends one or more SAM-format tags onto the end of a bam_seq_t structure.
587 * If multiple tags are present, they should be separated by tabs, as in
588 * a SAM file.
589 *
590 * @param bsp Points to the location of a bam_seq_t *. If (*bsp)->alloc
591 * is too small, the bam_seq_t struct will be reallocated.
592 * Neither bsp nor *bsp should be NULL.
593 * @param sam SAM-foratted tag string.
594 *
595 * @return
596 * Returns 0 on success;
597 * -1 on error
598 */
599
600 int bam_aux_add_from_sam(bam_seq_t **bsp, char *sam);
601
602 /*! Add preformated raw aux data to the bam_seq.
603 *
604 * This interface is similar to the samtools bam_aux_append function.
605 * It creates a tag with the given name and type, and then appends the
606 * supplied data to it as the value. It is up to the caller to ensure that
607 * the data has been formatted correctly as given in the SAM specification.
608 * This function makes no checks on the data, but simply copies it.
609 *
610 * Consider using bam_aux_add instead if you have information in a more
611 * integer or string form.
612 *
613 * @param b Points to the location of a bam_seq_t *. If (*b)->alloc
614 * is too small, the bam_seq_t struct will be reallocated.
615 * Neither b nor *b should be NULL.
616 * @param tag The tag name (RG, NM, etc.)
617 * @param type The tag data type.
618 * @param len The number of bytes of data present.
619 * @param data Pre-formatted data.
620 *
621 * @return
622 * Returns 0 on success;
623 * -1 on failure
624 */
625 int bam_aux_add_data(bam_seq_t **b, const char tag[2],
626 char type, size_t len, const uint8_t *data);
627
628 /*! Add raw data to a bam structure.
629 *
630 * This could be useful if you wish to manually construct your own bam
631 * entries or if you need to append an entire block of preformatting
632 * aux data.
633 *
634 * @param b Points to the location of a bam_seq_t *. If (*b)->alloc
635 * is too small, the bam_seq_t struct will be reallocated.
636 * Neither b nor *b should be NULL.
637 * @param len The number of bytes of data present.
638 * @param data Pre-formatted data.
639 *
640 * @retrun
641 * Returns 0 on success;
642 * -1 on failure
643 */
644 int bam_add_raw(bam_seq_t **b, size_t len, const uint8_t *data);
645
646 /*! An iterator on bam_aux_t fields.
647 *
648 * NB: This code is not reentrant or multi-thread capable. The values
649 * returned are valid until the next call to this function.
650 *
651 * @param key points to an array of 2 characters (eg "RG", "NM")
652 * @param type points to an address of 1 character (eg 'Z', 'i')
653 * @paran val points to an address of a bam_aux_t union.
654 * @param iter_handle NULL to initialise the search, and then the
655 * returned (modified) iter_handle on each subsequent call to continue
656 * the iteration.
657 *
658 * @return
659 * Returns 0 if the next value is valid, setting key, type and val;
660 * -1 when no more found.
661 */
662 int bam_aux_iter(bam_seq_t *b, char **iter_handle,
663 char *key, char *type, bam_aux_t *val);
664
665 /* Taken from samtools/bam.h */
666 #define bam_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
667 #define bam_nt16_rev_table "=ACMGRSVTWYHKDBN"
668
669 /* Output code */
670
671 /*! Writes a single bam sequence object.
672 *
673 * @param fp The SAM/BAM file handle.
674 * @param b The bam_seq_t pointer
675 *
676 * @return
677 * Returns 0 on success;
678 * -1 on failure
679 */
680 int bam_put_seq(bam_file_t *fp, bam_seq_t *b);
681
682 /*! Constructs a bam_seq_t from separate components.
683 *
684 * Note: ignores auxiliary tags for now. These need to be appended
685 * manually by the calling function.
686 *
687 * @param b Points to the location of a bam_seq_t *. If *b is NULL
688 * or (*b)->alloc is too small, the bam_seq_t struct will
689 * be reallocated.
690 * @param extra_len Extra space to allocate for auxiliary tags
691 * @param qname Query name
692 * @param qname_len Query name length
693 * @param flag BAM flags
694 * @param rname Reference ID
695 * @param pos Mapped position (N.B. 1-based)
696 * @param end Last aligned base
697 * @param mapq Mapping quality
698 * @param ncigar Number of CIGAR elements
699 * @param cigar CIGAR alignment information
700 * @param mrnm Mate reference ID
701 * @param mpos Mate position (N.B. 1-based)
702 * @param isize Insert size
703 * @param len Sequence length
704 * @param seq Sequence (ASCII format)
705 * @param qual Quality values (phred scale, 8-bit binary, no offset)
706 * Passing in NULL to qual will cause all quality values
707 * to be treated as absent (i.e. set to 0xff).
708 *
709 * @return
710 * Returns number of bytes written to bam_seq_t on success (ie tag offset);
711 * -1 on error.
712 */
713 int bam_construct_seq(bam_seq_t **b, size_t extra_len,
714 const char *qname, size_t qname_len,
715 int flag,
716 int rname, // Ref ID
717 int64_t pos,
718 int64_t end, // aligned start/end coords
719 int mapq,
720 uint32_t ncigar, const uint32_t *cigar,
721 int mrnm, // Mate Ref ID
722 int64_t mpos,
723 int64_t isize,
724 int len,
725 const char *seq,
726 const char *qual);
727
728 /*! Duplicates a bam_seq_t structure.
729 *
730 * @return
731 * Returns the new bam_seq_t pointer on success;
732 * NULL on failure.
733 */
734 bam_seq_t *bam_dup(bam_seq_t *b);
735
736 /*! Writes a SAM header block.
737 *
738 * @return
739 * Returns 0 for success;
740 * -1 for failure
741 */
742 int bam_write_header(bam_file_t *out);
743
744 enum bam_option {
745 BAM_OPT_THREAD_POOL,
746 BAM_OPT_BINNING,
747 BAM_OPT_IGNORE_CHKSUM,
748 BAM_OPT_WITH_BGZIP_IDX,
749 BAM_OPT_OUTPUT_BGZIP_IDX
750 };
751
752 /*! Sets options on the bam_file_t.
753 *
754 * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
755 * Use this immediately after opening.
756 *
757 * @return
758 * Returns 0 on success;
759 * -1 on failure
760 */
761 int bam_set_option(bam_file_t *fd, enum bam_option opt, ...);
762
763 /*! Sets options on the bam_file_t.
764 *
765 * Sets options on the bam_file_t. See BAM_OPT_* definitions in bam.h.
766 * Use this immediately after opening.
767 *
768 * @return
769 * Returns 0 on success;
770 * -1 on failure
771 */
772 int bam_set_voption(bam_file_t *fd, enum bam_option opt, va_list args);
773
774
775
776 /*
777 * Signed and unsigned fast functions to act as equiv to sprintf(cp, "%d", i)
778 */
779 unsigned char *append_int(unsigned char *cp, int32_t i);
780 unsigned char *append_uint(unsigned char *cp, uint32_t i);
781
782 #ifdef __cplusplus
783 }
784 #endif
785
786 #endif /* _BAM_H_ */
787