1 /// @file htslib/sam.h
2 /// High-level SAM/BAM/CRAM sequence file operations.
3 /*
4 Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd.
5 Copyright (C) 2010, 2012, 2013 Broad Institute.
6
7 Author: Heng Li <lh3@sanger.ac.uk>
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
26
27 #ifndef HTSLIB_SAM_H
28 #define HTSLIB_SAM_H
29
30 #include <errno.h>
31 #include <stdint.h>
32 #include <sys/types.h>
33 #include "hts.h"
34 #include "hts_endian.h"
35
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39
40 /// Highest SAM format version supported by this library
41 #define SAM_FORMAT_VERSION "1.6"
42
43 /***************************
44 *** SAM/BAM/CRAM header ***
45 ***************************/
46
47 /*! @typedef
48 * @abstract Header extension structure, grouping a collection
49 * of hash tables that contain the parsed header data.
50 */
51
52 typedef struct sam_hrecs_t sam_hrecs_t;
53
54 /*! @typedef
55 @abstract Structure for the alignment header.
56 @field n_targets number of reference sequences
57 @field l_text length of the plain text in the header (may be zero if
58 the header has been edited)
59 @field target_len lengths of the reference sequences
60 @field target_name names of the reference sequences
61 @field text plain text (may be NULL if the header has been edited)
62 @field sdict header dictionary
63 @field hrecs pointer to the extended header struct (internal use only)
64 @field ref_count reference count
65
66 @note The text and l_text fields are included for backwards compatibility.
67 These fields may be set to NULL and zero respectively as a side-effect
68 of calling some header API functions. New code that needs to access the
69 header text should use the sam_hdr_str() and sam_hdr_length() functions
70 instead of these fields.
71 */
72
73 typedef struct sam_hdr_t {
74 int32_t n_targets, ignore_sam_err;
75 size_t l_text;
76 uint32_t *target_len;
77 const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead");
78 char **target_name;
79 char *text;
80 void *sdict;
81 sam_hrecs_t *hrecs;
82 uint32_t ref_count;
83 } sam_hdr_t;
84
85 /*! @typedef
86 * @abstract Old name for compatibility with existing code.
87 */
88 typedef sam_hdr_t bam_hdr_t;
89
90 /****************************
91 *** CIGAR related macros ***
92 ****************************/
93
94 #define BAM_CMATCH 0
95 #define BAM_CINS 1
96 #define BAM_CDEL 2
97 #define BAM_CREF_SKIP 3
98 #define BAM_CSOFT_CLIP 4
99 #define BAM_CHARD_CLIP 5
100 #define BAM_CPAD 6
101 #define BAM_CEQUAL 7
102 #define BAM_CDIFF 8
103 #define BAM_CBACK 9
104
105 #define BAM_CIGAR_STR "MIDNSHP=XB"
106 #define BAM_CIGAR_SHIFT 4
107 #define BAM_CIGAR_MASK 0xf
108 #define BAM_CIGAR_TYPE 0x3C1A7
109
110 /*! @abstract Table for converting a CIGAR operator character to BAM_CMATCH etc.
111 Result is operator code or -1. Be sure to cast the index if it is a plain char:
112 int op = bam_cigar_table[(unsigned char) ch];
113 */
114 extern const int8_t bam_cigar_table[256];
115
116 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
117 #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
118 // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that
119 // the array look-up will not fall off the end. '?' is chosen as the
120 // padding character so it's easy to spot if one is emitted, and will
121 // result in a parsing failure (in sam_parse1(), at least) if read.
122 #define bam_cigar_opchr(c) (BAM_CIGAR_STR "??????" [bam_cigar_op(c)])
123 #define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
124
125 /* bam_cigar_type returns a bit flag with:
126 * bit 1 set if the cigar operation consumes the query
127 * bit 2 set if the cigar operation consumes the reference
128 *
129 * For reference, the unobfuscated truth table for this function is:
130 * BAM_CIGAR_TYPE QUERY REFERENCE
131 * --------------------------------
132 * BAM_CMATCH 1 1
133 * BAM_CINS 1 0
134 * BAM_CDEL 0 1
135 * BAM_CREF_SKIP 0 1
136 * BAM_CSOFT_CLIP 1 0
137 * BAM_CHARD_CLIP 0 0
138 * BAM_CPAD 0 0
139 * BAM_CEQUAL 1 1
140 * BAM_CDIFF 1 1
141 * BAM_CBACK 0 0
142 * --------------------------------
143 */
144 #define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
145
146 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
147 #define BAM_FPAIRED 1
148 /*! @abstract the read is mapped in a proper pair */
149 #define BAM_FPROPER_PAIR 2
150 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
151 #define BAM_FUNMAP 4
152 /*! @abstract the mate is unmapped */
153 #define BAM_FMUNMAP 8
154 /*! @abstract the read is mapped to the reverse strand */
155 #define BAM_FREVERSE 16
156 /*! @abstract the mate is mapped to the reverse strand */
157 #define BAM_FMREVERSE 32
158 /*! @abstract this is read1 */
159 #define BAM_FREAD1 64
160 /*! @abstract this is read2 */
161 #define BAM_FREAD2 128
162 /*! @abstract not primary alignment */
163 #define BAM_FSECONDARY 256
164 /*! @abstract QC failure */
165 #define BAM_FQCFAIL 512
166 /*! @abstract optical or PCR duplicate */
167 #define BAM_FDUP 1024
168 /*! @abstract supplementary alignment */
169 #define BAM_FSUPPLEMENTARY 2048
170
171 /*************************
172 *** Alignment records ***
173 *************************/
174
175 /*
176 * Assumptions made here. While pos can be 64-bit, no sequence
177 * itself is that long, but due to ref skip CIGAR fields it
178 * may span more than that. (CIGAR itself is 28-bit len + 4 bit
179 * type, but in theory we can combine multiples together.)
180 *
181 * Mate position and insert size also need to be 64-bit, but
182 * we won't accept more than 32-bit for tid.
183 *
184 * The bam_core_t structure is the *in memory* layout and not
185 * the same as the on-disk format. 64-bit changes here permit
186 * SAM to work with very long chromosomes and permit BAM and CRAM
187 * to seamlessly update in the future without further API/ABI
188 * revisions.
189 */
190
191 /*! @typedef
192 @abstract Structure for core alignment information.
193 @field pos 0-based leftmost coordinate
194 @field tid chromosome ID, defined by sam_hdr_t
195 @field bin bin calculated by bam_reg2bin()
196 @field qual mapping quality
197 @field l_extranul length of extra NULs between qname & cigar (for alignment)
198 @field flag bitwise flag
199 @field l_qname length of the query name
200 @field n_cigar number of CIGAR operations
201 @field l_qseq length of the query sequence (read)
202 @field mtid chromosome ID of next read in template, defined by sam_hdr_t
203 @field mpos 0-based leftmost coordinate of next read in template
204 @field isize observed template length ("insert size")
205 */
206 typedef struct bam1_core_t {
207 hts_pos_t pos;
208 int32_t tid;
209 uint16_t bin; // NB: invalid on 64-bit pos
210 uint8_t qual;
211 uint8_t l_extranul;
212 uint16_t flag;
213 uint16_t l_qname;
214 uint32_t n_cigar;
215 int32_t l_qseq;
216 int32_t mtid;
217 hts_pos_t mpos;
218 hts_pos_t isize;
219 } bam1_core_t;
220
221 /*! @typedef
222 @abstract Structure for one alignment.
223 @field core core information about the alignment
224 @field id
225 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
226 @field l_data current length of bam1_t::data
227 @field m_data maximum length of bam1_t::data
228 @field mempolicy memory handling policy, see bam_set_mempolicy()
229
230 @discussion Notes:
231
232 1. The data blob should be accessed using bam_get_qname, bam_get_cigar,
233 bam_get_seq, bam_get_qual and bam_get_aux macros. These returns pointers
234 to the start of each type of data.
235 2. qname is terminated by one to four NULs, so that the following
236 cigar data is 32-bit aligned; core.l_qname includes these trailing NULs,
237 while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3).
238 3. Cigar data is encoded 4 bytes per CIGAR operation.
239 See the bam_cigar_* macros for manipulation.
240 4. seq is nibble-encoded according to bam_nt16_table.
241 See the bam_seqi macro for retrieving individual bases.
242 5. Per base qualities are stored in the Phred scale with no +33 offset.
243 Ie as per the BAM specification and not the SAM ASCII printable method.
244 */
245 typedef struct bam1_t {
246 bam1_core_t core;
247 uint64_t id;
248 uint8_t *data;
249 int l_data;
250 uint32_t m_data;
251 uint32_t mempolicy:2, :30 /* Reserved */;
252 } bam1_t;
253
254 /*! @function
255 @abstract Get whether the query is on the reverse strand
256 @param b pointer to an alignment
257 @return boolean true if query is on the reverse strand
258 */
259 #define bam_is_rev(b) (((b)->core.flag&BAM_FREVERSE) != 0)
260 /*! @function
261 @abstract Get whether the query's mate is on the reverse strand
262 @param b pointer to an alignment
263 @return boolean true if query's mate on the reverse strand
264 */
265 #define bam_is_mrev(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
266 /*! @function
267 @abstract Get the name of the query
268 @param b pointer to an alignment
269 @return pointer to the name string, null terminated
270 */
271 #define bam_get_qname(b) ((char*)(b)->data)
272 /*! @function
273 @abstract Get the CIGAR array
274 @param b pointer to an alignment
275 @return pointer to the CIGAR array
276
277 @discussion In the CIGAR array, each element is a 32-bit integer. The
278 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
279 length of a CIGAR.
280 */
281 #define bam_get_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
282 /*! @function
283 @abstract Get query sequence
284 @param b pointer to an alignment
285 @return pointer to sequence
286
287 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
288 8 for T and 15 for N. Two bases are packed in one byte with the base
289 at the higher 4 bits having smaller coordinate on the read. It is
290 recommended to use bam_seqi() macro to get the base.
291 */
292 #define bam_get_seq(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname)
293 /*! @function
294 @abstract Get query quality
295 @param b pointer to an alignment
296 @return pointer to quality string
297 */
298 #define bam_get_qual(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
299 /*! @function
300 @abstract Get auxiliary data
301 @param b pointer to an alignment
302 @return pointer to the concatenated auxiliary data
303 */
304 #define bam_get_aux(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq)
305 /*! @function
306 @abstract Get length of auxiliary data
307 @param b pointer to an alignment
308 @return length of the concatenated auxiliary data
309 */
310 #define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1))
311 /*! @function
312 @abstract Get a base on read
313 @param s Query sequence returned by bam_get_seq()
314 @param i The i-th position, 0-based
315 @return 4-bit integer representing the base.
316 */
317 #define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
318 /*!
319 @abstract Modifies a single base in the bam structure.
320 @param s Query sequence returned by bam_get_seq()
321 @param i The i-th position, 0-based
322 @param b Base in nt16 nomenclature (see seq_nt16_table)
323 */
324 #define bam_set_seqi(s,i,b) ((s)[(i)>>1] = ((s)[(i)>>1] & (0xf0 >> ((~(i)&1)<<2))) | ((b)<<((~(i)&1)<<2)))
325
326 /**************************
327 *** Exported functions ***
328 **************************/
329
330 /***************
331 *** BAM I/O ***
332 ***************/
333
334 /* Header */
335
336 /// Generates a new unpopulated header structure.
337 /*!
338 *
339 * @return A valid pointer to new header on success, NULL on failure
340 *
341 * The sam_hdr_t struct returned by a successful call should be freed
342 * via sam_hdr_destroy() when it is no longer needed.
343 */
344 HTSLIB_EXPORT
345 sam_hdr_t *sam_hdr_init(void);
346
347 /// Read the header from a BAM compressed file.
348 /*!
349 * @param fp File pointer
350 * @return A valid pointer to new header on success, NULL on failure
351 *
352 * This function only works with BAM files. It is usually better to use
353 * sam_hdr_read(), which works on SAM, BAM and CRAM files.
354 *
355 * The sam_hdr_t struct returned by a successful call should be freed
356 * via sam_hdr_destroy() when it is no longer needed.
357 */
358 HTSLIB_EXPORT
359 sam_hdr_t *bam_hdr_read(BGZF *fp);
360
361 /// Writes the header to a BAM file.
362 /*!
363 * @param fp File pointer
364 * @param h Header pointer
365 * @return 0 on success, -1 on failure
366 *
367 * This function only works with BAM files. Use sam_hdr_write() to
368 * write in any of the SAM, BAM or CRAM formats.
369 */
370 HTSLIB_EXPORT
371 int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) HTS_RESULT_USED;
372
373 /*!
374 * Frees the resources associated with a header.
375 */
376 HTSLIB_EXPORT
377 void sam_hdr_destroy(sam_hdr_t *h);
378
379 /// Duplicate a header structure.
380 /*!
381 * @return A valid pointer to new header on success, NULL on failure
382 *
383 * The sam_hdr_t struct returned by a successful call should be freed
384 * via sam_hdr_destroy() when it is no longer needed.
385 */
386 HTSLIB_EXPORT
387 sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0);
388
389 /*!
390 * @abstract Old names for compatibility with existing code.
391 */
bam_hdr_init(void)392 static inline sam_hdr_t *bam_hdr_init(void) { return sam_hdr_init(); }
bam_hdr_destroy(sam_hdr_t * h)393 static inline void bam_hdr_destroy(sam_hdr_t *h) { sam_hdr_destroy(h); }
bam_hdr_dup(const sam_hdr_t * h0)394 static inline sam_hdr_t *bam_hdr_dup(const sam_hdr_t *h0) { return sam_hdr_dup(h0); }
395
396 typedef htsFile samFile;
397
398 /// Create a header from existing text.
399 /*!
400 * @param l_text Length of text
401 * @param text Header text
402 * @return A populated sam_hdr_t structure on success; NULL on failure.
403 * @note The text field of the returned header will be NULL, and the l_text
404 * field will be zero.
405 *
406 * The sam_hdr_t struct returned by a successful call should be freed
407 * via sam_hdr_destroy() when it is no longer needed.
408 */
409 HTSLIB_EXPORT
410 sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text);
411
412 /// Read a header from a SAM, BAM or CRAM file.
413 /*!
414 * @param fp Pointer to a SAM, BAM or CRAM file handle
415 * @return A populated sam_hdr_t struct on success; NULL on failure.
416 *
417 * The sam_hdr_t struct returned by a successful call should be freed
418 * via sam_hdr_destroy() when it is no longer needed.
419 */
420 HTSLIB_EXPORT
421 sam_hdr_t *sam_hdr_read(samFile *fp);
422
423 /// Write a header to a SAM, BAM or CRAM file.
424 /*!
425 * @param fp SAM, BAM or CRAM file header
426 * @param h Header structure to write
427 * @return 0 on success; -1 on failure
428 */
429 HTSLIB_EXPORT
430 int sam_hdr_write(samFile *fp, const sam_hdr_t *h) HTS_RESULT_USED;
431
432 /// Returns the current length of the header text.
433 /*!
434 * @return >= 0 on success, SIZE_MAX on failure
435 */
436 HTSLIB_EXPORT
437 size_t sam_hdr_length(sam_hdr_t *h);
438
439 /// Returns the text representation of the header.
440 /*!
441 * @return valid char pointer on success, NULL on failure
442 *
443 * The returned string is part of the header structure. It will remain
444 * valid until a call to a header API function causes the string to be
445 * invalidated, or the header is destroyed.
446 *
447 * The caller should not attempt to free or realloc this pointer.
448 */
449 HTSLIB_EXPORT
450 const char *sam_hdr_str(sam_hdr_t *h);
451
452 /// Returns the number of references in the header.
453 /*!
454 * @return >= 0 on success, -1 on failure
455 */
456 HTSLIB_EXPORT
457 int sam_hdr_nref(const sam_hdr_t *h);
458
459 /* ==== Line level methods ==== */
460
461 /// Add formatted lines to an existing header.
462 /*!
463 * @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
464 * optional new-line. If it contains more than 1 line then
465 * multiple lines will be added in order
466 * @param len The maximum length of lines (if an early NUL is not
467 * encountered). len may be 0 if unknown, in which case
468 * lines must be NUL-terminated
469 * @return 0 on success, -1 on failure
470 *
471 * The lines will be appended to the end of the existing header
472 * (apart from HD, which always comes first).
473 */
474 HTSLIB_EXPORT
475 int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len);
476
477 /// Adds a single line to an existing header.
478 /*!
479 * Specify type and one or more key,value pairs, ending with the NULL key.
480 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
481 *
482 * @param type Type of the added line. Eg. "SQ"
483 * @return 0 on success, -1 on failure
484 *
485 * The new line will be added immediately after any others of the same
486 * type, or at the end of the existing header if no lines of the
487 * given type currently exist. The exception is HD lines, which always
488 * come first. If an HD line already exists, it will be replaced.
489 */
490 HTSLIB_EXPORT
491 int sam_hdr_add_line(sam_hdr_t *h, const char *type, ...);
492
493 /// Returns a complete line of formatted text for a given type and ID.
494 /*!
495 * @param type Type of the searched line. Eg. "SQ"
496 * @param ID_key Tag key defining the line. Eg. "SN"
497 * @param ID_value Tag value associated with the key above. Eg. "ref1"
498 * @param ks kstring to hold the result
499 * @return 0 on success;
500 * -1 if no matching line is found
501 * -2 on other failures
502 *
503 * Puts a complete line of formatted text for a specific header type/ID
504 * combination into @p ks. If ID_key is NULL then it returns the first line of
505 * the specified type.
506 *
507 * Any existing content in @p ks will be overwritten.
508 */
509 HTSLIB_EXPORT
510 int sam_hdr_find_line_id(sam_hdr_t *h, const char *type,
511 const char *ID_key, const char *ID_val, kstring_t *ks);
512
513 /// Returns a complete line of formatted text for a given type and index.
514 /*!
515 * @param type Type of the searched line. Eg. "SQ"
516 * @param position Index in lines of this type (zero-based)
517 * @param ks kstring to hold the result
518 * @return 0 on success;
519 * -1 if no matching line is found
520 * -2 on other failures
521 *
522 * Puts a complete line of formatted text for a specific line into @p ks.
523 * The header line is selected using the @p type and @p position parameters.
524 *
525 * Any existing content in @p ks will be overwritten.
526 */
527 HTSLIB_EXPORT
528 int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type,
529 int pos, kstring_t *ks);
530
531 /// Remove a line with given type / id from a header
532 /*!
533 * @param type Type of the searched line. Eg. "SQ"
534 * @param ID_key Tag key defining the line. Eg. "SN"
535 * @param ID_value Tag value associated with the key above. Eg. "ref1"
536 * @return 0 on success, -1 on error
537 *
538 * Remove a line from the header by specifying a tag:value that uniquely
539 * identifies the line, i.e. the @SQ line containing "SN:ref1".
540 *
541 * \@SQ line is uniquely identified by the SN tag.
542 * \@RG line is uniquely identified by the ID tag.
543 * \@PG line is uniquely identified by the ID tag.
544 * Eg. sam_hdr_remove_line_id(h, "SQ", "SN", "ref1")
545 *
546 * If no key:value pair is specified, the type MUST be followed by a NULL argument and
547 * the first line of the type will be removed, if any.
548 * Eg. sam_hdr_remove_line_id(h, "SQ", NULL, NULL)
549 *
550 * @note Removing \@PG lines is currently unsupported.
551 */
552 HTSLIB_EXPORT
553 int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value);
554
555 /// Remove nth line of a given type from a header
556 /*!
557 * @param type Type of the searched line. Eg. "SQ"
558 * @param position Index in lines of this type (zero-based). E.g. 3
559 * @return 0 on success, -1 on error
560 *
561 * Remove a line from the header by specifying the position in the type
562 * group, i.e. 3rd @SQ line.
563 */
564 HTSLIB_EXPORT
565 int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position);
566
567 /// Add or update tag key,value pairs in a header line.
568 /*!
569 * @param type Type of the searched line. Eg. "SQ"
570 * @param ID_key Tag key defining the line. Eg. "SN"
571 * @param ID_value Tag value associated with the key above. Eg. "ref1"
572 * @return 0 on success, -1 on error
573 *
574 * Adds or updates tag key,value pairs in a header line.
575 * Eg. for adding M5 tags to @SQ lines or updating sort order for the
576 * @HD line.
577 *
578 * Specify multiple key,value pairs ending in NULL. Eg.
579 * sam_hdr_update_line(h, "RG", "ID", "rg1", "DS", "description", "PG", "samtools", NULL)
580 *
581 * Attempting to update the record name (i.e. @SQ SN or @RG ID) will
582 * work as long as the new name is not already in use, however doing this
583 * on a file opened for reading may produce unexpected results.
584 *
585 * Renaming an @RG record in this way will only change the header. Alignment
586 * records written later will not be updated automatically even if they
587 * reference the old read group name.
588 *
589 * Attempting to change an @PG ID tag is not permitted.
590 */
591 HTSLIB_EXPORT
592 int sam_hdr_update_line(sam_hdr_t *h, const char *type,
593 const char *ID_key, const char *ID_value, ...);
594
595 /// Remove all lines of a given type from a header, except the one matching an ID
596 /*!
597 * @param type Type of the searched line. Eg. "SQ"
598 * @param ID_key Tag key defining the line. Eg. "SN"
599 * @param ID_value Tag value associated with the key above. Eg. "ref1"
600 * @return 0 on success, -1 on failure
601 *
602 * Remove all lines of type <type> from the header, except the one
603 * specified by tag:value, i.e. the @SQ line containing "SN:ref1".
604 *
605 * If no line matches the key:value ID, all lines of the given type are removed.
606 * To remove all lines of a given type, use NULL for both ID_key and ID_value.
607 */
608 HTSLIB_EXPORT
609 int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value);
610
611 /// Remove header lines of a given type, except those in a given ID set
612 /*!
613 * @param type Type of the searched line. Eg. "RG"
614 * @param id Tag key defining the line. Eg. "ID"
615 * @param rh Hash set initialised by the caller with the values to be kept.
616 * See description for how to create this. If @p rh is NULL, all
617 * lines of this type will be removed.
618 * @return 0 on success, -1 on failure
619 *
620 * Remove all lines of type @p type from the header, except the ones
621 * specified in the hash set @p rh. If @p rh is NULL, all lines of
622 * this type will be removed.
623 * Declaration of @p rh is done using KHASH_SET_INIT_STR macro. Eg.
624 * @code{.c}
625 * #include "htslib/khash.h"
626 * KHASH_SET_INIT_STR(keep)
627 * typedef khash_t(keep) *keephash_t;
628 *
629 * void your_method() {
630 * samFile *sf = sam_open("alignment.bam", "r");
631 * sam_hdr_t *h = sam_hdr_read(sf);
632 * keephash_t rh = kh_init(keep);
633 * int ret = 0;
634 * kh_put(keep, rh, strdup("chr2"), &ret);
635 * kh_put(keep, rh, strdup("chr3"), &ret);
636 * if (sam_hdr_remove_lines(h, "SQ", "SN", rh) == -1)
637 * fprintf(stderr, "Error removing lines\n");
638 * khint_t k;
639 * for (k = 0; k < kh_end(rh); ++k)
640 * if (kh_exist(rh, k)) free((char*)kh_key(rh, k));
641 * kh_destroy(keep, rh);
642 * sam_hdr_destroy(h);
643 * sam_close(sf);
644 * }
645 * @endcode
646 *
647 */
648 HTSLIB_EXPORT
649 int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh);
650
651 /// Count the number of lines for a given header type
652 /*!
653 * @param h BAM header
654 * @param type Header type to count. Eg. "RG"
655 * @return Number of lines of this type on success; -1 on failure
656 */
657 HTSLIB_EXPORT
658 int sam_hdr_count_lines(sam_hdr_t *h, const char *type);
659
660 /// Index of the line for the types that have dedicated look-up tables (SQ, RG, PG)
661 /*!
662 * @param h BAM header
663 * @param type Type of the searched line. Eg. "RG"
664 * @param key The value of the identifying key. Eg. "rg1"
665 * @return 0-based index on success; -1 if line does not exist; -2 on failure
666 */
667 HTSLIB_EXPORT
668 int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key);
669
670 /// Id key of the line for the types that have dedicated look-up tables (SQ, RG, PG)
671 /*!
672 * @param h BAM header
673 * @param type Type of the searched line. Eg. "RG"
674 * @param pos Zero-based index inside the type group. Eg. 2 (for the third RG line)
675 * @return Valid key string on success; NULL on failure
676 */
677 HTSLIB_EXPORT
678 const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos);
679
680 /* ==== Key:val level methods ==== */
681
682 /// Return the value associated with a key for a header line identified by ID_key:ID_val
683 /*!
684 * @param type Type of the line to which the tag belongs. Eg. "SQ"
685 * @param ID_key Tag key defining the line. Eg. "SN". Can be NULL, if looking for the first line.
686 * @param ID_value Tag value associated with the key above. Eg. "ref1". Can be NULL, if ID_key is NULL.
687 * @param key Key of the searched tag. Eg. "LN"
688 * @param ks kstring where the value will be written
689 * @return 0 on success
690 * -1 if the requested tag does not exist
691 * -2 on other errors
692 *
693 * Looks for a specific key in a single SAM header line and writes the
694 * associated value into @p ks. The header line is selected using the ID_key
695 * and ID_value parameters. Any pre-existing content in @p ks will be
696 * overwritten.
697 */
698 HTSLIB_EXPORT
699 int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks);
700
701 /// Return the value associated with a key for a header line identified by position
702 /*!
703 * @param type Type of the line to which the tag belongs. Eg. "SQ"
704 * @param position Index in lines of this type (zero-based). E.g. 3
705 * @param key Key of the searched tag. Eg. "LN"
706 * @param ks kstring where the value will be written
707 * @return 0 on success
708 * -1 if the requested tag does not exist
709 * -2 on other errors
710 *
711 * Looks for a specific key in a single SAM header line and writes the
712 * associated value into @p ks. The header line is selected using the @p type
713 * and @p position parameters. Any pre-existing content in @p ks will be
714 * overwritten.
715 */
716 HTSLIB_EXPORT
717 int sam_hdr_find_tag_pos(sam_hdr_t *h, const char *type, int pos, const char *key, kstring_t *ks);
718
719 /// Remove the key from the line identified by type, ID_key and ID_value.
720 /*!
721 * @param type Type of the line to which the tag belongs. Eg. "SQ"
722 * @param ID_key Tag key defining the line. Eg. "SN"
723 * @param ID_value Tag value associated with the key above. Eg. "ref1"
724 * @param key Key of the targeted tag. Eg. "M5"
725 * @return 1 if the key was removed; 0 if it was not present; -1 on error
726 */
727 HTSLIB_EXPORT
728 int sam_hdr_remove_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key);
729
730 /// Get the target id for a given reference sequence name
731 /*!
732 * @param ref Reference name
733 * @return Positive value on success,
734 * -1 if unknown reference,
735 * -2 if the header could not be parsed
736 *
737 * Looks up a reference sequence by name in the reference hash table
738 * and returns the numerical target id.
739 */
740 HTSLIB_EXPORT
741 int sam_hdr_name2tid(sam_hdr_t *h, const char *ref);
742
743 /// Get the reference sequence name from a target index
744 /*!
745 * @param tid Target index
746 * @return Valid reference name on success, NULL on failure
747 *
748 * Fetch the reference sequence name from the target name array,
749 * using the numerical target id.
750 */
751 HTSLIB_EXPORT
752 const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid);
753
754 /// Get the reference sequence length from a target index
755 /*!
756 * @param tid Target index
757 * @return Strictly positive value on success, 0 on failure
758 *
759 * Fetch the reference sequence length from the target length array,
760 * using the numerical target id.
761 */
762 HTSLIB_EXPORT
763 hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid);
764
765 /// Alias of sam_hdr_name2tid(), for backwards compatibility.
766 /*!
767 * @param ref Reference name
768 * @return Positive value on success,
769 * -1 if unknown reference,
770 * -2 if the header could not be parsed
771 */
bam_name2id(sam_hdr_t * h,const char * ref)772 static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); }
773
774 /// Generate a unique \@PG ID: value
775 /*!
776 * @param name Name of the program. Eg. samtools
777 * @return Valid ID on success, NULL on failure
778 *
779 * Returns a unique ID from a base name. The string returned will remain
780 * valid until the next call to this function, or the header is destroyed.
781 * The caller should not attempt to free() or realloc() it.
782 */
783 HTSLIB_EXPORT
784 const char *sam_hdr_pg_id(sam_hdr_t *h, const char *name);
785
786 /// Add an \@PG line.
787 /*!
788 * @param name Name of the program. Eg. samtools
789 * @return 0 on success, -1 on failure
790 *
791 * If we wish complete control over this use sam_hdr_add_line() directly. This
792 * function uses that, but attempts to do a lot of tedious house work for
793 * you too.
794 *
795 * - It will generate a suitable ID if the supplied one clashes.
796 * - It will generate multiple \@PG records if we have multiple PG chains.
797 *
798 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
799 * in NULL.
800 */
801 HTSLIB_EXPORT
802 int sam_hdr_add_pg(sam_hdr_t *h, const char *name, ...);
803
804 /*!
805 * A function to help with construction of CL tags in @PG records.
806 * Takes an argc, argv pair and returns a single space-separated string.
807 * This string should be deallocated by the calling function.
808 *
809 * @return
810 * Returns malloced char * on success;
811 * NULL on failure
812 */
813 HTSLIB_EXPORT
814 char *stringify_argv(int argc, char *argv[]);
815
816 /// Increments the reference count on a header
817 /*!
818 * This permits multiple files to share the same header, all calling
819 * sam_hdr_destroy when done, without causing errors for other open files.
820 */
821 HTSLIB_EXPORT
822 void sam_hdr_incr_ref(sam_hdr_t *h);
823
824 /*
825 * Macros for changing the \@HD line. They eliminate the need to use NULL method arguments.
826 */
827
828 /// Returns the SAM formatted text of the \@HD header line
829 #define sam_hdr_find_hd(h, ks) sam_hdr_find_line_id((h), "HD", NULL, NULL, (ks))
830 /// Returns the value associated with a given \@HD line tag
831 #define sam_hdr_find_tag_hd(h, key, ks) sam_hdr_find_tag_id((h), "HD", NULL, NULL, (key), (ks))
832 /// Adds or updates tags on the header \@HD line
833 #define sam_hdr_update_hd(h, ...) sam_hdr_update_line((h), "HD", NULL, NULL, __VA_ARGS__, NULL)
834 /// Removes the \@HD line tag with the given key
835 #define sam_hdr_remove_tag_hd(h, key) sam_hdr_remove_tag_id((h), "HD", NULL, NULL, (key))
836
837 /* Alignment */
838
839 /// Create a new bam1_t alignment structure
840 /**
841 @return An empty bam1_t structure on success, NULL on failure
842
843 The bam1_t struct returned by a successful call should be freed
844 via bam_destroy1() when it is no longer needed.
845 */
846 HTSLIB_EXPORT
847 bam1_t *bam_init1(void);
848
849 /// Destroy a bam1_t structure
850 /**
851 @param b structure to destroy
852
853 Does nothing if @p b is NULL. If not, all memory associated with @p b
854 will be freed, along with the structure itself. @p b should not be
855 accessed after calling this function.
856 */
857 HTSLIB_EXPORT
858 void bam_destroy1(bam1_t *b);
859
860 #define BAM_USER_OWNS_STRUCT 1
861 #define BAM_USER_OWNS_DATA 2
862
863 /// Set alignment record memory policy
864 /**
865 @param b Alignment record
866 @param policy Desired policy
867
868 Allows the way HTSlib reallocates and frees bam1_t data to be
869 changed. @policy can be set to the bitwise-or of the following
870 values:
871
872 \li \c BAM_USER_OWNS_STRUCT
873 If this is set then bam_destroy1() will not try to free the bam1_t struct.
874
875 \li \c BAM_USER_OWNS_DATA
876 If this is set, bam_destroy1() will not free the bam1_t::data pointer.
877 Also, functions which need to expand bam1_t::data memory will change
878 behaviour. Instead of calling realloc() on the pointer, they will
879 allocate a new data buffer and copy any existing content in to it.
880 The existing memory will \b not be freed. bam1_t::data will be
881 set to point to the new memory and the BAM_USER_OWNS_DATA flag will be
882 cleared.
883
884 BAM_USER_OWNS_STRUCT allows bam_destroy1() to be called on bam1_t
885 structures that are members of an array.
886
887 BAM_USER_OWNS_DATA can be used by applications that want more control
888 over where the variable-length parts of the bam record will be stored.
889 By preventing calls to free() and realloc(), it allows bam1_t::data
890 to hold pointers to memory that cannot be passed to those functions.
891
892 Example: Read a block of alignment records, storing the variable-length
893 data in a single buffer and the records in an array. Stop when either
894 the array or the buffer is full.
895
896 \code{.c}
897 #define MAX_RECS 1000
898 #define REC_LENGTH 400 // Average length estimate, to get buffer size
899 size_t bufsz = MAX_RECS * REC_LENGTH, nrecs, buff_used = 0;
900 bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t));
901 uint8_t *buffer = malloc(bufsz);
902 int res = 0, result = EXIT_FAILURE;
903 uint32_t new_m_data;
904
905 if (!recs || !buffer) goto cleanup;
906 for (nrecs = 0; nrecs < MAX_RECS; nrecs++) {
907 bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA);
908
909 // Set data pointer to unused part of buffer
910 recs[nrecs].data = &buffer[buff_used];
911
912 // Set m_data to size of unused part of buffer. On 64-bit platforms it
913 // will be necessary to limit this to UINT32_MAX due to the size of
914 // bam1_t::m_data (not done here as our buffer is only 400K).
915 recs[nrecs].m_data = bufsz - buff_used;
916
917 // Read the record
918 res = sam_read1(file_handle, header, &recs[nrecs]);
919 if (res <= 0) break; // EOF or error
920
921 // Check if the record data didn't fit - if not, stop reading
922 if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) {
923 nrecs++; // Include last record in count
924 break;
925 }
926
927 // Adjust m_data to the space actually used. If space is available,
928 // round up to eight bytes so the next record aligns nicely.
929 new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U);
930 if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data;
931
932 buff_used += recs[nrecs].m_data;
933 }
934 if (res < 0) goto cleanup;
935 result = EXIT_SUCCESS;
936
937 // ... use data ...
938
939 cleanup:
940 for (size_t i = 0; i < nrecs; i++)
941 bam_destroy1(i);
942 free(buffer);
943 free(recs);
944
945 \endcode
946 */
bam_set_mempolicy(bam1_t * b,uint32_t policy)947 static inline void bam_set_mempolicy(bam1_t *b, uint32_t policy) {
948 b->mempolicy = policy;
949 }
950
951 /// Get alignment record memory policy
952 /** @param b Alignment record
953
954 See bam_set_mempolicy()
955 */
bam_get_mempolicy(bam1_t * b)956 static inline uint32_t bam_get_mempolicy(bam1_t *b) {
957 return b->mempolicy;
958 }
959
960 /// Read a BAM format alignment record
961 /**
962 @param fp BGZF file being read
963 @param b Destination for the alignment data
964 @return number of bytes read on success
965 -1 at end of file
966 < -1 on failure
967
968 This function can only read BAM format files. Most code should use
969 sam_read1() instead, which can be used with BAM, SAM and CRAM formats.
970 */
971 HTSLIB_EXPORT
972 int bam_read1(BGZF *fp, bam1_t *b) HTS_RESULT_USED;
973
974 /// Write a BAM format alignment record
975 /**
976 @param fp BGZF file being written
977 @param b Alignment record to write
978 @return number of bytes written on success
979 -1 on error
980
981 This function can only write BAM format files. Most code should use
982 sam_write1() instead, which can be used with BAM, SAM and CRAM formats.
983 */
984 HTSLIB_EXPORT
985 int bam_write1(BGZF *fp, const bam1_t *b) HTS_RESULT_USED;
986
987 /// Copy alignment record data
988 /**
989 @param bdst Destination alignment record
990 @param bsrc Source alignment record
991 @return bdst on success; NULL on failure
992 */
993 HTSLIB_EXPORT
994 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED;
995
996 /// Create a duplicate alignment record
997 /**
998 @param bsrc Source alignment record
999 @return Pointer to a new alignment record on success; NULL on failure
1000
1001 The bam1_t struct returned by a successful call should be freed
1002 via bam_destroy1() when it is no longer needed.
1003 */
1004 HTSLIB_EXPORT
1005 bam1_t *bam_dup1(const bam1_t *bsrc);
1006
1007 /// Sets all components of an alignment structure
1008 /**
1009 @param bam Target alignment structure. Must be initialized by a call to bam_init1().
1010 The data field will be reallocated automatically as needed.
1011 @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used.
1012 @param qname Query name, may be NULL if l_qname = 0
1013 @param flag Bitwise flag, a combination of the BAM_F* constants.
1014 @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME).
1015 @param pos 0-based leftmost coordinate.
1016 @param mapq Mapping quality.
1017 @param n_cigar Number of CIGAR operations.
1018 @param cigar CIGAR data, may be NULL if n_cigar = 0.
1019 @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT).
1020 @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT).
1021 @param isize Observed template length ("insert size") (a.k.a. TLEN).
1022 @param l_seq Length of the query sequence (read) and sequence quality string.
1023 @param seq Sequence, may be NULL if l_seq = 0.
1024 @param qual Sequence quality, may be NULL.
1025 @param l_aux Length to be reserved for auxiliary field data, may be 0.
1026
1027 @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure.
1028 */
1029 HTSLIB_EXPORT
1030 int bam_set1(bam1_t *bam,
1031 size_t l_qname, const char *qname,
1032 uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
1033 size_t n_cigar, const uint32_t *cigar,
1034 int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
1035 size_t l_seq, const char *seq, const char *qual,
1036 size_t l_aux);
1037
1038 /// Calculate query length from CIGAR data
1039 /**
1040 @param n_cigar Number of items in @p cigar
1041 @param cigar CIGAR data
1042 @return Query length
1043
1044 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op
1045 where op_len is the length in bases and op is a value between 0 and 8
1046 representing one of the operations "MIDNSHP=X" (M = 0; X = 8)
1047
1048 This function returns the sum of the lengths of the M, I, S, = and X
1049 operations in @p cigar (these are the operations that "consume" query
1050 bases). All other operations (including invalid ones) are ignored.
1051
1052 @note This return type of this function is hts_pos_t so that it can
1053 correctly return the length of CIGAR sequences including many long
1054 operations without overflow. However, other restrictions (notably the sizes
1055 of bam1_core_t::l_qseq and bam1_t::data) limit the maximum query sequence
1056 length supported by HTSlib to fewer than INT_MAX bases.
1057 */
1058 HTSLIB_EXPORT
1059 hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar);
1060
1061 /// Calculate reference length from CIGAR data
1062 /**
1063 @param n_cigar Number of items in @p cigar
1064 @param cigar CIGAR data
1065 @return Reference length
1066
1067 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op
1068 where op_len is the length in bases and op is a value between 0 and 8
1069 representing one of the operations "MIDNSHP=X" (M = 0; X = 8)
1070
1071 This function returns the sum of the lengths of the M, D, N, = and X
1072 operations in @p cigar (these are the operations that "consume" reference
1073 bases). All other operations (including invalid ones) are ignored.
1074 */
1075 HTSLIB_EXPORT
1076 hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar);
1077
1078 /*!
1079 @abstract Calculate the rightmost base position of an alignment on the
1080 reference genome.
1081
1082 @param b pointer to an alignment
1083 @return the coordinate of the first base after the alignment, 0-based
1084
1085 @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
1086 For an unmapped read (either according to its flags or if it has no cigar
1087 string) or a read whose cigar string consumes no reference bases at all,
1088 we return b->core.pos + 1 by convention.
1089 */
1090 HTSLIB_EXPORT
1091 hts_pos_t bam_endpos(const bam1_t *b);
1092
1093 HTSLIB_EXPORT
1094 int bam_str2flag(const char *str); /** returns negative value on error */
1095
1096 HTSLIB_EXPORT
1097 char *bam_flag2str(int flag); /** The string must be freed by the user */
1098
1099 /*! @function
1100 @abstract Set the name of the query
1101 @param b pointer to an alignment
1102 @return 0 on success, -1 on failure
1103 */
1104 HTSLIB_EXPORT
1105 int bam_set_qname(bam1_t *b, const char *qname);
1106
1107 /*! @function
1108 @abstract Parse a CIGAR string into a uint32_t array
1109 @param in [in] pointer to the source string
1110 @param end [out] address of the pointer to the new end of the input string
1111 can be NULL
1112 @param a_cigar [in/out] address of the destination uint32_t buffer
1113 @param a_mem [in/out] address of the allocated number of buffer elements
1114 @return number of processed CIGAR operators; -1 on error
1115 */
1116 HTSLIB_EXPORT
1117 ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem);
1118
1119 /*! @function
1120 @abstract Parse a CIGAR string into a bam1_t struct
1121 @param in [in] pointer to the source string
1122 @param end [out] address of the pointer to the new end of the input string
1123 can be NULL
1124 @param b [in/out] address of the destination bam1_t struct
1125 @return number of processed CIGAR operators; -1 on error
1126 */
1127 HTSLIB_EXPORT
1128 ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b);
1129
1130 /*************************
1131 *** BAM/CRAM indexing ***
1132 *************************/
1133
1134 // These BAM iterator functions work only on BAM files. To work with either
1135 // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
1136 #define bam_itr_destroy(iter) hts_itr_destroy(iter)
1137 #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end)
1138 #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region)
1139 #define bam_itr_next(htsfp, itr, r) sam_itr_next((htsfp), (itr), (r))
1140
1141 // Load/build .csi or .bai BAM index file. Does not work with CRAM.
1142 // It is recommended to use the sam_index_* functions below instead.
1143 #define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI)
1144 #define bam_index_build(fn, min_shift) (sam_index_build((fn), (min_shift)))
1145
1146 /// Initialise fp->idx for the current format type for SAM, BAM and CRAM types .
1147 /** @param fp File handle for the data file being written.
1148 @param h Bam header structured (needed for BAI and CSI).
1149 @param min_shift 0 for BAI, or larger for CSI (CSI defaults to 14).
1150 @param fnidx Filename to write index to. This pointer must remain valid
1151 until after sam_idx_save is called.
1152 @return 0 on success, <0 on failure.
1153
1154 @note This must be called after the header has been written, but before
1155 any other data.
1156 */
1157 HTSLIB_EXPORT
1158 int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx);
1159
1160 /// Writes the index initialised with sam_idx_init to disk.
1161 /** @param fp File handle for the data file being written.
1162 @return 0 on success, <0 on failure.
1163 */
1164 HTSLIB_EXPORT
1165 int sam_idx_save(htsFile *fp) HTS_RESULT_USED;
1166
1167 /// Load a BAM (.csi or .bai) or CRAM (.crai) index file
1168 /** @param fp File handle of the data file whose index is being opened
1169 @param fn BAM/CRAM/etc filename to search alongside for the index file
1170 @return The index, or NULL if an error occurred.
1171
1172 Equivalent to sam_index_load3(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1173 */
1174 HTSLIB_EXPORT
1175 hts_idx_t *sam_index_load(htsFile *fp, const char *fn);
1176
1177 /// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
1178 /** @param fp File handle of the data file whose index is being opened
1179 @param fn BAM/CRAM/etc data file filename
1180 @param fnidx Index filename, or NULL to search alongside @a fn
1181 @return The index, or NULL if an error occurred.
1182
1183 Equivalent to sam_index_load3(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1184 */
1185 HTSLIB_EXPORT
1186 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx);
1187
1188 /// Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
1189 /** @param fp File handle of the data file whose index is being opened
1190 @param fn BAM/CRAM/etc data file filename
1191 @param fnidx Index filename, or NULL to search alongside @a fn
1192 @param flags Flags to alter behaviour (see description)
1193 @return The index, or NULL if an error occurred.
1194
1195 The @p flags parameter can be set to a combination of the following values:
1196
1197 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes
1198 HTS_IDX_SILENT_FAIL Fail silently if the index is not present
1199
1200 Note that HTS_IDX_SAVE_REMOTE has no effect for remote CRAM indexes. They
1201 are always downloaded and never cached locally.
1202
1203 The index struct returned by a successful call should be freed
1204 via hts_idx_destroy() when it is no longer needed.
1205 */
1206 HTSLIB_EXPORT
1207 hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags);
1208
1209 /// Generate and save an index file
1210 /** @param fn Input BAM/etc filename, to which .csi/etc will be added
1211 @param min_shift Positive to generate CSI, or 0 to generate BAI
1212 @return 0 if successful, or negative if an error occurred (usually -1; or
1213 -2: opening fn failed; -3: format not indexable; -4:
1214 failed to create and/or save the index)
1215 */
1216 HTSLIB_EXPORT
1217 int sam_index_build(const char *fn, int min_shift) HTS_RESULT_USED;
1218
1219 /// Generate and save an index to a specific file
1220 /** @param fn Input BAM/CRAM/etc filename
1221 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
1222 @param min_shift Positive to generate CSI, or 0 to generate BAI
1223 @return 0 if successful, or negative if an error occurred (see
1224 sam_index_build for error codes)
1225 */
1226 HTSLIB_EXPORT
1227 int sam_index_build2(const char *fn, const char *fnidx, int min_shift) HTS_RESULT_USED;
1228
1229 /// Generate and save an index to a specific file
1230 /** @param fn Input BAM/CRAM/etc filename
1231 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
1232 @param min_shift Positive to generate CSI, or 0 to generate BAI
1233 @param nthreads Number of threads to use when building the index
1234 @return 0 if successful, or negative if an error occurred (see
1235 sam_index_build for error codes)
1236 */
1237 HTSLIB_EXPORT
1238 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) HTS_RESULT_USED;
1239
1240 /// Free a SAM iterator
1241 /// @param iter Iterator to free
1242 #define sam_itr_destroy(iter) hts_itr_destroy(iter)
1243
1244 /// Create a BAM/CRAM iterator
1245 /** @param idx Index
1246 @param tid Target id
1247 @param beg Start position in target
1248 @param end End position in target
1249 @return An iterator on success; NULL on failure
1250
1251 The following special values (defined in htslib/hts.h)can be used for @p tid.
1252 When using one of these values, @p beg and @p end are ignored.
1253
1254 HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
1255 HTS_IDX_START iterates over the entire file
1256 HTS_IDX_REST iterates from the current position to the end of the file
1257 HTS_IDX_NONE always returns "no more alignment records"
1258
1259 When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx.
1260 */
1261 HTSLIB_EXPORT
1262 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end);
1263
1264 /// Create a SAM/BAM/CRAM iterator
1265 /** @param idx Index
1266 @param hdr Header
1267 @param region Region specification
1268 @return An iterator on success; NULL on failure
1269
1270 Regions are parsed by hts_parse_reg(), and take one of the following forms:
1271
1272 region | Outputs
1273 --------------- | -------------
1274 REF | All reads with RNAME REF
1275 REF: | All reads with RNAME REF
1276 REF:START | Reads with RNAME REF overlapping START to end of REF
1277 REF:-END | Reads with RNAME REF overlapping start of REF to END
1278 REF:START-END | Reads with RNAME REF overlapping START to END
1279 . | All reads from the start of the file
1280 * | Unmapped reads at the end of the file (RNAME '*' in SAM)
1281
1282 The form `REF:` should be used when the reference name itself contains a colon.
1283
1284 Note that SAM files must be bgzf-compressed for iterators to work.
1285 */
1286 HTSLIB_EXPORT
1287 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region);
1288
1289 /// Create a multi-region iterator
1290 /** @param idx Index
1291 @param hdr Header
1292 @param reglist Array of regions to iterate over
1293 @param regcount Number of items in reglist
1294
1295 Each @p reglist entry should have the reference name in the `reg` field, an
1296 array of regions for that reference in `intervals` and the number of items
1297 in `intervals` should be stored in `count`. No other fields need to be filled
1298 in.
1299
1300 The iterator will return all reads overlapping the given regions. If a read
1301 overlaps more than one region, it will only be returned once.
1302 */
1303 HTSLIB_EXPORT
1304 hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount);
1305
1306 /// Create a multi-region iterator
1307 /** @param idx Index
1308 @param hdr Header
1309 @param regarray Array of ref:interval region specifiers
1310 @param regcount Number of items in regarray
1311
1312 Each @p regarray entry is parsed by hts_parse_reg(), and takes one of the
1313 following forms:
1314
1315 region | Outputs
1316 --------------- | -------------
1317 REF | All reads with RNAME REF
1318 REF: | All reads with RNAME REF
1319 REF:START | Reads with RNAME REF overlapping START to end of REF
1320 REF:-END | Reads with RNAME REF overlapping start of REF to END
1321 REF:START-END | Reads with RNAME REF overlapping START to END
1322 . | All reads from the start of the file
1323 * | Unmapped reads at the end of the file (RNAME '*' in SAM)
1324
1325 The form `REF:` should be used when the reference name itself contains a colon.
1326
1327 The iterator will return all reads overlapping the given regions. If a read
1328 overlaps more than one region, it will only be returned once.
1329 */
1330 HTSLIB_EXPORT
1331 hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount);
1332
1333 /// Get the next read from a SAM/BAM/CRAM iterator
1334 /** @param htsfp Htsfile pointer for the input file
1335 @param itr Iterator
1336 @param r Pointer to a bam1_t struct
1337 @return >= 0 on success; -1 when there is no more data; < -1 on error
1338 */
sam_itr_next(htsFile * htsfp,hts_itr_t * itr,bam1_t * r)1339 static inline int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, bam1_t *r) {
1340 if (!htsfp->is_bgzf && !htsfp->is_cram) {
1341 hts_log_error("%s not BGZF compressed", htsfp->fn ? htsfp->fn : "File");
1342 return -2;
1343 }
1344 if (!itr) {
1345 hts_log_error("Null iterator");
1346 return -2;
1347 }
1348
1349 if (itr->multi)
1350 return hts_itr_multi_next(htsfp, itr, r);
1351 else
1352 return hts_itr_next(htsfp->is_bgzf ? htsfp->fp.bgzf : NULL, itr, r, htsfp);
1353 }
1354
1355 /// Get the next read from a BAM/CRAM multi-iterator
1356 /** @param htsfp Htsfile pointer for the input file
1357 @param itr Iterator
1358 @param r Pointer to a bam1_t struct
1359 @return >= 0 on success; -1 when there is no more data; < -1 on error
1360 */
1361 #define sam_itr_multi_next(htsfp, itr, r) sam_itr_next(htsfp, itr, r)
1362
1363 HTSLIB_EXPORT
1364 const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
1365 hts_pos_t *beg, hts_pos_t *end, int flags);
1366
1367 /***************
1368 *** SAM I/O ***
1369 ***************/
1370
1371 #define sam_open(fn, mode) (hts_open((fn), (mode)))
1372 #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt)))
1373 #define sam_flush(fp) hts_flush((fp))
1374 #define sam_close(fp) hts_close(fp)
1375
1376 HTSLIB_EXPORT
1377 int sam_open_mode(char *mode, const char *fn, const char *format);
1378
1379 // A version of sam_open_mode that can handle ,key=value options.
1380 // The format string is allocated and returned, to be freed by the caller.
1381 // Prefix should be "r" or "w",
1382 HTSLIB_EXPORT
1383 char *sam_open_mode_opts(const char *fn,
1384 const char *mode,
1385 const char *format);
1386
1387 HTSLIB_EXPORT
1388 int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val);
1389
1390 HTSLIB_EXPORT
1391 int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
1392 HTSLIB_EXPORT
1393 int sam_format1(const sam_hdr_t *h, const bam1_t *b, kstring_t *str) HTS_RESULT_USED;
1394
1395 /// sam_read1 - Read a record from a file
1396 /** @param fp Pointer to the source file
1397 * @param h Pointer to the header previously read (fully or partially)
1398 * @param b Pointer to the record placeholder
1399 * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error
1400 */
1401 HTSLIB_EXPORT
1402 int sam_read1(samFile *fp, sam_hdr_t *h, bam1_t *b) HTS_RESULT_USED;
1403 /// sam_write1 - Write a record to a file
1404 /** @param fp Pointer to the destination file
1405 * @param h Pointer to the header structure previously read
1406 * @param b Pointer to the record to be written
1407 * @return >= 0 on successfully writing the record, -1 on error
1408 */
1409 HTSLIB_EXPORT
1410 int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED;
1411
1412 // Forward declaration, see hts_expr.h for full.
1413 struct hts_filter_t;
1414
1415 /// sam_passes_filter - Checks whether a record passes an hts_filter.
1416 /** @param h Pointer to the header structure previously read
1417 * @param b Pointer to the BAM record to be checked
1418 * @param filt Pointer to the filter, created from hts_filter_init.
1419 * @return 1 if passes, 0 if not, and <0 on error.
1420 */
1421 HTSLIB_EXPORT
1422 int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
1423 struct hts_filter_t *filt);
1424
1425 /*************************************
1426 *** Manipulating auxiliary fields ***
1427 *************************************/
1428
1429 /// Converts a BAM aux tag to SAM format
1430 /*
1431 * @param b Pointer to the bam record
1432 * @param key Two letter tag key
1433 * @param type Single letter type code: ACcSsIifHZB.
1434 * @param tag Tag data pointer, in BAM format
1435 * @param end Pointer to end of bam record (largest extent of tag)
1436 * @param ks kstring to write the formatted tag to
1437 *
1438 * @return pointer to end of tag on success,
1439 * NULL on failure.
1440 *
1441 * @discussion The three separate parameters key, type, tag may be
1442 * derived from a s=bam_aux_get() query as s-2, *s and s+1. However
1443 * it is recommended to use bam_aux_get_str in this situation.
1444 * The desire to split these parameters up is for potential processing
1445 * of non-BAM formats that encode using a BAM type mechanism
1446 * (such as the internal CRAM representation).
1447 */
sam_format_aux1(const uint8_t * key,const uint8_t type,const uint8_t * tag,const uint8_t * end,kstring_t * ks)1448 static inline const uint8_t *sam_format_aux1(const uint8_t *key,
1449 const uint8_t type,
1450 const uint8_t *tag,
1451 const uint8_t *end,
1452 kstring_t *ks) {
1453 int r = 0;
1454 const uint8_t *s = tag; // brevity and consistency with other code.
1455 r |= kputsn_((char*)key, 2, ks) < 0;
1456 r |= kputc_(':', ks) < 0;
1457 if (type == 'C') {
1458 r |= kputsn_("i:", 2, ks) < 0;
1459 r |= kputw(*s, ks) < 0;
1460 ++s;
1461 } else if (type == 'c') {
1462 r |= kputsn_("i:", 2, ks) < 0;
1463 r |= kputw(*(int8_t*)s, ks) < 0;
1464 ++s;
1465 } else if (type == 'S') {
1466 if (end - s >= 2) {
1467 r |= kputsn_("i:", 2, ks) < 0;
1468 r |= kputuw(le_to_u16(s), ks) < 0;
1469 s += 2;
1470 } else goto bad_aux;
1471 } else if (type == 's') {
1472 if (end - s >= 2) {
1473 r |= kputsn_("i:", 2, ks) < 0;
1474 r |= kputw(le_to_i16(s), ks) < 0;
1475 s += 2;
1476 } else goto bad_aux;
1477 } else if (type == 'I') {
1478 if (end - s >= 4) {
1479 r |= kputsn_("i:", 2, ks) < 0;
1480 r |= kputuw(le_to_u32(s), ks) < 0;
1481 s += 4;
1482 } else goto bad_aux;
1483 } else if (type == 'i') {
1484 if (end - s >= 4) {
1485 r |= kputsn_("i:", 2, ks) < 0;
1486 r |= kputw(le_to_i32(s), ks) < 0;
1487 s += 4;
1488 } else goto bad_aux;
1489 } else if (type == 'A') {
1490 r |= kputsn_("A:", 2, ks) < 0;
1491 r |= kputc_(*s, ks) < 0;
1492 ++s;
1493 } else if (type == 'f') {
1494 if (end - s >= 4) {
1495 ksprintf(ks, "f:%g", le_to_float(s));
1496 s += 4;
1497 } else goto bad_aux;
1498
1499 } else if (type == 'd') {
1500 // NB: "d" is not an official type in the SAM spec.
1501 // However for unknown reasons samtools has always supported this.
1502 // We believe, HOPE, it is not in general usage and we do not
1503 // encourage it.
1504 if (end - s >= 8) {
1505 ksprintf(ks, "d:%g", le_to_double(s));
1506 s += 8;
1507 } else goto bad_aux;
1508 } else if (type == 'Z' || type == 'H') {
1509 r |= kputc_(type, ks) < 0;
1510 r |= kputc_(':', ks) < 0;
1511 while (s < end && *s) r |= kputc_(*s++, ks) < 0;
1512 if (s >= end)
1513 goto bad_aux;
1514 ++s;
1515 } else if (type == 'B') {
1516 uint8_t sub_type = *(s++);
1517 int sub_type_size;
1518
1519 // or externalise sam.c's aux_type2size function?
1520 switch (sub_type) {
1521 case 'A': case 'c': case 'C':
1522 sub_type_size = 1;
1523 break;
1524 case 's': case 'S':
1525 sub_type_size = 2;
1526 break;
1527 case 'i': case 'I': case 'f':
1528 sub_type_size = 4;
1529 break;
1530 default:
1531 sub_type_size = 0;
1532 break;
1533 }
1534
1535 uint32_t i, n;
1536 if (sub_type_size == 0 || end - s < 4)
1537 goto bad_aux;
1538 n = le_to_u32(s);
1539 s += 4; // now points to the start of the array
1540 if ((end - s) / sub_type_size < n)
1541 goto bad_aux;
1542 r |= kputsn_("B:", 2, ks) < 0;
1543 r |= kputc(sub_type, ks) < 0; // write the type
1544 switch (sub_type) {
1545 case 'c':
1546 if (ks_expand(ks, n*2) < 0) goto mem_err;
1547 for (i = 0; i < n; ++i) {
1548 ks->s[ks->l++] = ',';
1549 r |= kputw(*(int8_t*)s, ks) < 0;
1550 ++s;
1551 }
1552 break;
1553 case 'C':
1554 if (ks_expand(ks, n*2) < 0) goto mem_err;
1555 for (i = 0; i < n; ++i) {
1556 ks->s[ks->l++] = ',';
1557 r |= kputuw(*(uint8_t*)s, ks) < 0;
1558 ++s;
1559 }
1560 break;
1561 case 's':
1562 if (ks_expand(ks, n*4) < 0) goto mem_err;
1563 for (i = 0; i < n; ++i) {
1564 ks->s[ks->l++] = ',';
1565 r |= kputw(le_to_i16(s), ks) < 0;
1566 s += 2;
1567 }
1568 break;
1569 case 'S':
1570 if (ks_expand(ks, n*4) < 0) goto mem_err;
1571 for (i = 0; i < n; ++i) {
1572 ks->s[ks->l++] = ',';
1573 r |= kputuw(le_to_u16(s), ks) < 0;
1574 s += 2;
1575 }
1576 break;
1577 case 'i':
1578 if (ks_expand(ks, n*6) < 0) goto mem_err;
1579 for (i = 0; i < n; ++i) {
1580 ks->s[ks->l++] = ',';
1581 r |= kputw(le_to_i32(s), ks) < 0;
1582 s += 4;
1583 }
1584 break;
1585 case 'I':
1586 if (ks_expand(ks, n*6) < 0) goto mem_err;
1587 for (i = 0; i < n; ++i) {
1588 ks->s[ks->l++] = ',';
1589 r |= kputuw(le_to_u32(s), ks) < 0;
1590 s += 4;
1591 }
1592 break;
1593 case 'f':
1594 if (ks_expand(ks, n*8) < 0) goto mem_err;
1595 for (i = 0; i < n; ++i) {
1596 ks->s[ks->l++] = ',';
1597 r |= kputd(le_to_float(s), ks) < 0;
1598 s += 4;
1599 }
1600 break;
1601 default:
1602 goto bad_aux;
1603 }
1604 } else { // Unknown type
1605 goto bad_aux;
1606 }
1607 return r ? NULL : s;
1608
1609 bad_aux:
1610 errno = EINVAL;
1611 return NULL;
1612
1613 mem_err:
1614 hts_log_error("Out of memory");
1615 errno = ENOMEM;
1616 return NULL;
1617 }
1618
1619 /// Return a pointer to an aux record
1620 /** @param b Pointer to the bam record
1621 @param tag Desired aux tag
1622 @return Pointer to the tag data, or NULL if tag is not present or on error
1623 If the tag is not present, this function returns NULL and sets errno to
1624 ENOENT. If the bam record's aux data is corrupt (either a tag has an
1625 invalid type, or the last record is incomplete) then errno is set to
1626 EINVAL and NULL is returned.
1627 */
1628 HTSLIB_EXPORT
1629 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
1630
1631 /// Return a SAM formatting string containing a BAM tag
1632 /** @param b Pointer to the bam record
1633 @param tag Desired aux tag
1634 @param s The kstring to write to.
1635
1636 @return 1 on success,
1637 0 on no tag found with errno = ENOENT,
1638 -1 on error (errno will be either EINVAL or ENOMEM).
1639 */
bam_aux_get_str(const bam1_t * b,const char tag[2],kstring_t * s)1640 static inline int bam_aux_get_str(const bam1_t *b,
1641 const char tag[2],
1642 kstring_t *s) {
1643 const uint8_t *t = bam_aux_get(b, tag);
1644 if (!t)
1645 return errno == ENOENT ? 0 : -1;
1646
1647 if (!sam_format_aux1(t-2, *t, t+1, b->data + b->l_data, s))
1648 return -1;
1649
1650 return 1;
1651 }
1652
1653 /// Get an integer aux value
1654 /** @param s Pointer to the tag data, as returned by bam_aux_get()
1655 @return The value, or 0 if the tag was not an integer type
1656 If the tag is not an integer type, errno is set to EINVAL. This function
1657 will not return the value of floating-point tags.
1658 */
1659 HTSLIB_EXPORT
1660 int64_t bam_aux2i(const uint8_t *s);
1661
1662 /// Get an integer aux value
1663 /** @param s Pointer to the tag data, as returned by bam_aux_get()
1664 @return The value, or 0 if the tag was not an integer type
1665 If the tag is not an numeric type, errno is set to EINVAL. The value of
1666 integer flags will be returned cast to a double.
1667 */
1668 HTSLIB_EXPORT
1669 double bam_aux2f(const uint8_t *s);
1670
1671 /// Get a character aux value
1672 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1673 @return The value, or 0 if the tag was not a character ('A') type
1674 If the tag is not a character type, errno is set to EINVAL.
1675 */
1676 HTSLIB_EXPORT
1677 char bam_aux2A(const uint8_t *s);
1678
1679 /// Get a string aux value
1680 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1681 @return Pointer to the string, or NULL if the tag was not a string type
1682 If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL.
1683 */
1684 HTSLIB_EXPORT
1685 char *bam_aux2Z(const uint8_t *s);
1686
1687 /// Get the length of an array-type ('B') tag
1688 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1689 @return The length of the array, or 0 if the tag is not an array type.
1690 If the tag is not an array type, errno is set to EINVAL.
1691 */
1692 HTSLIB_EXPORT
1693 uint32_t bam_auxB_len(const uint8_t *s);
1694
1695 /// Get an integer value from an array-type tag
1696 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1697 @param idx 0-based Index into the array
1698 @return The idx'th value, or 0 on error.
1699 If the array is not an integer type, errno is set to EINVAL. If idx
1700 is greater than or equal to the value returned by bam_auxB_len(s),
1701 errno is set to ERANGE. In both cases, 0 will be returned.
1702 */
1703 HTSLIB_EXPORT
1704 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx);
1705
1706 /// Get a floating-point value from an array-type tag
1707 /** @param s Pointer to the tag data, as returned by bam_aux_get().
1708 @param idx 0-based Index into the array
1709 @return The idx'th value, or 0.0 on error.
1710 If the array is not a numeric type, errno is set to EINVAL. This can
1711 only actually happen if the input record has an invalid type field. If
1712 idx is greater than or equal to the value returned by bam_auxB_len(s),
1713 errno is set to ERANGE. In both cases, 0.0 will be returned.
1714 */
1715 HTSLIB_EXPORT
1716 double bam_auxB2f(const uint8_t *s, uint32_t idx);
1717
1718 /// Append tag data to a bam record
1719 /* @param b The bam record to append to.
1720 @param tag Tag identifier
1721 @param type Tag data type
1722 @param len Length of the data in bytes
1723 @param data The data to append
1724 @return 0 on success; -1 on failure.
1725 If there is not enough space to store the additional tag, errno is set to
1726 ENOMEM. If the type is invalid, errno may be set to EINVAL. errno is
1727 also set to EINVAL if the bam record's aux data is corrupt.
1728 */
1729 HTSLIB_EXPORT
1730 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data);
1731
1732 /// Delete tag data from a bam record
1733 /* @param b The bam record to update
1734 @param s Pointer to the tag to delete, as returned by bam_aux_get().
1735 @return 0 on success; -1 on failure
1736 If the bam record's aux data is corrupt, errno is set to EINVAL and this
1737 function returns -1;
1738 */
1739 HTSLIB_EXPORT
1740 int bam_aux_del(bam1_t *b, uint8_t *s);
1741
1742 /// Update or add a string-type tag
1743 /* @param b The bam record to update
1744 @param tag Tag identifier
1745 @param len The length of the new string
1746 @param data The new string
1747 @return 0 on success, -1 on failure
1748 This function will not change the ordering of tags in the bam record.
1749 New tags will be appended to any existing aux records.
1750
1751 If @p len is less than zero, the length of the input string will be
1752 calculated using strlen(). Otherwise exactly @p len bytes will be
1753 copied from @p data to make the new tag. If these bytes do not
1754 include a terminating NUL character, one will be added. (Note that
1755 versions of HTSlib up to 1.10.2 had different behaviour here and
1756 simply copied @p len bytes from data. To generate a valid tag it
1757 was necessary to ensure the last character was a NUL, and include
1758 it in @p len.)
1759
1760 On failure, errno may be set to one of the following values:
1761
1762 EINVAL: The bam record's aux data is corrupt or an existing tag with the
1763 given ID is not of type 'Z'.
1764
1765 ENOMEM: The bam data needs to be expanded and either the attempt to
1766 reallocate the data buffer failed or the resulting buffer would be
1767 longer than the maximum size allowed in a bam record (2Gbytes).
1768 */
1769 HTSLIB_EXPORT
1770 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data);
1771
1772 /// Update or add an integer tag
1773 /* @param b The bam record to update
1774 @param tag Tag identifier
1775 @param val The new value
1776 @return 0 on success, -1 on failure
1777 This function will not change the ordering of tags in the bam record.
1778 New tags will be appended to any existing aux records.
1779
1780 On failure, errno may be set to one of the following values:
1781
1782 EINVAL: The bam record's aux data is corrupt or an existing tag with the
1783 given ID is not of an integer type (c, C, s, S, i or I).
1784
1785 EOVERFLOW (or ERANGE on systems that do not have EOVERFLOW): val is
1786 outside the range that can be stored in an integer bam tag (-2147483647
1787 to 4294967295).
1788
1789 ENOMEM: The bam data needs to be expanded and either the attempt to
1790 reallocate the data buffer failed or the resulting buffer would be
1791 longer than the maximum size allowed in a bam record (2Gbytes).
1792 */
1793 HTSLIB_EXPORT
1794 int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val);
1795
1796 /// Update or add a floating-point tag
1797 /* @param b The bam record to update
1798 @param tag Tag identifier
1799 @param val The new value
1800 @return 0 on success, -1 on failure
1801 This function will not change the ordering of tags in the bam record.
1802 New tags will be appended to any existing aux records.
1803
1804 On failure, errno may be set to one of the following values:
1805
1806 EINVAL: The bam record's aux data is corrupt or an existing tag with the
1807 given ID is not of a float type.
1808
1809 ENOMEM: The bam data needs to be expanded and either the attempt to
1810 reallocate the data buffer failed or the resulting buffer would be
1811 longer than the maximum size allowed in a bam record (2Gbytes).
1812 */
1813 HTSLIB_EXPORT
1814 int bam_aux_update_float(bam1_t *b, const char tag[2], float val);
1815
1816 /// Update or add an array tag
1817 /* @param b The bam record to update
1818 @param tag Tag identifier
1819 @param type Data type (one of c, C, s, S, i, I or f)
1820 @param items Number of items
1821 @param data Pointer to data
1822 @return 0 on success, -1 on failure
1823 The type parameter indicates the how the data is interpreted:
1824
1825 Letter code | Data type | Item Size (bytes)
1826 ----------- | --------- | -----------------
1827 c | int8_t | 1
1828 C | uint8_t | 1
1829 s | int16_t | 2
1830 S | uint16_t | 2
1831 i | int32_t | 4
1832 I | uint32_t | 4
1833 f | float | 4
1834
1835 This function will not change the ordering of tags in the bam record.
1836 New tags will be appended to any existing aux records. The bam record
1837 will grow or shrink in order to accommodate the new data.
1838
1839 The data parameter must not point to any data in the bam record itself or
1840 undefined behaviour may result.
1841
1842 On failure, errno may be set to one of the following values:
1843
1844 EINVAL: The bam record's aux data is corrupt, an existing tag with the
1845 given ID is not of an array type or the type parameter is not one of
1846 the values listed above.
1847
1848 ENOMEM: The bam data needs to be expanded and either the attempt to
1849 reallocate the data buffer failed or the resulting buffer would be
1850 longer than the maximum size allowed in a bam record (2Gbytes).
1851 */
1852 HTSLIB_EXPORT
1853 int bam_aux_update_array(bam1_t *b, const char tag[2],
1854 uint8_t type, uint32_t items, void *data);
1855
1856 /**************************
1857 *** Pileup and Mpileup ***
1858 **************************/
1859
1860 #if !defined(BAM_NO_PILEUP)
1861
1862 /*! @typedef
1863 @abstract Generic pileup 'client data'.
1864
1865 @discussion The pileup iterator allows setting a constructor and
1866 destructor function, which will be called every time a sequence is
1867 fetched and discarded. This permits caching of per-sequence data in
1868 a tidy manner during the pileup process. This union is the cached
1869 data to be manipulated by the "client" (the caller of pileup).
1870 */
1871 typedef union {
1872 void *p;
1873 int64_t i;
1874 double f;
1875 } bam_pileup_cd;
1876
1877 /*! @typedef
1878 @abstract Structure for one alignment covering the pileup position.
1879 @field b pointer to the alignment
1880 @field qpos position of the read base at the pileup site, 0-based
1881 @field indel indel length; 0 for no indel, positive for ins and negative for del
1882 @field level the level of the read in the "viewer" mode
1883 @field is_del 1 iff the base on the padded read is a deletion
1884 @field is_head 1 iff this is the first base in the query sequence
1885 @field is_tail 1 iff this is the last base in the query sequence
1886 @field is_refskip 1 iff the base on the padded read is part of CIGAR N op
1887 @field aux (used by bcf_call_gap_prep())
1888 @field cigar_ind index of the CIGAR operator that has just been processed
1889
1890 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
1891 difference between the two functions is that the former does not
1892 set bam_pileup1_t::level, while the later does. Level helps the
1893 implementation of alignment viewers, but calculating this has some
1894 overhead.
1895 */
1896 typedef struct bam_pileup1_t {
1897 bam1_t *b;
1898 int32_t qpos;
1899 int indel, level;
1900 uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, /* reserved */ :1, aux:27;
1901 bam_pileup_cd cd; // generic per-struct data, owned by caller.
1902 int cigar_ind;
1903 } bam_pileup1_t;
1904
1905 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
1906
1907 struct bam_plp_s;
1908 typedef struct bam_plp_s *bam_plp_t;
1909
1910 struct bam_mplp_s;
1911 typedef struct bam_mplp_s *bam_mplp_t;
1912
1913 /**
1914 * bam_plp_init() - sets an iterator over multiple
1915 * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
1916 * status: 0 on success, -1 on end, < -1 on non-recoverable errors
1917 * @data: user data to pass to @func
1918 *
1919 * The struct returned by a successful call should be freed
1920 * via bam_plp_destroy() when it is no longer needed.
1921 */
1922 HTSLIB_EXPORT
1923 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
1924
1925 HTSLIB_EXPORT
1926 void bam_plp_destroy(bam_plp_t iter);
1927
1928 HTSLIB_EXPORT
1929 int bam_plp_push(bam_plp_t iter, const bam1_t *b);
1930
1931 HTSLIB_EXPORT
1932 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
1933
1934 HTSLIB_EXPORT
1935 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
1936
1937 HTSLIB_EXPORT
1938 const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp);
1939
1940 HTSLIB_EXPORT
1941 const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp);
1942
1943 HTSLIB_EXPORT
1944 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
1945
1946 HTSLIB_EXPORT
1947 void bam_plp_reset(bam_plp_t iter);
1948
1949 /**
1950 * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields.
1951 * @plp: The bam_plp_t initialised using bam_plp_init.
1952 * @func: The callback function itself. When called, it is given
1953 * the data argument (specified in bam_plp_init), the bam
1954 * structure and a pointer to a locally allocated
1955 * bam_pileup_cd union. This union will also be present in
1956 * each bam_pileup1_t created.
1957 * The callback function should have a negative return
1958 * value to indicate an error. (Similarly for destructor.)
1959 */
1960 HTSLIB_EXPORT
1961 void bam_plp_constructor(bam_plp_t plp,
1962 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
1963 HTSLIB_EXPORT
1964 void bam_plp_destructor(bam_plp_t plp,
1965 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
1966
1967 /// Get pileup padded insertion sequence
1968 /**
1969 * @param p pileup data
1970 * @param ins the kstring where the insertion sequence will be written
1971 * @param del_len location for deletion length
1972 * @return the length of insertion string on success; -1 on failure.
1973 *
1974 * Fills out the kstring with the padded insertion sequence for the current
1975 * location in 'p'. If this is not an insertion site, the string is blank.
1976 *
1977 * If del_len is not NULL, the location pointed to is set to the length of
1978 * any deletion immediately following the insertion, or zero if none.
1979 */
1980 HTSLIB_EXPORT
1981 int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) HTS_RESULT_USED;
1982
1983
1984 /*! @typedef
1985 @abstract An opaque type used for caching base modification state between
1986 successive calls to bam_mods_* functions.
1987 */
1988 typedef struct hts_base_mod_state hts_base_mod_state;
1989
1990 /// Get pileup padded insertion sequence, including base modifications
1991 /**
1992 * @param p pileup data
1993 * @param m state data for the base modification finder
1994 * @param ins the kstring where the insertion sequence will be written
1995 * @param del_len location for deletion length
1996 * @return the number of insertion string on success, with string length
1997 * being accessable via ins->l; -1 on failure.
1998 *
1999 * Fills out the kstring with the padded insertion sequence for the current
2000 * location in 'p'. If this is not an insertion site, the string is blank.
2001 *
2002 * The modification state needs to have been previously initialised using
2003 * bam_parse_basemod. It is permitted to be passed in as NULL, in which
2004 * case this function outputs identically to bam_plp_insertion.
2005 *
2006 * If del_len is not NULL, the location pointed to is set to the length of
2007 * any deletion immediately following the insertion, or zero if none.
2008 */
2009 HTSLIB_EXPORT
2010 int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m,
2011 kstring_t *ins, int *del_len) HTS_RESULT_USED;
2012
2013 /// Create a new bam_mplp_t structure
2014 /** The struct returned by a successful call should be freed
2015 * via bam_mplp_destroy() when it is no longer needed.
2016 */
2017 HTSLIB_EXPORT
2018 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
2019
2020 /// Set up mpileup overlap detection
2021 /**
2022 * @param iter mpileup iterator
2023 * @return 0 on success; a negative value on error
2024 *
2025 * If called, mpileup will detect overlapping
2026 * read pairs and for each base pair set the base quality of the
2027 * lower-quality base to zero, thus effectively discarding it from
2028 * calling. If the two bases are identical, the quality of the other base
2029 * is increased to the sum of their qualities (capped at 200), otherwise
2030 * it is multiplied by 0.8.
2031 */
2032 HTSLIB_EXPORT
2033 int bam_mplp_init_overlaps(bam_mplp_t iter);
2034
2035 HTSLIB_EXPORT
2036 void bam_mplp_destroy(bam_mplp_t iter);
2037
2038 HTSLIB_EXPORT
2039 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
2040
2041 HTSLIB_EXPORT
2042 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
2043
2044 HTSLIB_EXPORT
2045 int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp);
2046
2047 HTSLIB_EXPORT
2048 void bam_mplp_reset(bam_mplp_t iter);
2049
2050 HTSLIB_EXPORT
2051 void bam_mplp_constructor(bam_mplp_t iter,
2052 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
2053
2054 HTSLIB_EXPORT
2055 void bam_mplp_destructor(bam_mplp_t iter,
2056 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd));
2057
2058 #endif // ~!defined(BAM_NO_PILEUP)
2059
2060
2061 /***********************************
2062 * BAQ calculation and realignment *
2063 ***********************************/
2064
2065 HTSLIB_EXPORT
2066 int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres);
2067
2068 // Used as flag parameter in sam_prob_realn.
2069 enum htsRealnFlags {
2070 BAQ_APPLY = 1,
2071 BAQ_EXTEND = 2,
2072 BAQ_REDO = 4,
2073
2074 // Platform subfield, in bit position 3 onwards
2075 BAQ_AUTO = 0<<3,
2076 BAQ_ILLUMINA = 1<<3,
2077 BAQ_PACBIOCCS = 2<<3,
2078 BAQ_PACBIO = 3<<3,
2079 BAQ_ONT = 4<<3,
2080 BAQ_GENAPSYS = 5<<3
2081 };
2082
2083 /// Calculate BAQ scores
2084 /** @param b BAM record
2085 @param ref Reference sequence
2086 @param ref_len Reference sequence length
2087 @param flag Flags, see description
2088 @return 0 on success \n
2089 -1 if the read was unmapped, zero length, had no quality values, did not have at least one M, X or = CIGAR operator, or included a reference skip. \n
2090 -3 if BAQ alignment has already been done and does not need to be applied, or has already been applied. \n
2091 -4 if alignment failed (most likely due to running out of memory)
2092
2093 This function calculates base alignment quality (BAQ) values using the method
2094 described in "Improving SNP discovery by base alignment quality", Heng Li,
2095 Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076).
2096
2097 The @param flag value can be generated using the htsRealnFlags enum, but for
2098 backwards compatibilty reasons is retained as an "int". An example usage
2099 of the enum could be this, equivalent to flag 19:
2100
2101 sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS);
2102
2103 The following @param flag bits can be used:
2104
2105 Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values
2106
2107 If set, the data in the BQ:Z tag is used to adjust the quality values, and
2108 the BQ:Z tag is renamed to ZQ:Z.
2109
2110 If clear, and a ZQ:Z tag is present, the quality values are reverted using
2111 the data in the tag, and the tag is renamed to BQ:Z.
2112
2113 Bit 1 (BAQ_EXTEND): Use "extended" BAQ.
2114
2115 Changes the BAQ calculation to increase sensitivity at the expense of
2116 reduced specificity.
2117
2118 Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present.
2119
2120 Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable
2121 recalculation.
2122
2123 Bits 3-10: Choose parameters tailored to a specific instrument type.
2124
2125 One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and
2126 BAQ_GENAPSYS. The BAQ parameter tuning are still a work in progress and
2127 at the time of writing mainly consist of Illumina vs long-read technology
2128 adjustments.
2129
2130 @bug
2131 If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed.
2132 Depending on what previous processing happened, this may or may not be the
2133 correct thing to do. It would be wise to avoid this situation if possible.
2134 */
2135 HTSLIB_EXPORT
2136 int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag);
2137
2138 // ---------------------------
2139 // Base modification retrieval
2140
2141 /*! @typedef
2142 @abstract Holds a single base modification.
2143 @field modified_base The short base code (m, h, etc) or -ChEBI (negative)
2144 @field canonical_base The canonical base referred to in the MM tag.
2145 One of A, C, G, T or N. Note this may not be the
2146 explicit base recorded in the SEQ column (esp. if N).
2147 @field stran 0 or 1, indicating + or - strand from MM tag.
2148 @field qual Quality code (256*probability), or -1 if unknown
2149
2150 @discussion
2151 Note this doesn't hold any location data or information on which other
2152 modifications may be possible at this site.
2153 */
2154 typedef struct hts_base_mod {
2155 int modified_base;
2156 int canonical_base;
2157 int strand;
2158 int qual;
2159 } hts_base_mod;
2160
2161 /// Allocates an hts_base_mode_state.
2162 /**
2163 * @return An hts_base_mode_state pointer on success,
2164 * NULL on failure.
2165 *
2166 * This just allocates the memory. The initialisation of the contents is
2167 * done using bam_parse_basemod. Successive calls may be made to that
2168 * without the need to free and allocate a new state.
2169 *
2170 * The state be destroyed using the hts_base_mode_state_free function.
2171 */
2172 HTSLIB_EXPORT
2173 hts_base_mod_state *hts_base_mod_state_alloc(void);
2174
2175 /// Destroys an hts_base_mode_state.
2176 /**
2177 * @param state The base modification state pointer.
2178 *
2179 * The should have previously been created by hts_base_mode_state_alloc.
2180 */
2181 HTSLIB_EXPORT
2182 void hts_base_mod_state_free(hts_base_mod_state *state);
2183
2184 /// Parses the Mm and Ml tags out of a bam record.
2185 /**
2186 * @param b BAM alignment record
2187 * @param state The base modification state pointer.
2188 * @return 0 on success,
2189 * -1 on failure.
2190 *
2191 * This fills out the contents of the modification state, resetting the
2192 * iterator location to the first sequence base.
2193 */
2194 HTSLIB_EXPORT
2195 int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state);
2196
2197 /// Returns modification status for the next base position in the query seq.
2198 /**
2199 * @param b BAM alignment record
2200 * @param state The base modification state pointer.
2201 * @param mods A supplied array for returning base modifications
2202 * @param n_mods The size of the mods array
2203 * @return The number of modifications found on success,
2204 * -1 on failure.
2205 *
2206 * This is intended to be used as an iterator, with one call per location
2207 * along the query sequence.
2208 *
2209 * If no modifications are found, the returned value is zero.
2210 * If more than n_mods modifications are found, the total found is returned.
2211 * Note this means the caller needs to check whether this is higher than
2212 * n_mods.
2213 */
2214 HTSLIB_EXPORT
2215 int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
2216 hts_base_mod *mods, int n_mods);
2217
2218 /// Finds the next location containing base modifications and returns them
2219 /**
2220 * @param b BAM alignment record
2221 * @param state The base modification state pointer.
2222 * @param mods A supplied array for returning base modifications
2223 * @param n_mods The size of the mods array
2224 * @return The number of modifications found on success,
2225 * 0 if no more modifications are present,
2226 * -1 on failure.
2227 *
2228 * Unlike bam_mods_at_next_pos this skips ahead to the next site
2229 * with modifications.
2230 *
2231 * If more than n_mods modifications are found, the total found is returned.
2232 * Note this means the caller needs to check whether this is higher than
2233 * n_mods.
2234 */
2235 HTSLIB_EXPORT
2236 int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
2237 hts_base_mod *mods, int n_mods, int *pos);
2238
2239 /// Returns modification status for a specific query position.
2240 /**
2241 * @param b BAM alignment record
2242 * @param state The base modification state pointer.
2243 * @param mods A supplied array for returning base modifications
2244 * @param n_mods The size of the mods array
2245 * @return The number of modifications found on success,
2246 * -1 on failure.
2247 *
2248 * Note if called multipled times, qpos must be higher than the previous call.
2249 * Hence this is suitable for use from a pileup iterator. If more random
2250 * access is required, bam_parse_basemod must be called each time to reset
2251 * the state although this has an efficiency cost.
2252 *
2253 * If no modifications are found, the returned value is zero.
2254 * If more than n_mods modifications are found, the total found is returned.
2255 * Note this means the caller needs to check whether this is higher than
2256 * n_mods.
2257 */
2258 HTSLIB_EXPORT
2259 int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
2260 hts_base_mod *mods, int n_mods);
2261
2262
2263 #ifdef __cplusplus
2264 }
2265 #endif
2266
2267 #endif
2268